author		Kent Overstreet <kent.overstreet@gmail.com>	2019-09-20 17:43:08 -0400
committer	Kent Overstreet <kent.overstreet@gmail.com>	2020-05-06 17:14:17 -0400
commit		1654816f36166a77df46af48a974f01942be4a38 (patch)
tree		bf97c29944582c240b596751b32aec4c098bc068
parent		8b33578265e6fbb0712f688677bef2620861e019 (diff)
Merge with dd444a83ea bcachefs: Drop unused arg to bch2_open_buckets_stop_dev()
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
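
The interface change named in the merge subject appears in fs/bcachefs/alloc_foreground.c below: bch2_open_buckets_stop_dev() loses its unused enum bch_data_type parameter, and its one caller, bch2_writepoint_stop(), is updated to match. A minimal before/after sketch of the change (signatures copied from the diff below; declarations only, function bodies elided):

	/* before: a data type was passed in but never used */
	void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *,
					struct open_buckets *, enum bch_data_type);

	/* after: the unused argument is dropped */
	void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *,
					struct open_buckets *);

	/* the call site in bch2_writepoint_stop() changes accordingly */
	bch2_open_buckets_stop_dev(c, ca, &wp->ptrs);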
56 files changed, 3306 insertions(+), 1916 deletions(-)
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index da42c4fd764d..414ea2a74a5a 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -44,6 +44,7 @@ bcachefs-y :=		\
 	quota.o		\
 	rebalance.o	\
 	recovery.o	\
+	reflink.o	\
 	replicas.o	\
 	siphash.o	\
 	super.o		\
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 43dc2f270dc6..9814179a6406 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -205,20 +205,6 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
 		       get_alloc_field(a.v, &d, i));
 }
 
-static inline struct bkey_alloc_unpacked
-alloc_mem_to_key(struct bucket *g, struct bucket_mark m)
-{
-	return (struct bkey_alloc_unpacked) {
-		.gen		= m.gen,
-		.oldest_gen	= g->oldest_gen,
-		.data_type	= m.data_type,
-		.dirty_sectors	= m.dirty_sectors,
-		.cached_sectors	= m.cached_sectors,
-		.read_time	= g->io_time[READ],
-		.write_time	= g->io_time[WRITE],
-	};
-}
-
 int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
 {
 	struct btree_trans trans;
@@ -232,7 +218,7 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
 	bch2_trans_init(&trans, c, 0, 0);
 
 	for_each_btree_key(&trans, iter, BTREE_ID_ALLOC, POS_MIN, 0, k, ret)
-		bch2_mark_key(c, k, 0, NULL, 0,
+		bch2_mark_key(c, k, 0, 0, NULL, 0,
 			      BCH_BUCKET_MARK_ALLOC_READ|
 			      BCH_BUCKET_MARK_NOATOMIC);
 
@@ -244,7 +230,8 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
 
 	for_each_journal_key(*journal_keys, j)
 		if (j->btree_id == BTREE_ID_ALLOC)
-			bch2_mark_key(c, bkey_i_to_s_c(j->k), 0, NULL, 0,
+			bch2_mark_key(c, bkey_i_to_s_c(j->k),
+				      0, 0, NULL, 0,
 				      BCH_BUCKET_MARK_ALLOC_READ|
 				      BCH_BUCKET_MARK_NOATOMIC);
 
@@ -271,46 +258,68 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
 	return 0;
 }
 
-int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k)
+enum alloc_write_ret {
+	ALLOC_WROTE,
+	ALLOC_NOWROTE,
+	ALLOC_END,
+};
+
+static int bch2_alloc_write_key(struct btree_trans *trans,
+				struct btree_iter *iter,
+				unsigned flags)
 {
-	struct btree_trans trans;
-	struct btree_iter *iter;
+	struct bch_fs *c = trans->c;
+	struct bkey_s_c k;
 	struct bch_dev *ca;
+	struct bucket_array *ba;
+	struct bucket *g;
+	struct bucket_mark m;
+	struct bkey_alloc_unpacked old_u, new_u;
+	__BKEY_PADDED(k, 8) alloc_key; /* hack: */
+	struct bkey_i_alloc *a;
 	int ret;
+retry:
+	k = bch2_btree_iter_peek_slot(iter);
+	ret = bkey_err(k);
+	if (ret)
+		goto err;
 
-	if (k->k.p.inode >= c->sb.nr_devices ||
-	    !c->devs[k->k.p.inode])
-		return 0;
-
-	ca = bch_dev_bkey_exists(c, k->k.p.inode);
+	old_u = bch2_alloc_unpack(k);
 
-	if (k->k.p.offset >= ca->mi.nbuckets)
-		return 0;
+	if (iter->pos.inode >= c->sb.nr_devices ||
+	    !c->devs[iter->pos.inode])
+		return ALLOC_END;
 
-	bch2_trans_init(&trans, c, 0, 0);
+	percpu_down_read(&c->mark_lock);
+	ca = bch_dev_bkey_exists(c, iter->pos.inode);
+	ba = bucket_array(ca);
 
-	iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, k->k.p,
-				   BTREE_ITER_INTENT);
+	if (iter->pos.offset >= ba->nbuckets) {
+		percpu_up_read(&c->mark_lock);
+		return ALLOC_END;
+	}
 
-	ret = bch2_btree_iter_traverse(iter);
-	if (ret)
-		goto err;
+	g = &ba->b[iter->pos.offset];
+	m = READ_ONCE(g->mark);
+	new_u = alloc_mem_to_key(g, m);
+	percpu_up_read(&c->mark_lock);
 
-	/* check buckets_written with btree node locked: */
-	if (test_bit(k->k.p.offset, ca->buckets_written)) {
-		ret = 0;
-		goto err;
-	}
+	if (!bkey_alloc_unpacked_cmp(old_u, new_u))
+		return ALLOC_NOWROTE;
 
-	bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, k));
+	a = 
bkey_alloc_init(&alloc_key.k); + a->k.p = iter->pos; + bch2_alloc_pack(a, new_u); - ret = bch2_trans_commit(&trans, NULL, NULL, + bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &a->k_i)); + ret = bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_JOURNAL_REPLAY| - BTREE_INSERT_NOMARK); + BTREE_INSERT_NOMARK| + flags); err: - bch2_trans_exit(&trans); + if (ret == -EINTR) + goto retry; return ret; } @@ -318,16 +327,8 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote) { struct btree_trans trans; struct btree_iter *iter; - struct bucket_array *buckets; struct bch_dev *ca; - struct bucket *g; - struct bucket_mark m, new; - struct bkey_alloc_unpacked old_u, new_u; - __BKEY_PADDED(k, 8) alloc_key; /* hack: */ - struct bkey_i_alloc *a; - struct bkey_s_c k; unsigned i; - size_t b; int ret = 0; BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); @@ -338,81 +339,24 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote) BTREE_ITER_SLOTS|BTREE_ITER_INTENT); for_each_rw_member(ca, c, i) { - down_read(&ca->bucket_lock); -restart: - buckets = bucket_array(ca); + unsigned first_bucket; - for (b = buckets->first_bucket; - b < buckets->nbuckets; - b++) { - if (!buckets->b[b].mark.dirty) - continue; - - bch2_btree_iter_set_pos(iter, POS(i, b)); - k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); - if (ret) - goto err; - - old_u = bch2_alloc_unpack(k); - - percpu_down_read(&c->mark_lock); - g = bucket(ca, b); - m = READ_ONCE(g->mark); - new_u = alloc_mem_to_key(g, m); - percpu_up_read(&c->mark_lock); - - if (!m.dirty) - continue; - - if ((flags & BTREE_INSERT_LAZY_RW) && - percpu_ref_is_zero(&c->writes)) { - up_read(&ca->bucket_lock); - bch2_trans_unlock(&trans); - - ret = bch2_fs_read_write_early(c); - down_read(&ca->bucket_lock); - - if (ret) - goto err; - goto restart; - } + percpu_down_read(&c->mark_lock); + first_bucket = bucket_array(ca)->first_bucket; + percpu_up_read(&c->mark_lock); - a = bkey_alloc_init(&alloc_key.k); - a->k.p = iter->pos; - bch2_alloc_pack(a, new_u); + bch2_btree_iter_set_pos(iter, POS(i, first_bucket)); - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &a->k_i)); - ret = bch2_trans_commit(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_NOMARK| - flags); -err: - if (ret && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) { - bch_err(c, "error %i writing alloc info", ret); - printk(KERN_CONT "dev %llu bucket %llu\n", - iter->pos.inode, iter->pos.offset); - printk(KERN_CONT "gen %u -> %u\n", old_u.gen, new_u.gen); -#define x(_name, _bits) printk(KERN_CONT #_name " %u -> %u\n", old_u._name, new_u._name); - BCH_ALLOC_FIELDS() -#undef x - } - if (ret) + while (1) { + ret = bch2_alloc_write_key(&trans, iter, flags); + if (ret < 0 || ret == ALLOC_END) break; - - new = m; - new.dirty = false; - atomic64_cmpxchg(&g->_mark.v, m.v.counter, new.v.counter); - - if (ca->buckets_written) - set_bit(b, ca->buckets_written); - - bch2_trans_cond_resched(&trans); - *wrote = true; + if (ret == ALLOC_WROTE) + *wrote = true; + bch2_btree_iter_next_slot(iter); } - up_read(&ca->bucket_lock); - if (ret) { + if (ret < 0) { percpu_ref_put(&ca->io_ref); break; } @@ -420,7 +364,27 @@ err: bch2_trans_exit(&trans); - return ret; + return ret < 0 ? 
ret : 0; +} + +int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k) +{ + struct btree_trans trans; + struct btree_iter *iter; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, k->k.p, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + ret = bch2_alloc_write_key(&trans, iter, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_JOURNAL_REPLAY| + BTREE_INSERT_NOMARK); + bch2_trans_exit(&trans); + return ret < 0 ? ret : 0; } /* Bucket IO clocks: */ @@ -967,10 +931,6 @@ retry: if (!top->nr) heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); - /* with btree still locked: */ - if (ca->buckets_written) - set_bit(b, ca->buckets_written); - /* * Make sure we flush the last journal entry that updated this * bucket (i.e. deleting the last reference) before writing to @@ -1204,7 +1164,7 @@ static int bch2_allocator_thread(void *arg) */ if (!nr || (nr < ALLOC_SCAN_BATCH(ca) && - !fifo_full(&ca->free[RESERVE_MOVINGGC]))) { + !fifo_empty(&ca->free[RESERVE_NONE]))) { ret = wait_buckets_available(c, ca); if (ret) { up_read(&c->gc_lock); diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 0c1a0f0dd2ab..501c444353fb 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -13,10 +13,35 @@ struct bkey_alloc_unpacked { #undef x }; +/* returns true if not equal */ +static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l, + struct bkey_alloc_unpacked r) +{ + return l.gen != r.gen +#define x(_name, _bits) || l._name != r._name + BCH_ALLOC_FIELDS() +#undef x + ; +} + struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c); void bch2_alloc_pack(struct bkey_i_alloc *, const struct bkey_alloc_unpacked); +static inline struct bkey_alloc_unpacked +alloc_mem_to_key(struct bucket *g, struct bucket_mark m) +{ + return (struct bkey_alloc_unpacked) { + .gen = m.gen, + .oldest_gen = g->oldest_gen, + .data_type = m.data_type, + .dirty_sectors = m.dirty_sectors, + .cached_sectors = m.cached_sectors, + .read_time = g->io_time[READ], + .write_time = g->io_time[WRITE], + }; +} + #define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9) const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c); diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index e64f8449462f..697d576802b6 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -693,8 +693,7 @@ retry_blocking: } void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca, - struct open_buckets *obs, - enum bch_data_type data_type) + struct open_buckets *obs) { struct open_buckets ptrs = { .nr = 0 }; struct open_bucket *ob, *ob2; @@ -725,7 +724,7 @@ void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca, struct write_point *wp) { mutex_lock(&wp->lock); - bch2_open_buckets_stop_dev(c, ca, &wp->ptrs, wp->type); + bch2_open_buckets_stop_dev(c, ca, &wp->ptrs); mutex_unlock(&wp->lock); } diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 6d8ffb0cd06d..687f973e4b3a 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -106,7 +106,7 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *, - struct open_buckets *, enum bch_data_type); + struct open_buckets *); void bch2_writepoint_stop(struct bch_fs *, struct bch_dev *, struct 
write_point *); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 98c2fe734626..a186aa521049 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -283,9 +283,7 @@ do { \ "Force reads to use the reconstruct path, when reading" \ "from erasure coded extents") \ BCH_DEBUG_PARAM(test_restart_gc, \ - "Test restarting mark and sweep gc when bucket gens change")\ - BCH_DEBUG_PARAM(test_reconstruct_alloc, \ - "Test reconstructing the alloc btree") + "Test restarting mark and sweep gc when bucket gens change") #define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG() @@ -359,6 +357,7 @@ enum gc_phase { GC_PHASE_BTREE_XATTRS, GC_PHASE_BTREE_ALLOC, GC_PHASE_BTREE_QUOTAS, + GC_PHASE_BTREE_REFLINK, GC_PHASE_PENDING_DELETE, GC_PHASE_ALLOC, @@ -409,7 +408,6 @@ struct bch_dev { */ struct bucket_array __rcu *buckets[2]; unsigned long *buckets_nouse; - unsigned long *buckets_written; struct rw_semaphore bucket_lock; struct bch_dev_usage __percpu *usage[2]; @@ -722,7 +720,7 @@ struct bch_fs { ZSTD_parameters zstd_params; struct crypto_shash *sha256; - struct crypto_skcipher *chacha20; + struct crypto_sync_skcipher *chacha20; struct crypto_shash *poly1305; atomic64_t key_version; @@ -740,12 +738,16 @@ struct bch_fs { /* ERASURE CODING */ struct list_head ec_new_stripe_list; struct mutex ec_new_stripe_lock; + u64 ec_stripe_hint; struct bio_set ec_bioset; struct work_struct ec_stripe_delete_work; struct llist_head ec_stripe_delete_list; + /* REFLINK */ + u64 reflink_hint; + /* VFS IO PATH - fs-io.c */ struct bio_set writepage_bioset; struct bio_set dio_write_bioset; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 13285936dd2d..4577d77a9f38 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -336,7 +336,9 @@ static inline void bkey_init(struct bkey *k) x(xattr, 11) \ x(alloc, 12) \ x(quota, 13) \ - x(stripe, 14) + x(stripe, 14) \ + x(reflink_p, 15) \ + x(reflink_v, 16) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -655,7 +657,7 @@ struct bch_reservation { /* Maximum possible size of an entire extent value: */ #define BKEY_EXTENT_VAL_U64s_MAX \ - (BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1)) + (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1)) #define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX) @@ -891,6 +893,24 @@ struct bch_stripe { struct bch_extent_ptr ptrs[0]; } __attribute__((packed, aligned(8))); +/* Reflink: */ + +struct bch_reflink_p { + struct bch_val v; + __le64 idx; + + __le32 reservation_generation; + __u8 nr_replicas; + __u8 pad[3]; +}; + +struct bch_reflink_v { + struct bch_val v; + __le64 refcount; + union bch_extent_entry start[0]; + __u64 _data[0]; +}; + /* Optional/variable size superblock sections: */ struct bch_sb_field { @@ -1293,6 +1313,7 @@ enum bch_sb_features { BCH_FEATURE_ATOMIC_NLINK = 3, /* should have gone under compat */ BCH_FEATURE_EC = 4, BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3 = 5, + BCH_FEATURE_REFLINK = 6, BCH_FEATURE_NR, }; @@ -1480,7 +1501,8 @@ LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); x(XATTRS, 3, "xattrs") \ x(ALLOC, 4, "alloc") \ x(QUOTAS, 5, "quotas") \ - x(EC, 6, "erasure_coding") + x(EC, 6, "erasure_coding") \ + x(REFLINK, 7, "reflink") enum btree_id { #define x(kwd, val, name) BTREE_ID_##kwd = val, diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 1acff9d0fd7e..5ef66aed338d 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -50,7 +50,7 @@ static inline void 
set_bkey_val_bytes(struct bkey *k, unsigned bytes) k->u64s = BKEY_U64s + DIV_ROUND_UP(bytes, sizeof(u64)); } -#define bkey_val_end(_k) vstruct_idx((_k).v, bkey_val_u64s((_k).k)) +#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k))) #define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted) @@ -552,6 +552,8 @@ BKEY_VAL_ACCESSORS(xattr); BKEY_VAL_ACCESSORS(alloc); BKEY_VAL_ACCESSORS(quota); BKEY_VAL_ACCESSORS(stripe); +BKEY_VAL_ACCESSORS(reflink_p); +BKEY_VAL_ACCESSORS(reflink_v); /* byte order helpers */ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 27f196ef0b18..f01405dd502b 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -10,9 +10,10 @@ #include "extents.h" #include "inode.h" #include "quota.h" +#include "reflink.h" #include "xattr.h" -const char * const bch_bkey_types[] = { +const char * const bch2_bkey_types[] = { #define x(name, nr) #name, BCH_BKEY_TYPES() #undef x @@ -144,7 +145,7 @@ void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) } if (ops->key_debugcheck) - ops->key_debugcheck(c, b, k); + ops->key_debugcheck(c, k); } void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) @@ -159,7 +160,8 @@ void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) { - pr_buf(out, "u64s %u type %u ", k->u64s, k->type); + pr_buf(out, "u64s %u type %s ", k->u64s, + bch2_bkey_types[k->type]); bch2_bpos_to_text(out, k->p); @@ -174,8 +176,6 @@ void bch2_val_to_text(struct printbuf *out, struct bch_fs *c, if (likely(ops->val_to_text)) ops->val_to_text(out, c, k); - else - pr_buf(out, " %s", bch_bkey_types[k.k->type]); } void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c, diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index 08b976633360..8568b65c1ed2 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -9,7 +9,7 @@ struct btree; struct bkey; enum btree_node_type; -extern const char * const bch_bkey_types[]; +extern const char * const bch2_bkey_types[]; enum merge_result { BCH_MERGE_NOMERGE, @@ -26,8 +26,7 @@ struct bkey_ops { /* Returns reason for being invalid if invalid, else NULL: */ const char * (*key_invalid)(const struct bch_fs *, struct bkey_s_c); - void (*key_debugcheck)(struct bch_fs *, struct btree *, - struct bkey_s_c); + void (*key_debugcheck)(struct bch_fs *, struct bkey_s_c); void (*val_to_text)(struct printbuf *, struct bch_fs *, struct bkey_s_c); void (*swab)(const struct bkey_format *, struct bkey_packed *); diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c index 9f5d9b4bf1c9..e32fad5a91ac 100644 --- a/fs/bcachefs/bkey_sort.c +++ b/fs/bcachefs/bkey_sort.c @@ -415,25 +415,22 @@ bch2_sort_repack_merge(struct bch_fs *c, struct bkey_format *out_f, bool filter_whiteouts) { - struct bkey_packed *prev = NULL, *k_packed, *next; - struct bkey k_unpacked; + struct bkey_packed *prev = NULL, *k_packed; struct bkey_s k; struct btree_nr_keys nr; + BKEY_PADDED(k) tmp; memset(&nr, 0, sizeof(nr)); - next = bch2_btree_node_iter_next_all(iter, src); - while ((k_packed = next)) { - /* - * The filter might modify the size of @k's value, so advance - * the iterator first: - */ - next = bch2_btree_node_iter_next_all(iter, src); - + while ((k_packed = bch2_btree_node_iter_next_all(iter, src))) { if (filter_whiteouts && bkey_whiteout(k_packed)) continue; - k = __bkey_disassemble(src, k_packed, &k_unpacked); + EBUG_ON(bkeyp_val_u64s(&src->format, k_packed) 
> + BKEY_EXTENT_VAL_U64s_MAX); + + bch2_bkey_unpack(src, &tmp.k, k_packed); + k = bkey_i_to_s(&tmp.k); if (filter_whiteouts && bch2_bkey_normalize(c, k)) diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index ef10e77ec1e5..32436ed5cc80 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -24,6 +24,16 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *, struct btree *); +static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter) +{ + unsigned n = ARRAY_SIZE(iter->data); + + while (n && __btree_node_iter_set_end(iter, n - 1)) + --n; + + return n; +} + struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k) { unsigned offset = __btree_node_key_to_offset(b, k); @@ -110,7 +120,8 @@ void bch2_dump_btree_node_iter(struct btree *b, { struct btree_node_iter_set *set; - printk(KERN_ERR "btree node iter with %u sets:\n", b->nsets); + printk(KERN_ERR "btree node iter with %u/%u sets:\n", + __btree_node_iter_used(iter), b->nsets); btree_node_iter_for_each(iter, set) { struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); @@ -119,8 +130,8 @@ void bch2_dump_btree_node_iter(struct btree *b, char buf[100]; bch2_bkey_to_text(&PBUF(buf), &uk); - printk(KERN_ERR "set %zu key %zi/%u: %s\n", t - b->set, - k->_data - bset(b, t)->_data, bset(b, t)->u64s, buf); + printk(KERN_ERR "set %zu key %u: %s\n", + t - b->set, set->k, buf); } } @@ -182,8 +193,12 @@ void bch2_btree_node_iter_verify(struct btree_node_iter *iter, struct btree *b) { struct btree_node_iter_set *set, *s2; + struct bkey_packed *k, *p; struct bset_tree *t; + if (bch2_btree_node_iter_end(iter)) + return; + /* Verify no duplicates: */ btree_node_iter_for_each(iter, set) btree_node_iter_for_each(iter, s2) @@ -204,6 +219,18 @@ found: btree_node_iter_for_each(iter, set) BUG_ON(set != iter->data && btree_node_iter_cmp(b, set[-1], set[0]) > 0); + + k = bch2_btree_node_iter_peek_all(iter, b); + + for_each_bset(b, t) { + if (iter->data[0].end == t->end_offset) + continue; + + p = bch2_bkey_prev_all(b, t, + bch2_btree_node_iter_bset_pos(iter, b, t)); + + BUG_ON(p && bkey_iter_cmp(b, k, p) < 0); + } } void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, @@ -1669,25 +1696,13 @@ void bch2_btree_node_iter_advance(struct btree_node_iter *iter, __bch2_btree_node_iter_advance(iter, b); } -static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter) -{ - unsigned n = ARRAY_SIZE(iter->data); - - while (n && __btree_node_iter_set_end(iter, n - 1)) - --n; - - return n; -} - /* * Expensive: */ -struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *iter, - struct btree *b, - unsigned min_key_type) +struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, + struct btree *b) { struct bkey_packed *k, *prev = NULL; - struct bkey_packed *orig_pos = bch2_btree_node_iter_peek_all(iter, b); struct btree_node_iter_set *set; struct bset_tree *t; unsigned end = 0; @@ -1695,9 +1710,8 @@ struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *ite bch2_btree_node_iter_verify(iter, b); for_each_bset(b, t) { - k = bch2_bkey_prev_filter(b, t, - bch2_btree_node_iter_bset_pos(iter, b, t), - min_key_type); + k = bch2_bkey_prev_all(b, t, + bch2_btree_node_iter_bset_pos(iter, b, t)); if (k && (!prev || bkey_iter_cmp(b, k, prev) > 0)) { prev = k; @@ -1706,7 +1720,7 @@ struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *ite } if (!prev) - goto out; + return NULL; /* * We're manually 
memmoving instead of just calling sort() to ensure the @@ -1727,18 +1741,20 @@ found: iter->data[0].k = __btree_node_key_to_offset(b, prev); iter->data[0].end = end; -out: - if (btree_keys_expensive_checks(b)) { - struct btree_node_iter iter2 = *iter; - if (prev) - __bch2_btree_node_iter_advance(&iter2, b); + bch2_btree_node_iter_verify(iter, b); + return prev; +} - while ((k = bch2_btree_node_iter_peek_all(&iter2, b)) != orig_pos) { - BUG_ON(k->type >= min_key_type); - __bch2_btree_node_iter_advance(&iter2, b); - } - } +struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *iter, + struct btree *b, + unsigned min_key_type) +{ + struct bkey_packed *prev; + + do { + prev = bch2_btree_node_iter_prev_all(iter, b); + } while (prev && prev->type < min_key_type); return prev; } diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 17c239947300..643bd9e8bc4d 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -528,16 +528,12 @@ bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b) return ret; } +struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *, + struct btree *); struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *, struct btree *, unsigned); static inline struct bkey_packed * -bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, struct btree *b) -{ - return bch2_btree_node_iter_prev_filter(iter, b, 0); -} - -static inline struct bkey_packed * bch2_btree_node_iter_prev(struct btree_node_iter *iter, struct btree *b) { return bch2_btree_node_iter_prev_filter(iter, b, KEY_TYPE_discard + 1); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index a458cfe0e92d..f4adb07a3de2 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -142,20 +142,24 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, struct bucket *g2 = PTR_BUCKET(ca, ptr, false); if (mustfix_fsck_err_on(!g->gen_valid, c, - "found ptr with missing gen in alloc btree,\n" - "type %u gen %u", - k.k->type, ptr->gen)) { + "bucket %u:%zu data type %s ptr gen %u missing in alloc btree", + ptr->dev, PTR_BUCKET_NR(ca, ptr), + bch2_data_types[ptr_data_type(k.k, ptr)], + ptr->gen)) { g2->_mark.gen = g->_mark.gen = ptr->gen; - g2->_mark.dirty = g->_mark.dirty = true; g2->gen_valid = g->gen_valid = true; } if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c, - "%u ptr gen in the future: %u > %u", - k.k->type, ptr->gen, g->mark.gen)) { + "bucket %u:%zu data type %s ptr gen in the future: %u > %u", + ptr->dev, PTR_BUCKET_NR(ca, ptr), + bch2_data_types[ptr_data_type(k.k, ptr)], + ptr->gen, g->mark.gen)) { g2->_mark.gen = g->_mark.gen = ptr->gen; - g2->_mark.dirty = g->_mark.dirty = true; g2->gen_valid = g->gen_valid = true; + g2->_mark.data_type = 0; + g2->_mark.dirty_sectors = 0; + g2->_mark.cached_sectors = 0; set_bit(BCH_FS_FIXED_GENS, &c->flags); } } @@ -171,7 +175,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, *max_stale = max(*max_stale, ptr_stale(ca, ptr)); } - bch2_mark_key(c, k, k.k->size, NULL, 0, flags); + bch2_mark_key(c, k, 0, k.k->size, NULL, 0, flags); fsck_err: return ret; } @@ -418,7 +422,8 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) for_each_pending_btree_node_free(c, as, d) if (d->index_update_done) - bch2_mark_key(c, bkey_i_to_s_c(&d->key), 0, NULL, 0, + bch2_mark_key(c, bkey_i_to_s_c(&d->key), + 0, 0, NULL, 0, BCH_BUCKET_MARK_GC); mutex_unlock(&c->btree_interior_update_lock); @@ -525,7 +530,6 @@ static int bch2_gc_done(struct 
bch_fs *c, ": got %u, should be %u", i, b, \ dst->b[b].mark._f, src->b[b].mark._f); \ dst->b[b]._mark._f = src->b[b].mark._f; \ - dst->b[b]._mark.dirty = true; \ } #define copy_dev_field(_f, _msg, ...) \ copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__) @@ -577,10 +581,7 @@ static int bch2_gc_done(struct bch_fs *c, copy_bucket_field(dirty_sectors); copy_bucket_field(cached_sectors); - if (dst->b[b].oldest_gen != src->b[b].oldest_gen) { - dst->b[b].oldest_gen = src->b[b].oldest_gen; - dst->b[b]._mark.dirty = true; - } + dst->b[b].oldest_gen = src->b[b].oldest_gen; } }; @@ -761,6 +762,8 @@ out: percpu_down_write(&c->mark_lock); bch2_gc_free(c); percpu_up_write(&c->mark_lock); + /* flush fsck errors, reset counters */ + bch2_flush_fsck_errs(c); goto again; } diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 5652f354b910..b6e286c36b86 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -510,7 +510,7 @@ static void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset) bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags, bytes); - nonce = nonce_add(nonce, round_up(bytes, CHACHA20_BLOCK_SIZE)); + nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE)); } bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, @@ -1038,10 +1038,9 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, INIT_WORK(&rb->work, btree_node_read_work); bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META; bio->bi_iter.bi_sector = pick.ptr.offset; - bio->bi_iter.bi_size = btree_bytes(c); bio->bi_end_io = btree_node_read_endio; bio->bi_private = b; - bch2_bio_map(bio, b->data); + bch2_bio_map(bio, b->data, btree_bytes(c)); set_btree_node_read_in_flight(b); @@ -1502,11 +1501,10 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, wbio->wbio.order = order; wbio->wbio.used_mempool = used_mempool; wbio->wbio.bio.bi_opf = REQ_OP_WRITE|REQ_META|REQ_FUA; - wbio->wbio.bio.bi_iter.bi_size = sectors_to_write << 9; wbio->wbio.bio.bi_end_io = btree_node_write_endio; wbio->wbio.bio.bi_private = b; - bch2_bio_map(&wbio->wbio.bio, data); + bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9); /* * If we're appending to a leaf node, we don't technically need FUA - diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 8955555d6603..40cd87d73a4f 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -86,7 +86,7 @@ void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) struct btree_iter *linked; unsigned readers = 0; - EBUG_ON(btree_node_read_locked(iter, b->level)); + EBUG_ON(!btree_node_intent_locked(iter, b->level)); trans_for_each_iter(iter->trans, linked) if (linked->l[b->level].b == b && @@ -496,6 +496,23 @@ static inline void __bch2_btree_iter_verify(struct btree_iter *iter, #endif +static void btree_node_iter_set_set_pos(struct btree_node_iter *iter, + struct btree *b, + struct bset_tree *t, + struct bkey_packed *k) +{ + struct btree_node_iter_set *set; + + btree_node_iter_for_each(iter, set) + if (set->end == t->end_offset) { + set->k = __btree_node_key_to_offset(b, k); + bch2_btree_node_iter_sort(iter, b); + return; + } + + bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t)); +} + static void __bch2_btree_node_iter_fix(struct btree_iter *iter, struct btree *b, struct btree_node_iter *node_iter, @@ -509,6 +526,10 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter, unsigned offset = __btree_node_key_to_offset(b, where); int shift = new_u64s - clobber_u64s; unsigned old_end = 
t->end_offset - shift; + unsigned orig_iter_pos = node_iter->data[0].k; + bool iter_current_key_modified = + orig_iter_pos >= offset && + orig_iter_pos <= offset + clobber_u64s; btree_node_iter_for_each(node_iter, set) if (set->end == old_end) @@ -517,17 +538,12 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter, /* didn't find the bset in the iterator - might have to readd it: */ if (new_u64s && btree_iter_pos_cmp(iter, b, where) > 0) { - btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); - bch2_btree_node_iter_push(node_iter, b, where, end); - - if (!b->level && - node_iter == &iter->l[0].iter) - bkey_disassemble(b, - bch2_btree_node_iter_peek_all(node_iter, b), - &iter->k); + goto fixup_done; + } else { + /* Iterator is after key that changed */ + return; } - return; found: set->end = t->end_offset; @@ -543,85 +559,66 @@ found: if (set->k == set->end) bch2_btree_node_iter_set_drop(node_iter, set); } else { + /* Iterator is after key that changed */ set->k = (int) set->k + shift; - goto iter_current_key_not_modified; + return; } - btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); - bch2_btree_node_iter_sort(node_iter, b); - if (!b->level && node_iter == &iter->l[0].iter) { - /* - * not legal to call bkey_debugcheck() here, because we're - * called midway through the update path after update has been - * marked but before deletes have actually happened: - */ -#if 0 - __btree_iter_peek_all(iter, &iter->l[0], &iter->k); -#endif - struct btree_iter_level *l = &iter->l[0]; - struct bkey_packed *k = - bch2_btree_node_iter_peek_all(&l->iter, l->b); - - if (unlikely(!k)) - iter->k.type = KEY_TYPE_deleted; - else - bkey_disassemble(l->b, k, &iter->k); - } -iter_current_key_not_modified: +fixup_done: + if (node_iter->data[0].k != orig_iter_pos) + iter_current_key_modified = true; /* - * Interior nodes are special because iterators for interior nodes don't - * obey the usual invariants regarding the iterator position: - * - * We may have whiteouts that compare greater than the iterator - * position, and logically should be in the iterator, but that we - * skipped past to find the first live key greater than the iterator - * position. This becomes an issue when we insert a new key that is - * greater than the current iterator position, but smaller than the - * whiteouts we've already skipped past - this happens in the course of - * a btree split. - * - * We have to rewind the iterator past to before those whiteouts here, - * else bkey_node_iter_prev() is not going to work and who knows what - * else would happen. And we have to do it manually, because here we've - * already done the insert and the iterator is currently inconsistent: - * - * We've got multiple competing invariants, here - we have to be careful - * about rewinding iterators for interior nodes, because they should - * always point to the key for the child node the btree iterator points - * to. + * When a new key is added, and the node iterator now points to that + * key, the iterator might have skipped past deleted keys that should + * come after the key the iterator now points to. 
We have to rewind to + * before those deleted keys - otherwise + * bch2_btree_node_iter_prev_all() breaks: */ - if (b->level && new_u64s && - btree_iter_pos_cmp(iter, b, where) > 0) { + if (!bch2_btree_node_iter_end(node_iter) && + iter_current_key_modified && + (b->level || + (iter->flags & BTREE_ITER_IS_EXTENTS))) { struct bset_tree *t; - struct bkey_packed *k; + struct bkey_packed *k, *k2, *p; + + k = bch2_btree_node_iter_peek_all(node_iter, b); for_each_bset(b, t) { - if (bch2_bkey_to_bset(b, where) == t) + bool set_pos = false; + + if (node_iter->data[0].end == t->end_offset) continue; - k = bch2_bkey_prev_all(b, t, - bch2_btree_node_iter_bset_pos(node_iter, b, t)); - if (k && - bkey_iter_cmp(b, k, where) > 0) { - struct btree_node_iter_set *set; - unsigned offset = - __btree_node_key_to_offset(b, bkey_next(k)); - - btree_node_iter_for_each(node_iter, set) - if (set->k == offset) { - set->k = __btree_node_key_to_offset(b, k); - bch2_btree_node_iter_sort(node_iter, b); - goto next_bset; - } - - bch2_btree_node_iter_push(node_iter, b, k, - btree_bkey_last(b, t)); + k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t); + + while ((p = bch2_bkey_prev_all(b, t, k2)) && + bkey_iter_cmp(b, k, p) < 0) { + k2 = p; + set_pos = true; } -next_bset: - t = t; + + if (set_pos) + btree_node_iter_set_set_pos(node_iter, + b, t, k2); + } + } + + if (!b->level && + node_iter == &iter->l[0].iter && + iter_current_key_modified) { + struct bkey_packed *k = + bch2_btree_node_iter_peek_all(node_iter, b); + + if (likely(k)) { + bkey_disassemble(b, k, &iter->k); + } else { + /* XXX: for extents, calculate size of hole? */ + iter->k.type = KEY_TYPE_deleted; } + + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); } } @@ -635,14 +632,18 @@ void bch2_btree_node_iter_fix(struct btree_iter *iter, struct bset_tree *t = bch2_bkey_to_bset(b, where); struct btree_iter *linked; - if (node_iter != &iter->l[b->level].iter) + if (node_iter != &iter->l[b->level].iter) { __bch2_btree_node_iter_fix(iter, b, node_iter, t, - where, clobber_u64s, new_u64s); + where, clobber_u64s, new_u64s); + bch2_btree_node_iter_verify(node_iter, b); + } - trans_for_each_iter_with_node(iter->trans, b, linked) + trans_for_each_iter_with_node(iter->trans, b, linked) { __bch2_btree_node_iter_fix(linked, b, - &linked->l[b->level].iter, t, - where, clobber_u64s, new_u64s); + &linked->l[b->level].iter, t, + where, clobber_u64s, new_u64s); + __bch2_btree_iter_verify(linked, b); + } } static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter, @@ -685,6 +686,13 @@ static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, bch2_btree_node_iter_peek(&l->iter, l->b)); } +static inline struct bkey_s_c __btree_iter_prev(struct btree_iter *iter, + struct btree_iter_level *l) +{ + return __btree_iter_unpack(iter, l, &iter->k, + bch2_btree_node_iter_prev(&l->iter, l->b)); +} + static inline bool btree_iter_advance_to_pos(struct btree_iter *iter, struct btree_iter_level *l, int max_advance) @@ -743,18 +751,29 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) btree_node_unlock(iter, b->level + 1); } +static inline bool btree_iter_pos_before_node(struct btree_iter *iter, + struct btree *b) +{ + return bkey_cmp(iter->pos, b->data->min_key) < 0; +} + static inline bool btree_iter_pos_after_node(struct btree_iter *iter, struct btree *b) { - return __btree_iter_pos_cmp(iter, NULL, - bkey_to_packed(&b->key), true) < 0; + int cmp = bkey_cmp(b->key.k.p, iter->pos); + + if (!cmp && + (iter->flags & 
BTREE_ITER_IS_EXTENTS) && + bkey_cmp(b->key.k.p, POS_MAX)) + cmp = -1; + return cmp < 0; } static inline bool btree_iter_pos_in_node(struct btree_iter *iter, struct btree *b) { return iter->btree_id == b->btree_id && - bkey_cmp(iter->pos, b->data->min_key) >= 0 && + !btree_iter_pos_before_node(iter, b) && !btree_iter_pos_after_node(iter, b); } @@ -956,10 +975,10 @@ static void btree_iter_up(struct btree_iter *iter) btree_node_unlock(iter, iter->level++); } -int __must_check __bch2_btree_iter_traverse(struct btree_iter *); +static int btree_iter_traverse_one(struct btree_iter *); static int __btree_iter_traverse_all(struct btree_trans *trans, - struct btree_iter *orig_iter, int ret) + struct btree_iter *orig_iter, int ret) { struct bch_fs *c = trans->c; struct btree_iter *iter; @@ -1003,7 +1022,7 @@ retry_all: iter = &trans->iters[sorted[i]]; do { - ret = __bch2_btree_iter_traverse(iter); + ret = btree_iter_traverse_one(iter); } while (ret == -EINTR); if (ret) @@ -1021,16 +1040,27 @@ int bch2_btree_iter_traverse_all(struct btree_trans *trans) return __btree_iter_traverse_all(trans, NULL, 0); } -static unsigned btree_iter_up_until_locked(struct btree_iter *iter, - bool check_pos) +static inline bool btree_iter_good_node(struct btree_iter *iter, + unsigned l, int check_pos) +{ + if (!is_btree_node(iter, l) || + !bch2_btree_node_relock(iter, l)) + return false; + + if (check_pos <= 0 && btree_iter_pos_before_node(iter, iter->l[l].b)) + return false; + if (check_pos >= 0 && btree_iter_pos_after_node(iter, iter->l[l].b)) + return false; + return true; +} + +static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter, + int check_pos) { unsigned l = iter->level; while (btree_iter_node(iter, l) && - (!is_btree_node(iter, l) || - !bch2_btree_node_relock(iter, l) || - (check_pos && - !btree_iter_pos_in_node(iter, iter->l[l].b)))) { + !btree_iter_good_node(iter, l, check_pos)) { btree_node_unlock(iter, l); iter->l[l].b = BTREE_ITER_NO_NODE_UP; l++; @@ -1048,7 +1078,7 @@ static unsigned btree_iter_up_until_locked(struct btree_iter *iter, * On error, caller (peek_node()/peek_key()) must return NULL; the error is * stashed in the iterator and returned from bch2_trans_exit(). */ -int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) +static int btree_iter_traverse_one(struct btree_iter *iter) { unsigned depth_want = iter->level; @@ -1062,7 +1092,7 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) * XXX: correctly using BTREE_ITER_UPTODATE should make using check_pos * here unnecessary */ - iter->level = btree_iter_up_until_locked(iter, true); + iter->level = btree_iter_up_until_good_node(iter, 0); /* * If we've got a btree node locked (i.e. 
we aren't about to relock the @@ -1070,8 +1100,11 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) * * XXX correctly using BTREE_ITER_UPTODATE should make this unnecessary */ - if (btree_iter_node(iter, iter->level)) + if (btree_iter_node(iter, iter->level)) { + BUG_ON(!btree_iter_pos_in_node(iter, iter->l[iter->level].b)); + btree_iter_advance_to_pos(iter, &iter->l[iter->level], -1); + } /* * Note: iter->nodes[iter->level] may be temporarily NULL here - that @@ -1100,12 +1133,12 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) return 0; } -int __must_check bch2_btree_iter_traverse(struct btree_iter *iter) +int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) { int ret; ret = bch2_trans_cond_resched(iter->trans) ?: - __bch2_btree_iter_traverse(iter); + btree_iter_traverse_one(iter); if (unlikely(ret)) ret = __btree_iter_traverse_all(iter->trans, iter, ret); @@ -1234,19 +1267,11 @@ void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); } -void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) +static unsigned btree_iter_pos_changed(struct btree_iter *iter, int cmp) { - int cmp = bkey_cmp(new_pos, iter->pos); - unsigned level; - - if (!cmp) - return; + unsigned l = btree_iter_up_until_good_node(iter, cmp); - iter->pos = new_pos; - - level = btree_iter_up_until_locked(iter, true); - - if (btree_iter_node(iter, level)) { + if (btree_iter_node(iter, l)) { /* * We might have to skip over many keys, or just a few: try * advancing the node iterator, and if we have to skip over too @@ -1254,37 +1279,98 @@ void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) * is expensive). */ if (cmp < 0 || - !btree_iter_advance_to_pos(iter, &iter->l[level], 8)) - __btree_iter_init(iter, level); + !btree_iter_advance_to_pos(iter, &iter->l[l], 8)) + __btree_iter_init(iter, l); /* Don't leave it locked if we're not supposed to: */ - if (btree_lock_want(iter, level) == BTREE_NODE_UNLOCKED) - btree_node_unlock(iter, level); + if (btree_lock_want(iter, l) == BTREE_NODE_UNLOCKED) + btree_node_unlock(iter, l); } - if (level != iter->level) + return l; +} + +void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) +{ + int cmp = bkey_cmp(new_pos, iter->pos); + unsigned l; + + if (!cmp) + return; + + iter->pos = new_pos; + + l = btree_iter_pos_changed(iter, cmp); + + if (l != iter->level) btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); else btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); } +static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) +{ + struct btree_iter_level *l = &iter->l[0]; + + iter->pos = l->b->key.k.p; + iter->uptodate = BTREE_ITER_NEED_TRAVERSE; + + if (!bkey_cmp(iter->pos, POS_MAX)) { + bkey_init(&iter->k); + iter->k.p = POS_MAX; + return false; + } + + iter->pos = btree_type_successor(iter->btree_id, iter->pos); + btree_iter_pos_changed(iter, 1); + return true; +} + +static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) +{ + struct btree_iter_level *l = &iter->l[0]; + + iter->pos = l->b->data->min_key; + iter->uptodate = BTREE_ITER_NEED_TRAVERSE; + + if (!bkey_cmp(iter->pos, POS_MIN)) { + bkey_init(&iter->k); + iter->k.p = POS_MIN; + return false; + } + + iter->pos = btree_type_predecessor(iter->btree_id, iter->pos); + btree_iter_pos_changed(iter, -1); + return true; +} + static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter) { 
struct btree_iter_level *l = &iter->l[0]; struct bkey_s_c ret = { .k = &iter->k }; if (!bkey_deleted(&iter->k)) { - EBUG_ON(bch2_btree_node_iter_end(&l->iter)); - ret.v = bkeyp_val(&l->b->format, - __bch2_btree_node_iter_peek_all(&l->iter, l->b)); + struct bkey_packed *_k = + __bch2_btree_node_iter_peek_all(&l->iter, l->b); + + ret.v = bkeyp_val(&l->b->format, _k); + + if (debug_check_iterators(iter->trans->c)) { + struct bkey k = bkey_unpack_key(l->b, _k); + BUG_ON(memcmp(&k, &iter->k, sizeof(k))); + } + + if (debug_check_bkeys(iter->trans->c)) + bch2_bkey_debugcheck(iter->trans->c, l->b, ret); } - if (debug_check_bkeys(iter->trans->c) && - !bkey_deleted(ret.k)) - bch2_bkey_debugcheck(iter->trans->c, l->b, ret); return ret; } +/** + * bch2_btree_iter_peek: returns first key greater than or equal to iterator's + * current position + */ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) { struct btree_iter_level *l = &iter->l[0]; @@ -1297,24 +1383,16 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) return btree_iter_peek_uptodate(iter); while (1) { - if (iter->uptodate >= BTREE_ITER_NEED_RELOCK) { - ret = bch2_btree_iter_traverse(iter); - if (unlikely(ret)) - return bkey_s_c_err(ret); - } + ret = bch2_btree_iter_traverse(iter); + if (unlikely(ret)) + return bkey_s_c_err(ret); k = __btree_iter_peek(iter, l); if (likely(k.k)) break; - /* got to the end of the leaf, iterator needs to be traversed: */ - iter->pos = l->b->key.k.p; - iter->uptodate = BTREE_ITER_NEED_TRAVERSE; - - if (!bkey_cmp(iter->pos, POS_MAX)) + if (!btree_iter_set_pos_to_next_leaf(iter)) return bkey_s_c_null; - - iter->pos = btree_type_successor(iter->btree_id, iter->pos); } /* @@ -1329,22 +1407,10 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) return k; } -static noinline -struct bkey_s_c bch2_btree_iter_peek_next_leaf(struct btree_iter *iter) -{ - struct btree_iter_level *l = &iter->l[0]; - - iter->pos = l->b->key.k.p; - iter->uptodate = BTREE_ITER_NEED_TRAVERSE; - - if (!bkey_cmp(iter->pos, POS_MAX)) - return bkey_s_c_null; - - iter->pos = btree_type_successor(iter->btree_id, iter->pos); - - return bch2_btree_iter_peek(iter); -} - +/** + * bch2_btree_iter_next: returns first key greater than iterator's current + * position + */ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) { struct btree_iter_level *l = &iter->l[0]; @@ -1353,15 +1419,19 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); - iter->pos = btree_type_successor(iter->btree_id, iter->k.p); - if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) { + if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) + return bkey_s_c_null; + /* * XXX: when we just need to relock we should be able to avoid * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK * for that to work */ - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + iter->uptodate = BTREE_ITER_NEED_TRAVERSE; + + bch2_btree_iter_set_pos(iter, + btree_type_successor(iter->btree_id, iter->k.p)); return bch2_btree_iter_peek(iter); } @@ -1369,9 +1439,12 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) do { bch2_btree_node_iter_advance(&l->iter, l->b); p = bch2_btree_node_iter_peek_all(&l->iter, l->b); - if (unlikely(!p)) - return bch2_btree_iter_peek_next_leaf(iter); - } while (bkey_whiteout(p)); + } while (likely(p) && bkey_whiteout(p)); + + if (unlikely(!p)) + return btree_iter_set_pos_to_next_leaf(iter) + ? 
bch2_btree_iter_peek(iter) + : bkey_s_c_null; k = __btree_iter_unpack(iter, l, &iter->k, p); @@ -1380,51 +1453,79 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) return k; } -struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) +/** + * bch2_btree_iter_peek_prev: returns first key less than or equal to + * iterator's current position + */ +struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) { struct btree_iter_level *l = &iter->l[0]; - struct bkey_packed *p; struct bkey_s_c k; int ret; bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); - if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) { - k = bch2_btree_iter_peek(iter); - if (IS_ERR(k.k)) - return k; - } + if (iter->uptodate == BTREE_ITER_UPTODATE) + return btree_iter_peek_uptodate(iter); while (1) { - p = bch2_btree_node_iter_prev(&l->iter, l->b); - if (likely(p)) - break; - - iter->pos = l->b->data->min_key; - if (!bkey_cmp(iter->pos, POS_MIN)) - return bkey_s_c_null; - - bch2_btree_iter_set_pos(iter, - btree_type_predecessor(iter->btree_id, iter->pos)); - ret = bch2_btree_iter_traverse(iter); if (unlikely(ret)) return bkey_s_c_err(ret); - p = bch2_btree_node_iter_peek(&l->iter, l->b); - if (p) + k = __btree_iter_peek(iter, l); + if (!k.k || + bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) + k = __btree_iter_prev(iter, l); + + if (likely(k.k)) break; - } - k = __btree_iter_unpack(iter, l, &iter->k, p); + if (!btree_iter_set_pos_to_prev_leaf(iter)) + return bkey_s_c_null; + } EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0); - iter->pos = bkey_start_pos(k.k); iter->uptodate = BTREE_ITER_UPTODATE; return k; } +/** + * bch2_btree_iter_prev: returns first key less than iterator's current + * position + */ +struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) +{ + struct btree_iter_level *l = &iter->l[0]; + struct bkey_s_c k; + + bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); + + if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) { + /* + * XXX: when we just need to relock we should be able to avoid + * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK + * for that to work + */ + iter->pos = btree_type_predecessor(iter->btree_id, + iter->pos); + iter->uptodate = BTREE_ITER_NEED_TRAVERSE; + + return bch2_btree_iter_peek_prev(iter); + } + + k = __btree_iter_prev(iter, l); + if (unlikely(!k.k)) + return btree_iter_set_pos_to_prev_leaf(iter) + ? 
bch2_btree_iter_peek(iter) + : bkey_s_c_null; + + EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) >= 0); + iter->pos = bkey_start_pos(k.k); + return k; +} + static inline struct bkey_s_c __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) { @@ -1436,8 +1537,7 @@ __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) recheck: while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k && - bkey_deleted(k.k) && - bkey_cmp(bkey_start_pos(k.k), iter->pos) == 0) + bkey_cmp(k.k->p, iter->pos) <= 0) bch2_btree_node_iter_advance(&l->iter, l->b); /* @@ -1477,6 +1577,8 @@ recheck: EBUG_ON(bkey_cmp(k.k->p, iter->pos) < 0); EBUG_ON(bkey_deleted(k.k)); iter->uptodate = BTREE_ITER_UPTODATE; + + __bch2_btree_iter_verify(iter, l->b); return k; } @@ -1507,6 +1609,8 @@ recheck: iter->k = n; iter->uptodate = BTREE_ITER_UPTODATE; + + __bch2_btree_iter_verify(iter, l->b); return (struct bkey_s_c) { &iter->k, NULL }; } @@ -1539,19 +1643,18 @@ recheck: goto recheck; } - if (k.k && - !bkey_deleted(k.k) && - !bkey_cmp(iter->pos, k.k->p)) { - iter->uptodate = BTREE_ITER_UPTODATE; - return k; - } else { + if (!k.k || + bkey_deleted(k.k) || + bkey_cmp(iter->pos, k.k->p)) { /* hole */ bkey_init(&iter->k); iter->k.p = iter->pos; - - iter->uptodate = BTREE_ITER_UPTODATE; - return (struct bkey_s_c) { &iter->k, NULL }; + k = (struct bkey_s_c) { &iter->k, NULL }; } + + iter->uptodate = BTREE_ITER_UPTODATE; + __bch2_btree_iter_verify(iter, l->b); + return k; } struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) @@ -1563,11 +1666,9 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) if (iter->uptodate == BTREE_ITER_UPTODATE) return btree_iter_peek_uptodate(iter); - if (iter->uptodate >= BTREE_ITER_NEED_RELOCK) { - ret = bch2_btree_iter_traverse(iter); - if (unlikely(ret)) - return bkey_s_c_err(ret); - } + ret = bch2_btree_iter_traverse(iter); + if (unlikely(ret)) + return bkey_s_c_err(ret); return __bch2_btree_iter_peek_slot(iter); } @@ -1669,7 +1770,10 @@ int bch2_trans_iter_free_on_commit(struct btree_trans *trans, static int bch2_trans_realloc_iters(struct btree_trans *trans, unsigned new_size) { - void *new_iters, *new_updates; + void *new_iters, *new_updates, *new_sorted; + size_t iters_bytes; + size_t updates_bytes; + size_t sorted_bytes; new_size = roundup_pow_of_two(new_size); @@ -1682,9 +1786,13 @@ static int bch2_trans_realloc_iters(struct btree_trans *trans, bch2_trans_unlock(trans); - new_iters = kmalloc(sizeof(struct btree_iter) * new_size + - sizeof(struct btree_insert_entry) * (new_size + 4), - GFP_NOFS); + iters_bytes = sizeof(struct btree_iter) * new_size; + updates_bytes = sizeof(struct btree_insert_entry) * (new_size + 4); + sorted_bytes = sizeof(u8) * (new_size + 4); + + new_iters = kmalloc(iters_bytes + + updates_bytes + + sorted_bytes, GFP_NOFS); if (new_iters) goto success; @@ -1693,7 +1801,8 @@ static int bch2_trans_realloc_iters(struct btree_trans *trans, trans->used_mempool = true; success: - new_updates = new_iters + sizeof(struct btree_iter) * new_size; + new_updates = new_iters + iters_bytes; + new_sorted = new_updates + updates_bytes; memcpy(new_iters, trans->iters, sizeof(struct btree_iter) * trans->nr_iters); @@ -1708,9 +1817,10 @@ success: if (trans->iters != trans->iters_onstack) kfree(trans->iters); - trans->iters = new_iters; - trans->updates = new_updates; - trans->size = new_size; + trans->iters = new_iters; + trans->updates = new_updates; + trans->updates_sorted = new_sorted; + trans->size = new_size; if (trans->iters_live) { 
trace_trans_restart_iters_realloced(trans->ip, trans->size); @@ -1779,6 +1889,12 @@ found: iter->flags &= ~(BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); iter->flags |= flags & (BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); + + if ((iter->flags & BTREE_ITER_INTENT) && + !bch2_btree_iter_upgrade(iter, 1)) { + trace_trans_restart_upgrade(trans->ip); + return ERR_PTR(-EINTR); + } } BUG_ON(iter->btree_id != btree_id); @@ -1949,6 +2065,7 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, trans->size = ARRAY_SIZE(trans->iters_onstack); trans->iters = trans->iters_onstack; trans->updates = trans->updates_onstack; + trans->updates_sorted = trans->updates_sorted_onstack; trans->fs_usage_deltas = NULL; if (expected_nr_iters > trans->size) @@ -1973,3 +2090,18 @@ int bch2_trans_exit(struct btree_trans *trans) return trans->error ? -EIO : 0; } + +void bch2_fs_btree_iter_exit(struct bch_fs *c) +{ + mempool_exit(&c->btree_iters_pool); +} + +int bch2_fs_btree_iter_init(struct bch_fs *c) +{ + unsigned nr = BTREE_ITER_MAX; + + return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, + sizeof(struct btree_iter) * nr + + sizeof(struct btree_insert_entry) * (nr + 4) + + sizeof(u8) * (nr + 4)); +} diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 9483ec8913e3..e4967215e1d9 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -134,7 +134,16 @@ void bch2_btree_iter_node_drop(struct btree_iter *, struct btree *); void bch2_btree_iter_reinit_node(struct btree_iter *, struct btree *); -int __must_check bch2_btree_iter_traverse(struct btree_iter *); +int __must_check __bch2_btree_iter_traverse(struct btree_iter *); + +static inline int __must_check +bch2_btree_iter_traverse(struct btree_iter *iter) +{ + return iter->uptodate >= BTREE_ITER_NEED_RELOCK + ? 
__bch2_btree_iter_traverse(iter) + : 0; +} + int bch2_btree_iter_traverse_all(struct btree_trans *); struct btree *bch2_btree_iter_peek_node(struct btree_iter *); @@ -142,6 +151,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *, unsigned); struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *); struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); + +struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *); struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *); @@ -242,7 +253,7 @@ static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter, (_start), (_flags))) ?: \ PTR_ERR_OR_ZERO(((_k) = \ __bch2_btree_iter_peek(_iter, _flags)).k); \ - !ret && (_k).k; \ + !_ret && (_k).k; \ (_ret) = PTR_ERR_OR_ZERO(((_k) = \ __bch2_btree_iter_next(_iter, _flags)).k)) @@ -303,4 +314,7 @@ void *bch2_trans_kmalloc(struct btree_trans *, size_t); void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t); int bch2_trans_exit(struct btree_trans *); +void bch2_fs_btree_iter_exit(struct bch_fs *); +int bch2_fs_btree_iter_init(struct bch_fs *); + #endif /* _BCACHEFS_BTREE_ITER_H */ diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 91aa30a6ed2f..b0da09630911 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -261,8 +261,6 @@ struct btree_insert_entry { }; bool deferred; - bool triggered; - bool marked; }; #define BTREE_ITER_MAX 64 @@ -291,6 +289,7 @@ struct btree_trans { struct btree_iter *iters; struct btree_insert_entry *updates; + u8 *updates_sorted; /* update path: */ struct journal_res journal_res; @@ -302,6 +301,7 @@ struct btree_trans { struct btree_iter iters_onstack[2]; struct btree_insert_entry updates_onstack[6]; + u8 updates_sorted_onstack[6]; struct replicas_delta_list *fs_usage_deltas; }; @@ -461,7 +461,13 @@ static inline enum btree_node_type btree_node_type(struct btree *b) static inline bool btree_node_type_is_extents(enum btree_node_type type) { - return type == BKEY_TYPE_EXTENTS; + switch (type) { + case BKEY_TYPE_EXTENTS: + case BKEY_TYPE_REFLINK: + return true; + default: + return false; + } } static inline bool btree_node_is_extents(struct btree *b) @@ -477,6 +483,7 @@ static inline bool btree_node_type_needs_gc(enum btree_node_type type) case BKEY_TYPE_EXTENTS: case BKEY_TYPE_INODES: case BKEY_TYPE_EC: + case BKEY_TYPE_REFLINK: return true; default: return false; diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 616c103c05ec..36e34b3d9213 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -43,7 +43,6 @@ enum { __BTREE_INSERT_USE_ALLOC_RESERVE, __BTREE_INSERT_JOURNAL_REPLAY, __BTREE_INSERT_JOURNAL_RESERVED, - __BTREE_INSERT_NOMARK_INSERT, __BTREE_INSERT_NOMARK_OVERWRITES, __BTREE_INSERT_NOMARK, __BTREE_INSERT_MARK_INMEM, @@ -81,9 +80,6 @@ enum { #define BTREE_INSERT_JOURNAL_RESERVED (1 << __BTREE_INSERT_JOURNAL_RESERVED) -/* Don't mark new key, just overwrites: */ -#define BTREE_INSERT_NOMARK_INSERT (1 << __BTREE_INSERT_NOMARK_INSERT) - /* Don't mark overwrites, just new key: */ #define BTREE_INSERT_NOMARK_OVERWRITES (1 << __BTREE_INSERT_NOMARK_OVERWRITES) @@ -123,8 +119,13 @@ int bch2_trans_commit(struct btree_trans *, struct disk_reservation *, u64 *, unsigned); -struct btree_insert_entry *bch2_trans_update(struct btree_trans *, - struct btree_insert_entry); +static inline void bch2_trans_update(struct btree_trans *trans, + struct btree_insert_entry entry) +{ + 
EBUG_ON(trans->nr_updates >= trans->nr_iters + 4); + + trans->updates[trans->nr_updates++] = entry; +} #define bch2_trans_do(_c, _journal_seq, _flags, _do) \ ({ \ @@ -144,18 +145,6 @@ struct btree_insert_entry *bch2_trans_update(struct btree_trans *, _ret; \ }) -/* - * We sort transaction entries so that if multiple iterators point to the same - * leaf node they'll be adjacent: - */ -static inline bool same_leaf_as_prev(struct btree_trans *trans, - struct btree_insert_entry *i) -{ - return i != trans->updates && - !i->deferred && - i[0].iter->l[0].b == i[-1].iter->l[0].b; -} - #define __trans_next_update(_trans, _i, _filter) \ ({ \ while ((_i) < (_trans)->updates + (_trans->nr_updates) && !(_filter))\ @@ -175,8 +164,4 @@ static inline bool same_leaf_as_prev(struct btree_trans *trans, #define trans_for_each_update_iter(trans, i) \ __trans_for_each_update(trans, i, !(i)->deferred) -#define trans_for_each_update_leaf(trans, i) \ - __trans_for_each_update(trans, i, !(i)->deferred && \ - !same_leaf_as_prev(trans, i)) - #endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 9294137719df..6813eddd26f5 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -194,7 +194,7 @@ found: : gc_pos_btree_root(as->btree_id)) >= 0 && gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) bch2_mark_key_locked(c, bkey_i_to_s_c(&d->key), - 0, NULL, 0, + 0, 0, NULL, 0, BCH_BUCKET_MARK_OVERWRITE| BCH_BUCKET_MARK_GC); } @@ -266,11 +266,12 @@ static void bch2_btree_node_free_ondisk(struct bch_fs *c, { BUG_ON(!pending->index_update_done); - bch2_mark_key(c, bkey_i_to_s_c(&pending->key), 0, NULL, 0, - BCH_BUCKET_MARK_OVERWRITE); + bch2_mark_key(c, bkey_i_to_s_c(&pending->key), + 0, 0, NULL, 0, BCH_BUCKET_MARK_OVERWRITE); if (gc_visited(c, gc_phase(GC_PHASE_PENDING_DELETE))) - bch2_mark_key(c, bkey_i_to_s_c(&pending->key), 0, NULL, 0, + bch2_mark_key(c, bkey_i_to_s_c(&pending->key), + 0, 0, NULL, 0, BCH_BUCKET_MARK_OVERWRITE| BCH_BUCKET_MARK_GC); } @@ -1077,11 +1078,11 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) fs_usage = bch2_fs_usage_scratch_get(c); bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key), - 0, fs_usage, 0, + 0, 0, fs_usage, 0, BCH_BUCKET_MARK_INSERT); if (gc_visited(c, gc_pos_btree_root(b->btree_id))) bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key), - 0, NULL, 0, + 0, 0, NULL, 0, BCH_BUCKET_MARK_INSERT| BCH_BUCKET_MARK_GC); @@ -1175,12 +1176,12 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b fs_usage = bch2_fs_usage_scratch_get(c); bch2_mark_key_locked(c, bkey_i_to_s_c(insert), - 0, fs_usage, 0, + 0, 0, fs_usage, 0, BCH_BUCKET_MARK_INSERT); if (gc_visited(c, gc_pos_btree_node(b))) bch2_mark_key_locked(c, bkey_i_to_s_c(insert), - 0, NULL, 0, + 0, 0, NULL, 0, BCH_BUCKET_MARK_INSERT| BCH_BUCKET_MARK_GC); @@ -2003,11 +2004,11 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, fs_usage = bch2_fs_usage_scratch_get(c); bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i), - 0, fs_usage, 0, + 0, 0, fs_usage, 0, BCH_BUCKET_MARK_INSERT); if (gc_visited(c, gc_pos_btree_root(b->btree_id))) bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i), - 0, NULL, 0, + 0, 0, NULL, 0, BCH_BUCKET_MARK_INSERT|| BCH_BUCKET_MARK_GC); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 4f12108bd6fe..0d32fb8726c7 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ 
-19,6 +19,26 @@ #include <linux/sort.h> #include <trace/events/bcachefs.h> +static inline bool same_leaf_as_prev(struct btree_trans *trans, + unsigned sorted_idx) +{ + struct btree_insert_entry *i = trans->updates + + trans->updates_sorted[sorted_idx]; + struct btree_insert_entry *prev = sorted_idx + ? trans->updates + trans->updates_sorted[sorted_idx - 1] + : NULL; + + return !i->deferred && + prev && + i->iter->l[0].b == prev->iter->l[0].b; +} + +#define trans_for_each_update_sorted(_trans, _i, _iter) \ + for (_iter = 0; \ + _iter < _trans->nr_updates && \ + (_i = _trans->updates + _trans->updates_sorted[_iter], 1); \ + _iter++) + inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, struct btree_iter *iter) { @@ -36,20 +56,21 @@ inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, bch2_btree_init_next(c, b, iter); } -static void btree_trans_lock_write(struct bch_fs *c, struct btree_trans *trans) +static void btree_trans_lock_write(struct btree_trans *trans, bool lock) { + struct bch_fs *c = trans->c; struct btree_insert_entry *i; + unsigned iter; - trans_for_each_update_leaf(trans, i) - bch2_btree_node_lock_for_insert(c, i->iter->l[0].b, i->iter); -} - -static void btree_trans_unlock_write(struct btree_trans *trans) -{ - struct btree_insert_entry *i; + trans_for_each_update_sorted(trans, i, iter) { + if (same_leaf_as_prev(trans, iter)) + continue; - trans_for_each_update_leaf(trans, i) - bch2_btree_node_unlock_write(i->iter->l[0].b, i->iter); + if (lock) + bch2_btree_node_lock_for_insert(c, i->iter->l[0].b, i->iter); + else + bch2_btree_node_unlock_write(i->iter->l[0].b, i->iter); + } } static inline int btree_trans_cmp(struct btree_insert_entry l, @@ -59,6 +80,30 @@ static inline int btree_trans_cmp(struct btree_insert_entry l, btree_iter_cmp(l.iter, r.iter); } +static inline void btree_trans_sort_updates(struct btree_trans *trans) +{ + struct btree_insert_entry *l, *r; + unsigned nr = 0, pos; + + trans_for_each_update(trans, l) { + for (pos = 0; pos < nr; pos++) { + r = trans->updates + trans->updates_sorted[pos]; + + if (btree_trans_cmp(*l, *r) <= 0) + break; + } + + memmove(&trans->updates_sorted[pos + 1], + &trans->updates_sorted[pos], + (nr - pos) * sizeof(trans->updates_sorted[0])); + + trans->updates_sorted[pos] = l - trans->updates; + nr++; + } + + BUG_ON(nr != trans->nr_updates); +} + /* Inserting into a given leaf node (last stage of insert): */ /* Handle overwrites and do insert, for non extents: */ @@ -106,7 +151,6 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, bch2_bset_delete(b, k, clobber_u64s); bch2_btree_node_iter_fix(iter, b, node_iter, k, clobber_u64s, 0); - bch2_btree_iter_verify(iter, b); return true; } @@ -116,7 +160,6 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, k->type = KEY_TYPE_deleted; bch2_btree_node_iter_fix(iter, b, node_iter, k, k->u64s, k->u64s); - bch2_btree_iter_verify(iter, b); if (bkey_whiteout(&insert->k)) { reserve_whiteout(b, k); @@ -138,10 +181,8 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, clobber_u64s = 0; overwrite: bch2_bset_insert(b, node_iter, k, insert, clobber_u64s); - if (k->u64s != clobber_u64s || bkey_whiteout(&insert->k)) - bch2_btree_node_iter_fix(iter, b, node_iter, k, - clobber_u64s, k->u64s); - bch2_btree_iter_verify(iter, b); + bch2_btree_node_iter_fix(iter, b, node_iter, k, + clobber_u64s, k->u64s); return true; } @@ -400,8 +441,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, BUG_ON(i->iter->level); 
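btree_trans_sort_updates() above is the piece that makes the new locking scheme work: the update entries themselves never move (triggers may still hold pointers into trans->updates), only the small updates_sorted index array is insertion-sorted, and btree_trans_lock_write() then takes node write locks in that order. A standalone sketch of the index sort, with a toy entry type standing in for struct btree_insert_entry:

    #include <stdio.h>
    #include <string.h>

    struct entry { int btree_id; long pos; };

    static int entry_cmp(const struct entry *l, const struct entry *r)
    {
            if (l->btree_id != r->btree_id)
                    return l->btree_id < r->btree_id ? -1 : 1;
            return l->pos < r->pos ? -1 : l->pos > r->pos;
    }

    /* Insertion-sort indices; entries stay put, so pointers to them
     * remain valid across the sort. */
    static void sort_indices(const struct entry *updates,
                             unsigned char *sorted, unsigned n)
    {
            unsigned nr = 0, pos, i;

            for (i = 0; i < n; i++) {
                    for (pos = 0; pos < nr; pos++)
                            if (entry_cmp(&updates[i],
                                          &updates[sorted[pos]]) <= 0)
                                    break;

                    memmove(&sorted[pos + 1], &sorted[pos],
                            (nr - pos) * sizeof(sorted[0]));
                    sorted[pos] = i;
                    nr++;
            }
    }

    int main(void)
    {
            struct entry u[4] = { {2, 10}, {1, 5}, {2, 3}, {1, 99} };
            unsigned char s[4];
            unsigned i;

            sort_indices(u, s, 4);
            for (i = 0; i < 4; i++)
                    printf("updates[%u]: btree %d pos %ld\n",
                           (unsigned) s[i], u[s[i]].btree_id, u[s[i]].pos);
            return 0;
    }

Quadratic insertion sort is a reasonable choice here: a transaction carries at most a handful of updates, and a consistent global lock order is what prevents two committing transactions from deadlocking against each other.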
BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos)); EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) && - !bch2_extent_is_atomic(i->k, i->iter)); - + bkey_cmp(i->k->k.p, i->iter->l[0].b->key.k.p) > 0); EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) && !(trans->flags & BTREE_INSERT_ATOMIC)); } @@ -489,12 +529,12 @@ static int btree_trans_check_can_insert(struct btree_trans *trans, struct btree_insert_entry **stopped_at) { struct btree_insert_entry *i; - unsigned u64s = 0; + unsigned iter, u64s = 0; int ret; - trans_for_each_update_iter(trans, i) { + trans_for_each_update_sorted(trans, i, iter) { /* Multiple inserts might go to same leaf: */ - if (!same_leaf_as_prev(trans, i)) + if (!same_leaf_as_prev(trans, iter)) u64s = 0; u64s += i->k->k.u64s; @@ -522,7 +562,8 @@ static inline bool update_triggers_transactional(struct btree_trans *trans, { return likely(!(trans->flags & BTREE_INSERT_MARK_INMEM)) && (i->iter->btree_id == BTREE_ID_EXTENTS || - i->iter->btree_id == BTREE_ID_INODES); + i->iter->btree_id == BTREE_ID_INODES || + i->iter->btree_id == BTREE_ID_REFLINK); } static inline bool update_has_triggers(struct btree_trans *trans, @@ -542,7 +583,6 @@ static inline int do_btree_insert_at(struct btree_trans *trans, struct bch_fs *c = trans->c; struct bch_fs_usage *fs_usage = NULL; struct btree_insert_entry *i; - bool saw_non_marked; unsigned mark_flags = trans->flags & BTREE_INSERT_BUCKET_INVALIDATE ? BCH_BUCKET_MARK_BUCKET_INVALIDATE : 0; @@ -551,31 +591,31 @@ static inline int do_btree_insert_at(struct btree_trans *trans, trans_for_each_update_iter(trans, i) BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK); + /* + * note: running triggers will append more updates to the list of + * updates as we're walking it: + */ trans_for_each_update_iter(trans, i) - i->marked = false; + if (update_has_triggers(trans, i) && + update_triggers_transactional(trans, i)) { + ret = bch2_trans_mark_update(trans, i->iter, i->k); + if (ret == -EINTR) + trace_trans_restart_mark(trans->ip); + if (ret) + goto out_clear_replicas; + } - do { - saw_non_marked = false; + trans_for_each_update(trans, i) + btree_insert_entry_checks(trans, i); + bch2_btree_trans_verify_locks(trans); - trans_for_each_update_iter(trans, i) { - if (i->marked) - continue; - - saw_non_marked = true; - i->marked = true; - - if (update_has_triggers(trans, i) && - update_triggers_transactional(trans, i)) { - ret = bch2_trans_mark_update(trans, i->iter, i->k); - if (ret == -EINTR) - trace_trans_restart_mark(trans->ip); - if (ret) - goto out_clear_replicas; - } - } - } while (saw_non_marked); + /* + * No more updates can be added - sort updates so we can take write + * locks in the correct order: + */ + btree_trans_sort_updates(trans); - btree_trans_lock_write(c, trans); + btree_trans_lock_write(trans, true); if (race_fault()) { ret = -EINTR; @@ -593,8 +633,7 @@ static inline int do_btree_insert_at(struct btree_trans *trans, goto out; trans_for_each_update_iter(trans, i) { - if (i->deferred || - !btree_node_type_needs_gc(i->iter->btree_id)) + if (!btree_node_type_needs_gc(i->iter->btree_id)) continue; if (!fs_usage) { @@ -660,7 +699,7 @@ out: (trans->flags & BTREE_INSERT_JOURNAL_RESERVED) && trans->journal_res.ref); - btree_trans_unlock_write(trans); + btree_trans_lock_write(trans, false); if (fs_usage) { bch2_fs_usage_scratch_put(c, fs_usage); @@ -685,19 +724,6 @@ int bch2_trans_commit_error(struct btree_trans *trans, { struct bch_fs *c = trans->c; unsigned flags = trans->flags; - struct btree_insert_entry *src, *dst; - - src = dst = 
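The new comment in do_btree_insert_at() is load-bearing: running a trigger can append more entries to trans->updates while trans_for_each_update_iter() is still walking the list, so the loop bound must be re-read on every iteration rather than captured once. The invariant in miniature (toy list, not the bcachefs macros):

    #include <stdio.h>

    #define MAX 16

    struct list { int v[MAX]; unsigned nr; };

    /* The loop re-reads l->nr each pass, so entries appended by the
     * body (the "triggers") are themselves visited before we finish. */
    static void walk_and_append(struct list *l)
    {
            unsigned i;

            for (i = 0; i < l->nr; i++)
                    if (l->v[i] % 2 == 0 && l->nr < MAX)
                            l->v[l->nr++] = l->v[i] + 1;
    }

    int main(void)
    {
            struct list l = { { 2, 3, 4 }, 3 };
            unsigned i;

            walk_and_append(&l);
            for (i = 0; i < l.nr; i++)
                    printf("%d ", l.v[i]);
            printf("\n");
            return 0;
    }

This is also why the sort is deferred until after the trigger pass: sorting first would be wasted work, since trigger-generated updates would land unsorted at the tail.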
trans->updates; - - while (src < trans->updates + trans->nr_updates) { - if (!src->triggered) { - *dst = *src; - dst++; - } - src++; - } - - trans->nr_updates = dst - trans->updates; /* * BTREE_INSERT_NOUNLOCK means don't unlock _after_ successful btree @@ -812,6 +838,7 @@ static int __bch2_trans_commit(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_insert_entry *i; + unsigned iter; int ret; trans_for_each_update_iter(trans, i) { @@ -833,8 +860,10 @@ static int __bch2_trans_commit(struct btree_trans *trans, if (trans->flags & BTREE_INSERT_NOUNLOCK) trans->nounlock = true; - trans_for_each_update_leaf(trans, i) - bch2_foreground_maybe_merge(c, i->iter, 0, trans->flags); + trans_for_each_update_sorted(trans, i, iter) + if (!same_leaf_as_prev(trans, iter)) + bch2_foreground_maybe_merge(c, i->iter, + 0, trans->flags); trans->nounlock = false; @@ -853,8 +882,9 @@ int bch2_trans_commit(struct btree_trans *trans, unsigned flags) { struct bch_fs *c = trans->c; - struct btree_insert_entry *i; - unsigned orig_mem_top = trans->mem_top; + struct btree_insert_entry *i = NULL; + unsigned orig_nr_updates = trans->nr_updates; + unsigned orig_mem_top = trans->mem_top; int ret = 0; if (!trans->nr_updates) @@ -875,10 +905,6 @@ int bch2_trans_commit(struct btree_trans *trans, trans->journal_seq = journal_seq; trans->flags = flags; - trans_for_each_update(trans, i) - btree_insert_entry_checks(trans, i); - bch2_btree_trans_verify_locks(trans); - if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW) && !percpu_ref_tryget(&c->writes))) { if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW))) @@ -923,8 +949,6 @@ out_noupdates: bch2_trans_unlink_iters(trans, ~trans->iters_touched| trans->iters_unlink_on_commit); trans->iters_touched = 0; - } else { - bch2_trans_unlink_iters(trans, trans->iters_unlink_on_commit); } trans->nr_updates = 0; trans->mem_top = 0; @@ -933,39 +957,20 @@ out_noupdates: err: ret = bch2_trans_commit_error(trans, i, ret); + /* free updates and memory used by triggers, they'll be reexecuted: */ + trans->nr_updates = orig_nr_updates; + trans->mem_top = orig_mem_top; + /* can't loop if it was passed in and we changed it: */ if (unlikely(trans->flags & BTREE_INSERT_NO_CLEAR_REPLICAS) && !ret) ret = -EINTR; - if (!ret) { - /* free memory used by triggers, they'll be reexecuted: */ - trans->mem_top = orig_mem_top; + if (!ret) goto retry; - } goto out; } -struct btree_insert_entry *bch2_trans_update(struct btree_trans *trans, - struct btree_insert_entry entry) -{ - struct btree_insert_entry *i; - - BUG_ON(trans->nr_updates >= trans->nr_iters + 4); - - for (i = trans->updates; - i < trans->updates + trans->nr_updates; - i++) - if (btree_trans_cmp(entry, *i) < 0) - break; - - memmove(&i[1], &i[0], - (void *) &trans->updates[trans->nr_updates] - (void *) i); - trans->nr_updates++; - *i = entry; - return i; -} - /** * bch2_btree_insert - insert keys into the extent btree * @c: pointer to struct bch_fs @@ -1033,7 +1038,10 @@ retry: /* create the biggest key we can */ bch2_key_resize(&delete.k, max_sectors); bch2_cut_back(end, &delete.k); - bch2_extent_trim_atomic(&delete, iter); + + ret = bch2_extent_trim_atomic(&delete, iter); + if (ret) + break; } bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &delete)); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index b6b3ac5111ca..6a4773a92029 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -405,7 +405,8 @@ int bch2_fs_usage_apply(struct bch_fs *c, */ should_not_have_added = added - (s64) (disk_res ? 
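The retry handling gets simpler and more correct at once: instead of filtering out entries flagged `triggered` (the flag is gone from struct btree_insert_entry entirely), bch2_trans_commit() snapshots nr_updates and mem_top before doing anything, and on error restores both, so everything the triggers queued or allocated is discarded and regenerated on the next attempt. The shape of that pattern as a bump allocator with rollback (illustrative only, not the bcachefs API):

    #include <stdio.h>

    struct arena { char buf[256]; unsigned top; };

    static void *arena_alloc(struct arena *a, unsigned n)
    {
            void *p;

            if (a->top + n > sizeof(a->buf))
                    return NULL;
            p = a->buf + a->top;
            a->top += n;
            return p;
    }

    int main(void)
    {
            struct arena a = { .top = 0 };
            unsigned snapshot;
            int attempt;

            for (attempt = 0; attempt < 3; attempt++) {
                    snapshot = a.top;         /* like orig_mem_top */
                    arena_alloc(&a, 32);      /* work done by "triggers" */

                    if (attempt < 2) {        /* pretend commit hit -EINTR */
                            a.top = snapshot; /* roll back, rerun triggers */
                            continue;
                    }
                    break;                    /* commit succeeded */
            }
            printf("final top: %u (one attempt's worth)\n", a.top);
            return 0;
    }

The diff's own comment in the error path states the contract directly: updates and trigger memory are freed because "they'll be reexecuted".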
disk_res->sectors : 0); if (WARN_ONCE(should_not_have_added > 0, - "disk usage increased without a reservation")) { + "disk usage increased by %lli without a reservation", + should_not_have_added)) { atomic64_sub(should_not_have_added, &c->sectors_available); added -= should_not_have_added; ret = -1; @@ -444,12 +445,6 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, percpu_rwsem_assert_held(&c->mark_lock); - bch2_fs_inconsistent_on(old.data_type && new.data_type && - old.data_type != new.data_type, c, - "different types of data in same bucket: %s, %s", - bch2_data_types[old.data_type], - bch2_data_types[new.data_type]); - preempt_disable(); dev_usage = this_cpu_ptr(ca->usage[gc]); @@ -504,14 +499,6 @@ void bch2_dev_usage_from_buckets(struct bch_fs *c) } } -#define bucket_data_cmpxchg(c, ca, fs_usage, g, new, expr) \ -({ \ - struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \ - \ - bch2_dev_usage_update(c, ca, fs_usage, _old, new, gc); \ - _old; \ -}) - static inline void update_replicas(struct bch_fs *c, struct bch_fs_usage *fs_usage, struct bch_replicas_entry *r, @@ -520,7 +507,6 @@ static inline void update_replicas(struct bch_fs *c, int idx = bch2_replicas_entry_idx(c, r); BUG_ON(idx < 0); - BUG_ON(!sectors); switch (r->data_type) { case BCH_DATA_BTREE: @@ -569,8 +555,12 @@ static inline void update_replicas_list(struct btree_trans *trans, { struct replicas_delta_list *d; struct replicas_delta *n; - unsigned b = replicas_entry_bytes(r) + 8; + unsigned b; + + if (!sectors) + return; + b = replicas_entry_bytes(r) + 8; d = replicas_deltas_realloc(trans, b); n = (void *) d->d + d->used; @@ -629,17 +619,18 @@ static int __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, struct bucket *g = __bucket(ca, b, gc); struct bucket_mark old, new; - old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({ + old = bucket_cmpxchg(g, new, ({ BUG_ON(!is_available_bucket(new)); new.owned_by_allocator = true; - new.dirty = true; new.data_type = 0; new.cached_sectors = 0; new.dirty_sectors = 0; new.gen++; })); + bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); + if (old.cached_sectors) update_cached_sectors(c, fs_usage, ca->dev_idx, -((s64) old.cached_sectors)); @@ -668,10 +659,12 @@ static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, struct bucket *g = __bucket(ca, b, gc); struct bucket_mark old, new; - old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({ + old = bucket_cmpxchg(g, new, ({ new.owned_by_allocator = owned_by_allocator; })); + bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); + BUG_ON(!gc && !owned_by_allocator && !old.owned_by_allocator); @@ -773,11 +766,16 @@ static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, type != BCH_DATA_JOURNAL); old = bucket_cmpxchg(g, new, ({ - new.dirty = true; new.data_type = type; overflow = checked_add(new.dirty_sectors, sectors); })); + bch2_fs_inconsistent_on(old.data_type && + old.data_type != type, c, + "different types of data in same bucket: %s, %s", + bch2_data_types[old.data_type], + bch2_data_types[type]); + bch2_fs_inconsistent_on(overflow, c, "bucket sector count overflow: %u + %u > U16_MAX", old.dirty_sectors, sectors); @@ -810,23 +808,24 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, } static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p, - s64 delta) + unsigned offset, s64 delta, + unsigned flags) { - if (delta > 0) { - /* - * marking a new extent, which _will have size_ @delta - * - * in the bch2_mark_update -> 
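The bucket-marking changes here drop the bucket_data_cmpxchg() wrapper but keep its core idiom: snapshot the packed bucket mark, compute the new value, retry the compare-exchange until no other CPU raced with us, then derive usage deltas from old vs new. A user-space sketch of that loop using C11 atomics (the kernel uses its own cmpxchg() and a richer struct bucket_mark):

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    union mark {
            struct { uint16_t dirty_sectors; uint8_t gen; } m;
            uint32_t v;     /* the whole mark updates atomically */
    };

    static union mark bucket_add_dirty(_Atomic uint32_t *bucket,
                                       uint16_t sectors)
    {
            union mark old, new;

            old.v = atomic_load(bucket);
            do {
                    new.v = old.v;
                    new.m.dirty_sectors += sectors;
                    /* on failure, old.v is reloaded with the current value */
            } while (!atomic_compare_exchange_weak(bucket, &old.v, new.v));

            return old;     /* callers compute usage deltas from old vs new */
    }

    int main(void)
    {
            _Atomic uint32_t b = 0;
            union mark m;

            bucket_add_dirty(&b, 8);
            m.v = atomic_load(&b);
            printf("dirty_sectors=%u\n", m.m.dirty_sectors);
            return 0;
    }

Hoisting bch2_dev_usage_update() out of the macro into an explicit call after each cmpxchg makes the ordering visible at every call site, at the cost of a little repetition.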
BCH_EXTENT_OVERLAP_MIDDLE - * case, we haven't actually created the key we'll be inserting - * yet (for the split) - so we don't want to be using - * k->size/crc.live_size here: - */ - return __ptr_disk_sectors(p, delta); + if (flags & BCH_BUCKET_MARK_OVERWRITE_SPLIT) { + BUG_ON(offset + -delta > p.crc.live_size); + + return -((s64) ptr_disk_sectors(p)) + + __ptr_disk_sectors(p, offset) + + __ptr_disk_sectors(p, p.crc.live_size - + offset + delta); + } else if (flags & BCH_BUCKET_MARK_OVERWRITE) { + BUG_ON(offset + -delta > p.crc.live_size); + + return -((s64) ptr_disk_sectors(p)) + + __ptr_disk_sectors(p, p.crc.live_size + + delta); } else { - BUG_ON(-delta > p.crc.live_size); - - return (s64) __ptr_disk_sectors(p, p.crc.live_size + delta) - - (s64) ptr_disk_sectors(p); + return ptr_disk_sectors(p); } } @@ -846,16 +845,35 @@ static void bucket_set_stripe(struct bch_fs *c, struct bucket *g = PTR_BUCKET(ca, ptr, gc); struct bucket_mark new, old; - BUG_ON(ptr_stale(ca, ptr)); - - old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({ - new.dirty = true; + old = bucket_cmpxchg(g, new, ({ new.stripe = enabled; if (journal_seq) { new.journal_seq_valid = 1; new.journal_seq = journal_seq; } })); + + bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); + + /* + * XXX write repair code for these, flag stripe as possibly bad + */ + if (old.gen != ptr->gen) + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "stripe with stale pointer"); +#if 0 + /* + * We'd like to check for these, but these checks don't work + * yet: + */ + if (old.stripe && enabled) + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "multiple stripes using same bucket"); + + if (!old.stripe && !enabled) + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "deleting stripe but bucket not marked as stripe bucket"); +#endif } } @@ -876,17 +894,23 @@ static bool bch2_mark_pointer(struct bch_fs *c, do { new.v.counter = old.v.counter = v; - new.dirty = true; - /* * Check this after reading bucket mark to guard against * the allocator invalidating a bucket after we've already * checked the gen */ - if (gen_after(new.gen, p.ptr.gen)) { - BUG_ON(!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags)); - EBUG_ON(!p.ptr.cached && - test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)); + if (gen_after(p.ptr.gen, new.gen)) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "pointer gen in the future"); + return true; + } + + if (new.gen != p.ptr.gen) { + /* XXX write repair code for this */ + if (!p.ptr.cached && + test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "stale dirty pointer"); return true; } @@ -915,6 +939,14 @@ static bool bch2_mark_pointer(struct bch_fs *c, old.v.counter, new.v.counter)) != old.v.counter); + if (old.data_type && old.data_type != data_type) + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %u:%zu gen %u different types of data in same bucket: %s, %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + new.gen, + bch2_data_types[old.data_type], + bch2_data_types[data_type]); + bch2_fs_inconsistent_on(overflow, c, "bucket sector count overflow: %u + %lli > U16_MAX", !p.ptr.cached @@ -950,7 +982,7 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, spin_unlock(&c->ec_stripes_heap_lock); bch_err_ratelimited(c, "pointer to nonexistent stripe %llu", (u64) p.idx); - return -1; + return -EIO; } BUG_ON(m->r.e.data_type != data_type); @@ -985,7 +1017,8 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, } static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, - s64 
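The rewritten ptr_disk_sectors_delta() takes the overwrite described as (offset, delta, flags) instead of a bare size change. The OVERWRITE_SPLIT case is the subtle one: the middle of an extent is overwritten, so the accounting credits back the entire old extent and then re-charges the surviving front and tail pieces, each scaled by the compression ratio. Standalone arithmetic, assuming the same round-up scaling as __ptr_disk_sectors():

    #include <assert.h>
    #include <stdio.h>

    /* Disk sectors used by `live` logical sectors at a given
     * compressed/uncompressed ratio, rounded up. */
    static long disk_sectors(long compressed, long uncompressed, long live)
    {
            return (compressed * live + uncompressed - 1) / uncompressed;
    }

    /* Middle overwrite: -delta sectors removed starting at `offset`
     * within an extent currently holding `live` live sectors. */
    static long overwrite_split_delta(long compressed, long uncompressed,
                                      long live, long offset, long delta)
    {
            assert(delta <= 0 && offset - delta <= live);

            return -disk_sectors(compressed, uncompressed, live)
                 +  disk_sectors(compressed, uncompressed, offset)
                 +  disk_sectors(compressed, uncompressed,
                                 live - (offset - delta));
    }

    int main(void)
    {
            /* 2:1 compression, 100 live sectors, middle 10 cut out at 40 */
            printf("%ld\n", overwrite_split_delta(50, 100, 100, 40, -10));
            return 0;
    }

With 2:1 compression, removing 10 live sectors frees about 5 disk sectors, which is what this prints (-5); the old code could not express the split case without marking the same key a second time.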
sectors, enum bch_data_type data_type, + unsigned offset, s64 sectors, + enum bch_data_type data_type, struct bch_fs_usage *fs_usage, unsigned journal_seq, unsigned flags) { @@ -1006,12 +1039,12 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { s64 disk_sectors = data_type == BCH_DATA_BTREE ? sectors - : ptr_disk_sectors_delta(p, sectors); + : ptr_disk_sectors_delta(p, offset, sectors, flags); bool stale = bch2_mark_pointer(c, p, disk_sectors, data_type, fs_usage, journal_seq, flags); if (p.ptr.cached) { - if (disk_sectors && !stale) + if (!stale) update_cached_sectors(c, fs_usage, p.ptr.dev, disk_sectors); } else if (!p.ec_nr) { @@ -1030,8 +1063,7 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, } } - if (dirty_sectors) - update_replicas(c, fs_usage, &r.e, dirty_sectors); + update_replicas(c, fs_usage, &r.e, dirty_sectors); return 0; } @@ -1095,7 +1127,8 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, } int bch2_mark_key_locked(struct bch_fs *c, - struct bkey_s_c k, s64 sectors, + struct bkey_s_c k, + unsigned offset, s64 sectors, struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { @@ -1116,11 +1149,12 @@ int bch2_mark_key_locked(struct bch_fs *c, ? c->opts.btree_node_size : -c->opts.btree_node_size; - ret = bch2_mark_extent(c, k, sectors, BCH_DATA_BTREE, + ret = bch2_mark_extent(c, k, offset, sectors, BCH_DATA_BTREE, fs_usage, journal_seq, flags); break; case KEY_TYPE_extent: - ret = bch2_mark_extent(c, k, sectors, BCH_DATA_USER, + case KEY_TYPE_reflink_v: + ret = bch2_mark_extent(c, k, offset, sectors, BCH_DATA_USER, fs_usage, journal_seq, flags); break; case KEY_TYPE_stripe: @@ -1151,14 +1185,14 @@ int bch2_mark_key_locked(struct bch_fs *c, } int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, - s64 sectors, + unsigned offset, s64 sectors, struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { int ret; percpu_down_read(&c->mark_lock); - ret = bch2_mark_key_locked(c, k, sectors, + ret = bch2_mark_key_locked(c, k, offset, sectors, fs_usage, journal_seq, flags); percpu_up_read(&c->mark_lock); @@ -1174,8 +1208,11 @@ inline int bch2_mark_overwrite(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree *b = iter->l[0].b; + unsigned offset = 0; s64 sectors = 0; + flags |= BCH_BUCKET_MARK_OVERWRITE; + if (btree_node_is_extents(b) ? 
bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0 : bkey_cmp(new->k.p, old.k->p)) @@ -1184,35 +1221,33 @@ inline int bch2_mark_overwrite(struct btree_trans *trans, if (btree_node_is_extents(b)) { switch (bch2_extent_overlap(&new->k, old.k)) { case BCH_EXTENT_OVERLAP_ALL: + offset = 0; sectors = -((s64) old.k->size); break; case BCH_EXTENT_OVERLAP_BACK: + offset = bkey_start_offset(&new->k) - + bkey_start_offset(old.k); sectors = bkey_start_offset(&new->k) - old.k->p.offset; break; case BCH_EXTENT_OVERLAP_FRONT: + offset = 0; sectors = bkey_start_offset(old.k) - new->k.p.offset; break; case BCH_EXTENT_OVERLAP_MIDDLE: - sectors = old.k->p.offset - new->k.p.offset; - BUG_ON(sectors <= 0); - - bch2_mark_key_locked(c, old, sectors, - fs_usage, trans->journal_res.seq, - BCH_BUCKET_MARK_INSERT|flags); - - sectors = bkey_start_offset(&new->k) - - old.k->p.offset; + offset = bkey_start_offset(&new->k) - + bkey_start_offset(old.k); + sectors = -((s64) new->k.size); + flags |= BCH_BUCKET_MARK_OVERWRITE_SPLIT; break; } BUG_ON(sectors >= 0); } - return bch2_mark_key_locked(c, old, sectors, fs_usage, - trans->journal_res.seq, - BCH_BUCKET_MARK_OVERWRITE|flags) ?: 1; + return bch2_mark_key_locked(c, old, offset, sectors, fs_usage, + trans->journal_res.seq, flags) ?: 1; } int bch2_mark_update(struct btree_trans *trans, @@ -1230,12 +1265,10 @@ int bch2_mark_update(struct btree_trans *trans, if (!btree_node_type_needs_gc(iter->btree_id)) return 0; - if (!(trans->flags & BTREE_INSERT_NOMARK_INSERT)) - bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), - bpos_min(insert->k->k.p, b->key.k.p).offset - - bkey_start_offset(&insert->k->k), - fs_usage, trans->journal_res.seq, - BCH_BUCKET_MARK_INSERT|flags); + bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), + 0, insert->k->k.size, + fs_usage, trans->journal_res.seq, + BCH_BUCKET_MARK_INSERT|flags); if (unlikely(trans->flags & BTREE_INSERT_NOMARK_OVERWRITES)) return 0; @@ -1280,7 +1313,8 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, xchg(&warned_disk_usage, 1)) return; - pr_err("disk usage increased more than %llu sectors reserved", disk_res_sectors); + bch_err(c, "disk usage increased more than %llu sectors reserved", + disk_res_sectors); trans_for_each_update_iter(trans, i) { struct btree_iter *iter = i->iter; @@ -1295,7 +1329,7 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, node_iter = iter->l[0].iter; while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, - KEY_TYPE_discard))) { + KEY_TYPE_discard))) { struct bkey unpacked; struct bkey_s_c k; @@ -1321,16 +1355,18 @@ static int trans_get_key(struct btree_trans *trans, struct btree_iter **iter, struct bkey_s_c *k) { - unsigned i; + struct btree_insert_entry *i; int ret; - for (i = 0; i < trans->nr_updates; i++) - if (!trans->updates[i].deferred && - trans->updates[i].iter->btree_id == btree_id && - !bkey_cmp(pos, trans->updates[i].iter->pos)) { - *iter = trans->updates[i].iter; - *k = bkey_i_to_s_c(trans->updates[i].k); - return 0; + trans_for_each_update_iter(trans, i) + if (i->iter->btree_id == btree_id && + (btree_node_type_is_extents(btree_id) + ? 
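Both bch2_mark_overwrite() above and bch2_trans_mark_update() later in this diff now reduce every extent overlap to a single (offset, sectors) pair, where `offset` says how far into the old extent the overwrite begins and `sectors` is the (negative) amount removed, with BCH_BUCKET_MARK_OVERWRITE_SPLIT flagging the middle case. That replaces the old MIDDLE handling, which issued an extra INSERT-flagged mark for the split remainder. The classification, self-contained (half-open sector ranges, toy coordinates):

    #include <stdio.h>

    enum overlap { ALL, FRONT, BACK, MIDDLE };

    static enum overlap classify(long os, long oe, long ns, long ne)
    {
            if (ns <= os && ne >= oe) return ALL;
            if (ns <= os)             return FRONT;
            if (ne >= oe)             return BACK;
            return MIDDLE;
    }

    /* old extent [os, oe), new insert [ns, ne) */
    static void overwrite_delta(long os, long oe, long ns, long ne,
                                long *offset, long *sectors, int *split)
    {
            *split = 0;

            switch (classify(os, oe, ns, ne)) {
            case ALL:    *offset = 0;       *sectors = -(oe - os); break;
            case FRONT:  *offset = 0;       *sectors = -(ne - os); break;
            case BACK:   *offset = ns - os; *sectors = -(oe - ns); break;
            case MIDDLE: *offset = ns - os; *sectors = -(ne - ns);
                         *split = 1;                               break;
            }
    }

    int main(void)
    {
            long off, sec;
            int split;

            overwrite_delta(0, 100, 40, 60, &off, &sec, &split);
            printf("offset=%ld sectors=%ld split=%d\n", off, sec, split);
            return 0;
    }

The split flag is what lets ptr_disk_sectors_delta() earlier in the diff do the front-plus-tail math in one call.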
bkey_cmp(pos, bkey_start_pos(&i->k->k)) >= 0 && + bkey_cmp(pos, i->k->k.p) < 0 + : !bkey_cmp(pos, i->iter->pos))) { + *iter = i->iter; + *k = bkey_i_to_s_c(i->k); + return 1; } *iter = __bch2_trans_get_iter(trans, btree_id, pos, @@ -1338,6 +1374,8 @@ static int trans_get_key(struct btree_trans *trans, if (IS_ERR(*iter)) return PTR_ERR(*iter); + bch2_trans_iter_free_on_commit(trans, *iter); + *k = bch2_btree_iter_peek_slot(*iter); ret = bkey_err(*k); if (ret) @@ -1349,8 +1387,8 @@ static void *trans_update_key(struct btree_trans *trans, struct btree_iter *iter, unsigned u64s) { + struct btree_insert_entry *i; struct bkey_i *new_k; - unsigned i; new_k = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); if (IS_ERR(new_k)) @@ -1359,19 +1397,13 @@ static void *trans_update_key(struct btree_trans *trans, bkey_init(&new_k->k); new_k->k.p = iter->pos; - for (i = 0; i < trans->nr_updates; i++) - if (!trans->updates[i].deferred && - trans->updates[i].iter == iter) { - trans->updates[i].k = new_k; + trans_for_each_update_iter(trans, i) + if (i->iter == iter) { + i->k = new_k; return new_k; } - bch2_trans_update(trans, ((struct btree_insert_entry) { - .iter = iter, - .k = new_k, - .triggered = true, - })); - + bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, new_k)); return new_k; } @@ -1385,43 +1417,76 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, struct bkey_s_c k; struct bkey_alloc_unpacked u; struct bkey_i_alloc *a; + unsigned old; bool overflow; int ret; ret = trans_get_key(trans, BTREE_ID_ALLOC, POS(p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr)), &iter, &k); - if (ret) + if (ret < 0) return ret; - if (k.k->type != KEY_TYPE_alloc) { - bch_err_ratelimited(c, "pointer to nonexistent bucket %u:%zu", - p.ptr.dev, - PTR_BUCKET_NR(ca, &p.ptr)); - ret = -1; - goto out; - } + if (!ret) { + /* + * During journal replay, and if gc repairs alloc info at + * runtime, the alloc info in the btree might not be up to date + * yet - so, trust the in memory mark: + */ + struct bucket *g; + struct bucket_mark m; - u = bch2_alloc_unpack(k); + percpu_down_read(&c->mark_lock); + g = bucket(ca, iter->pos.offset); + m = READ_ONCE(g->mark); + u = alloc_mem_to_key(g, m); + percpu_up_read(&c->mark_lock); + } else { + /* + * Unless we're already updating that key: + */ + if (k.k->type != KEY_TYPE_alloc) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "pointer to nonexistent bucket %llu:%llu", + iter->pos.inode, iter->pos.offset); + ret = -1; + goto out; + } + + u = bch2_alloc_unpack(k); + } if (gen_after(u.gen, p.ptr.gen)) { ret = 1; goto out; } - if (!p.ptr.cached) + if (u.data_type && u.data_type != data_type) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s", + iter->pos.inode, iter->pos.offset, + u.gen, + bch2_data_types[u.data_type], + bch2_data_types[data_type]); + ret = -1; + goto out; + } + + if (!p.ptr.cached) { + old = u.dirty_sectors; overflow = checked_add(u.dirty_sectors, sectors); - else + } else { + old = u.cached_sectors; overflow = checked_add(u.cached_sectors, sectors); + } u.data_type = u.dirty_sectors || u.cached_sectors ? data_type : 0; bch2_fs_inconsistent_on(overflow, c, "bucket sector count overflow: %u + %lli > U16_MAX", - !p.ptr.cached - ? 
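trans_get_key() now distinguishes its two outcomes: it returns 1 when the key is served from the transaction's own pending updates (using range containment, start <= pos < end, for extent-style btrees) and 0 when it had to create an iterator and read the btree. bch2_trans_mark_pointer() uses that bit to decide whether to trust the in-memory bucket mark instead, since during journal replay the on-disk alloc key may be stale. The overlay-lookup shape, reduced to a toy (names hypothetical):

    #include <stdio.h>
    #include <string.h>

    struct pending { const char *key; const char *val; };

    static const char *backing_read(const char *key)
    {
            (void) key;
            return "committed-value";        /* stand-in for a btree read */
    }

    /* Pending updates shadow the backing store; the return value tells
     * the caller which source served the key. */
    static int overlay_get(const struct pending *p, unsigned nr,
                           const char *key, const char **val)
    {
            unsigned i;

            for (i = 0; i < nr; i++)
                    if (!strcmp(p[i].key, key)) {
                            *val = p[i].val;
                            return 1;        /* from the transaction */
                    }

            *val = backing_read(key);
            return 0;                        /* from the backing store */
    }

    int main(void)
    {
            struct pending p[] = { { "a", "uncommitted-value" } };
            const char *v;
            int hit;

            hit = overlay_get(p, 1, "a", &v);
            printf("a: %s (pending=%d)\n", v, hit);
            hit = overlay_get(p, 1, "b", &v);
            printf("b: %s (pending=%d)\n", v, hit);
            return 0;
    }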
u.dirty_sectors - : u.cached_sectors, sectors); + old, sectors); + BUG_ON(overflow); a = trans_update_key(trans, iter, BKEY_ALLOC_U64s_MAX); ret = PTR_ERR_OR_ZERO(a); @@ -1440,6 +1505,7 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, struct bch_extent_stripe_ptr p, s64 sectors, enum bch_data_type data_type) { + struct bch_fs *c = trans->c; struct bch_replicas_padded r; struct btree_iter *iter; struct bkey_i *new_k; @@ -1449,17 +1515,15 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, s64 parity_sectors; int ret = 0; - BUG_ON(!sectors); - ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.idx), &iter, &k); - if (ret) + if (ret < 0) return ret; if (k.k->type != KEY_TYPE_stripe) { - bch_err_ratelimited(trans->c, - "pointer to nonexistent stripe %llu", - (u64) p.idx); - ret = -1; + bch2_fs_inconsistent(c, + "pointer to nonexistent stripe %llu", + (u64) p.idx); + ret = -EIO; goto out; } @@ -1491,8 +1555,9 @@ out: } static int bch2_trans_mark_extent(struct btree_trans *trans, - struct bkey_s_c k, - s64 sectors, enum bch_data_type data_type) + struct bkey_s_c k, unsigned offset, + s64 sectors, unsigned flags, + enum bch_data_type data_type) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; @@ -1512,7 +1577,7 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { s64 disk_sectors = data_type == BCH_DATA_BTREE ? sectors - : ptr_disk_sectors_delta(p, sectors); + : ptr_disk_sectors_delta(p, offset, sectors, flags); ret = bch2_trans_mark_pointer(trans, p, disk_sectors, data_type); @@ -1522,7 +1587,7 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, stale = ret > 0; if (p.ptr.cached) { - if (disk_sectors && !stale) + if (!stale) update_cached_sectors_list(trans, p.ptr.dev, disk_sectors); } else if (!p.ec_nr) { @@ -1540,15 +1605,92 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, } } - if (dirty_sectors) - update_replicas_list(trans, &r.e, dirty_sectors); + update_replicas_list(trans, &r.e, dirty_sectors); return 0; } -int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, +static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, + u64 idx, unsigned sectors, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct btree_iter *iter; + struct bkey_i *new_k; + struct bkey_s_c k; + struct bkey_i_reflink_v *r_v; + s64 ret; + + ret = trans_get_key(trans, BTREE_ID_REFLINK, + POS(0, idx), &iter, &k); + if (ret < 0) + return ret; + + if (k.k->type != KEY_TYPE_reflink_v) { + bch2_fs_inconsistent(c, + "%llu:%llu len %u points to nonexistent indirect extent %llu", + p.k->p.inode, p.k->p.offset, p.k->size, idx); + ret = -EIO; + goto err; + } + + if ((flags & BCH_BUCKET_MARK_OVERWRITE) && + (bkey_start_offset(k.k) < idx || + k.k->p.offset > idx + sectors)) + goto out; + + bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); + BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); + + new_k = trans_update_key(trans, iter, k.k->u64s); + ret = PTR_ERR_OR_ZERO(new_k); + if (ret) + goto err; + + bkey_reassemble(new_k, k); + r_v = bkey_i_to_reflink_v(new_k); + + le64_add_cpu(&r_v->v.refcount, + !(flags & BCH_BUCKET_MARK_OVERWRITE) ? 
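The sector counts adjusted above in bch2_trans_mark_pointer() live in u16 fields, so every addition goes through checked_add(), and overflow is both reported via bch2_fs_inconsistent_on(), now with the saved `old` value for a readable message, and trapped by the new BUG_ON(overflow). A user-space equivalent of the guard (the kernel macro differs in detail; this version refuses the update rather than clamping, an intentional simplification):

    #include <stdint.h>
    #include <stdio.h>

    /* Add a signed delta to a u16 counter; report instead of wrapping. */
    static int checked_add_u16(uint16_t *dst, int64_t delta)
    {
            int64_t sum = (int64_t) *dst + delta;
            int overflow = sum < 0 || sum > UINT16_MAX;

            if (!overflow)
                    *dst = (uint16_t) sum;
            return overflow;
    }

    int main(void)
    {
            uint16_t sectors = 65000;
            int ovf;

            ovf = checked_add_u16(&sectors, 1000);
            printf("overflow=%d sectors=%u\n", ovf, (unsigned) sectors);

            ovf = checked_add_u16(&sectors, 100);
            printf("overflow=%d sectors=%u\n", ovf, (unsigned) sectors);
            return 0;
    }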
1 : -1); + + if (!r_v->v.refcount) { + r_v->k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&r_v->k, 0); + } +out: + ret = k.k->p.offset - idx; +err: + bch2_trans_iter_put(trans, iter); + return ret; +} + +static int bch2_trans_mark_reflink_p(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, unsigned offset, s64 sectors, unsigned flags) { + u64 idx = le64_to_cpu(p.v->idx) + offset; + s64 ret = 0; + + sectors = abs(sectors); + BUG_ON(offset + sectors > p.k->size); + + while (sectors) { + ret = __bch2_trans_mark_reflink_p(trans, p, idx, sectors, flags); + if (ret < 0) + break; + + idx += ret; + sectors = max_t(s64, 0LL, sectors - ret); + ret = 0; + } + + return ret; +} + +int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, + unsigned offset, s64 sectors, unsigned flags) +{ struct replicas_delta_list *d; struct bch_fs *c = trans->c; @@ -1558,11 +1700,12 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, ? c->opts.btree_node_size : -c->opts.btree_node_size; - return bch2_trans_mark_extent(trans, k, sectors, - BCH_DATA_BTREE); + return bch2_trans_mark_extent(trans, k, offset, sectors, + flags, BCH_DATA_BTREE); case KEY_TYPE_extent: - return bch2_trans_mark_extent(trans, k, sectors, - BCH_DATA_USER); + case KEY_TYPE_reflink_v: + return bch2_trans_mark_extent(trans, k, offset, sectors, + flags, BCH_DATA_USER); case KEY_TYPE_inode: d = replicas_deltas_realloc(trans, 0); @@ -1584,6 +1727,10 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, d->fs_usage.persistent_reserved[replicas - 1] += sectors; return 0; } + case KEY_TYPE_reflink_p: + return bch2_trans_mark_reflink_p(trans, + bkey_s_c_to_reflink_p(k), + offset, sectors, flags); default: return 0; } @@ -1601,19 +1748,21 @@ int bch2_trans_mark_update(struct btree_trans *trans, if (!btree_node_type_needs_gc(iter->btree_id)) return 0; - ret = bch2_trans_mark_key(trans, - bkey_i_to_s_c(insert), - bpos_min(insert->k.p, b->key.k.p).offset - - bkey_start_offset(&insert->k), - BCH_BUCKET_MARK_INSERT); + ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(insert), + 0, insert->k.size, BCH_BUCKET_MARK_INSERT); if (ret) return ret; + if (unlikely(trans->flags & BTREE_INSERT_NOMARK_OVERWRITES)) + return 0; + while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, KEY_TYPE_discard))) { struct bkey unpacked; struct bkey_s_c k; + unsigned offset = 0; s64 sectors = 0; + unsigned flags = BCH_BUCKET_MARK_OVERWRITE; k = bkey_disassemble(b, _k, &unpacked); @@ -1625,35 +1774,32 @@ int bch2_trans_mark_update(struct btree_trans *trans, if (btree_node_is_extents(b)) { switch (bch2_extent_overlap(&insert->k, k.k)) { case BCH_EXTENT_OVERLAP_ALL: + offset = 0; sectors = -((s64) k.k->size); break; case BCH_EXTENT_OVERLAP_BACK: + offset = bkey_start_offset(&insert->k) - + bkey_start_offset(k.k); sectors = bkey_start_offset(&insert->k) - k.k->p.offset; break; case BCH_EXTENT_OVERLAP_FRONT: + offset = 0; sectors = bkey_start_offset(k.k) - insert->k.p.offset; break; case BCH_EXTENT_OVERLAP_MIDDLE: - sectors = k.k->p.offset - insert->k.p.offset; - BUG_ON(sectors <= 0); - - ret = bch2_trans_mark_key(trans, k, sectors, - BCH_BUCKET_MARK_INSERT); - if (ret) - return ret; - - sectors = bkey_start_offset(&insert->k) - - k.k->p.offset; + offset = bkey_start_offset(&insert->k) - + bkey_start_offset(k.k); + sectors = -((s64) insert->k.size); + flags |= BCH_BUCKET_MARK_OVERWRITE_SPLIT; break; } BUG_ON(sectors >= 0); } - ret = bch2_trans_mark_key(trans, k, sectors, - BCH_BUCKET_MARK_OVERWRITE); + ret = 
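bch2_trans_mark_reflink_p() walks the indirect-extent space in whatever chunks the REFLINK btree hands back: each __bch2_trans_mark_reflink_p() call adjusts one reflink_v's refcount (deleting the key when it drops to zero) and returns how many sectors past `idx` that extent covered, and the caller advances by the return value until the range is consumed. The loop skeleton, with a toy layout of fixed 16-sector indirect extents standing in for the btree:

    #include <stdio.h>

    /* Adjust one indirect extent's refcount; return sectors covered.
     * A real implementation looks the extent up by idx. */
    static long mark_one(unsigned long idx)
    {
            unsigned long extent_end = (idx / 16 + 1) * 16;

            printf("refcount +/- on extent ending at %lu\n", extent_end);
            return (long) (extent_end - idx);
    }

    int main(void)
    {
            unsigned long idx = 10;   /* p.v->idx + offset */
            long sectors = 40, ret;

            while (sectors) {
                    ret = mark_one(idx);
                    if (ret < 0)
                            break;

                    idx += ret;
                    sectors = sectors - ret > 0 ? sectors - ret : 0;
            }
            return 0;
    }

Clamping `sectors` at zero rather than asserting matters because the last indirect extent may extend past the range the reflink pointer references.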
bch2_trans_mark_key(trans, k, offset, sectors, flags); if (ret) return ret; @@ -1761,7 +1907,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) { struct bucket_array *buckets = NULL, *old_buckets = NULL; unsigned long *buckets_nouse = NULL; - unsigned long *buckets_written = NULL; alloc_fifo free[RESERVE_NR]; alloc_fifo free_inc; alloc_heap alloc_heap; @@ -1790,9 +1935,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * sizeof(unsigned long), GFP_KERNEL|__GFP_ZERO)) || - !(buckets_written = kvpmalloc(BITS_TO_LONGS(nbuckets) * - sizeof(unsigned long), - GFP_KERNEL|__GFP_ZERO)) || !init_fifo(&free[RESERVE_BTREE], btree_reserve, GFP_KERNEL) || !init_fifo(&free[RESERVE_MOVINGGC], copygc_reserve, GFP_KERNEL) || @@ -1824,16 +1966,12 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) memcpy(buckets_nouse, ca->buckets_nouse, BITS_TO_LONGS(n) * sizeof(unsigned long)); - memcpy(buckets_written, - ca->buckets_written, - BITS_TO_LONGS(n) * sizeof(unsigned long)); } rcu_assign_pointer(ca->buckets[0], buckets); buckets = old_buckets; swap(ca->buckets_nouse, buckets_nouse); - swap(ca->buckets_written, buckets_written); if (resize) percpu_up_write(&c->mark_lock); @@ -1873,8 +2011,6 @@ err: free_fifo(&free[i]); kvpfree(buckets_nouse, BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); - kvpfree(buckets_written, - BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); if (buckets) call_rcu(&old_buckets->rcu, buckets_free_rcu); @@ -1890,8 +2026,6 @@ void bch2_dev_buckets_free(struct bch_dev *ca) free_fifo(&ca->free_inc); for (i = 0; i < RESERVE_NR; i++) free_fifo(&ca->free[i]); - kvpfree(ca->buckets_written, - BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); kvpfree(ca->buckets_nouse, BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); kvpfree(rcu_dereference_protected(ca->buckets[0], 1), diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 5ab6f3d34137..a4bab66d8d17 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -94,6 +94,15 @@ static inline struct bucket *PTR_BUCKET(struct bch_dev *ca, return __bucket(ca, PTR_BUCKET_NR(ca, ptr), gc); } +static inline enum bch_data_type ptr_data_type(const struct bkey *k, + const struct bch_extent_ptr *ptr) +{ + if (k->type == KEY_TYPE_btree_ptr) + return BCH_DATA_BTREE; + + return ptr->cached ? 
BCH_DATA_CACHED : BCH_DATA_USER; +} + static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca, const struct bch_extent_ptr *ptr) { @@ -251,14 +260,15 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, #define BCH_BUCKET_MARK_INSERT (1 << 0) #define BCH_BUCKET_MARK_OVERWRITE (1 << 1) -#define BCH_BUCKET_MARK_BUCKET_INVALIDATE (1 << 2) -#define BCH_BUCKET_MARK_GC (1 << 3) -#define BCH_BUCKET_MARK_ALLOC_READ (1 << 4) -#define BCH_BUCKET_MARK_NOATOMIC (1 << 5) +#define BCH_BUCKET_MARK_OVERWRITE_SPLIT (1 << 2) +#define BCH_BUCKET_MARK_BUCKET_INVALIDATE (1 << 3) +#define BCH_BUCKET_MARK_GC (1 << 4) +#define BCH_BUCKET_MARK_ALLOC_READ (1 << 5) +#define BCH_BUCKET_MARK_NOATOMIC (1 << 6) -int bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c, s64, +int bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c, unsigned, s64, struct bch_fs_usage *, u64, unsigned); -int bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, +int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, s64, struct bch_fs_usage *, u64, unsigned); int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, struct disk_reservation *, unsigned); @@ -272,7 +282,8 @@ int bch2_mark_update(struct btree_trans *, struct btree_insert_entry *, void bch2_replicas_delta_list_apply(struct bch_fs *, struct bch_fs_usage *, struct replicas_delta_list *); -int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, s64, unsigned); +int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, + unsigned, s64, unsigned); int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter, struct bkey_i *insert); diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index e51d297976be..94bd9da34847 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -15,7 +15,6 @@ struct bucket_mark { u8 gen; u8 data_type:3, owned_by_allocator:1, - dirty:1, journal_seq_valid:1, stripe:1; u16 dirty_sectors; diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 65b9714a1e58..e55aa98cf9ee 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -10,7 +10,7 @@ #include <linux/random.h> #include <linux/scatterlist.h> #include <crypto/algapi.h> -#include <crypto/chacha20.h> +#include <crypto/chacha.h> #include <crypto/hash.h> #include <crypto/poly1305.h> #include <keys/user-type.h> @@ -61,27 +61,27 @@ static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t return crc32c(crc, data, len); case BCH_CSUM_CRC64_NONZERO: case BCH_CSUM_CRC64: - return bch2_crc64_update(crc, data, len); + return crc64_be(crc, data, len); default: BUG(); } } -static inline void do_encrypt_sg(struct crypto_skcipher *tfm, +static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm, struct nonce nonce, struct scatterlist *sg, size_t len) { - SKCIPHER_REQUEST_ON_STACK(req, tfm); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); int ret; - skcipher_request_set_tfm(req, tfm); + skcipher_request_set_sync_tfm(req, tfm); skcipher_request_set_crypt(req, sg, sg, len, nonce.d); ret = crypto_skcipher_encrypt(req); BUG_ON(ret); } -static inline void do_encrypt(struct crypto_skcipher *tfm, +static inline void do_encrypt(struct crypto_sync_skcipher *tfm, struct nonce nonce, void *buf, size_t len) { @@ -94,8 +94,8 @@ static inline void do_encrypt(struct crypto_skcipher *tfm, int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, void *buf, size_t len) { - struct crypto_skcipher *chacha20 = - crypto_alloc_skcipher("chacha20", 0, 0); + struct 
crypto_sync_skcipher *chacha20 = + crypto_alloc_sync_skcipher("chacha20", 0, 0); int ret; if (!chacha20) { @@ -103,7 +103,8 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, return PTR_ERR(chacha20); } - ret = crypto_skcipher_setkey(chacha20, (void *) key, sizeof(*key)); + ret = crypto_skcipher_setkey(&chacha20->base, + (void *) key, sizeof(*key)); if (ret) { pr_err("crypto_skcipher_setkey() error: %i", ret); goto err; @@ -111,7 +112,7 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, do_encrypt(chacha20, nonce, buf, len); err: - crypto_free_skcipher(chacha20); + crypto_free_sync_skcipher(chacha20); return ret; } @@ -199,7 +200,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, kunmap_atomic(p); } #else - __bio_for_each_contig_segment(bv, bio, *iter, *iter) + __bio_for_each_bvec(bv, bio, *iter, *iter) crc = bch2_checksum_update(type, crc, page_address(bv.bv_page) + bv.bv_offset, bv.bv_len); @@ -224,7 +225,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, kunmap_atomic(p); } #else - __bio_for_each_contig_segment(bv, bio, *iter, *iter) + __bio_for_each_bvec(bv, bio, *iter, *iter) crypto_shash_update(desc, page_address(bv.bv_page) + bv.bv_offset, bv.bv_len); @@ -462,7 +463,7 @@ err: static int bch2_alloc_ciphers(struct bch_fs *c) { if (!c->chacha20) - c->chacha20 = crypto_alloc_skcipher("chacha20", 0, 0); + c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0); if (IS_ERR(c->chacha20)) { bch_err(c, "error requesting chacha20 module: %li", PTR_ERR(c->chacha20)); @@ -545,7 +546,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed) goto err; } - ret = crypto_skcipher_setkey(c->chacha20, + ret = crypto_skcipher_setkey(&c->chacha20->base, (void *) &key.key, sizeof(key.key)); if (ret) goto err; @@ -573,7 +574,7 @@ void bch2_fs_encryption_exit(struct bch_fs *c) if (!IS_ERR_OR_NULL(c->poly1305)) crypto_free_shash(c->poly1305); if (!IS_ERR_OR_NULL(c->chacha20)) - crypto_free_skcipher(c->chacha20); + crypto_free_sync_skcipher(c->chacha20); if (!IS_ERR_OR_NULL(c->sha256)) crypto_free_shash(c->sha256); } @@ -605,7 +606,7 @@ int bch2_fs_encryption_init(struct bch_fs *c) if (ret) goto out; - ret = crypto_skcipher_setkey(c->chacha20, + ret = crypto_skcipher_setkey(&c->chacha20->base, (void *) &key.key, sizeof(key.key)); if (ret) goto out; diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index 657679f43b02..b84e81bac8ff 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -7,7 +7,7 @@ #include "super-io.h" #include <linux/crc64.h> -#include <crypto/chacha20.h> +#include <crypto/chacha.h> static inline bool bch2_checksum_mergeable(unsigned type) { @@ -25,11 +25,6 @@ static inline bool bch2_checksum_mergeable(unsigned type) struct bch_csum bch2_checksum_merge(unsigned, struct bch_csum, struct bch_csum, size_t); -static inline u64 bch2_crc64_update(u64 crc, const void *p, size_t len) -{ - return crc64_be(crc, p, len); -} - #define BCH_NONCE_EXTENT cpu_to_le32(1 << 28) #define BCH_NONCE_BTREE cpu_to_le32(2 << 28) #define BCH_NONCE_JOURNAL cpu_to_le32(3 << 28) @@ -143,9 +138,9 @@ static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r) /* for skipping ahead and encrypting/decrypting at an offset: */ static inline struct nonce nonce_add(struct nonce nonce, unsigned offset) { - EBUG_ON(offset & (CHACHA20_BLOCK_SIZE - 1)); + EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1)); - le32_add_cpu(&nonce.d[0], offset / CHACHA20_BLOCK_SIZE); + le32_add_cpu(&nonce.d[0], offset / 
CHACHA_BLOCK_SIZE); return nonce; } diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index a7264d802ed7..3787390da47f 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -66,7 +66,7 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max); #ifndef CONFIG_HIGHMEM - __bio_for_each_contig_segment(bv, bio, iter, start) { + __bio_for_each_bvec(bv, bio, iter, start) { if (bv.bv_len == start.bi_size) return (struct bbuf) { .b = page_address(bv.bv_page) + bv.bv_offset, @@ -241,10 +241,10 @@ int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, } /* - * might have to free existing pages and retry allocation from mempool - - * do this _after_ decompressing: + * XXX: don't have a good way to assert that the bio was allocated with + * enough space, we depend on bch2_move_extent doing the right thing */ - bch2_bio_alloc_more_pages_pool(c, bio, crc->live_size << 9); + bio->bi_iter.bi_size = crc->live_size << 9; memcpy_to_bio(bio, bio->bi_iter, data.b + (crc->offset << 9)); diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index c758982bc1af..69b123bad83b 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -70,8 +70,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) bio_set_dev(bio, ca->disk_sb.bdev); bio->bi_opf = REQ_OP_READ|REQ_META; bio->bi_iter.bi_sector = pick.ptr.offset; - bio->bi_iter.bi_size = btree_bytes(c); - bch2_bio_map(bio, n_sorted); + bch2_bio_map(bio, n_sorted, btree_bytes(c)); submit_bio_wait(bio); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index dba861111a8d..be2eca0fcdf7 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -162,19 +162,20 @@ static int extent_matches_stripe(struct bch_fs *c, struct bch_stripe *v, struct bkey_s_c k) { - struct bkey_s_c_extent e; - const struct bch_extent_ptr *ptr; - int idx; - if (!bkey_extent_is_data(k.k)) - return -1; - - e = bkey_s_c_to_extent(k); + switch (k.k->type) { + case KEY_TYPE_extent: { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const struct bch_extent_ptr *ptr; + int idx; - extent_for_each_ptr(e, ptr) { - idx = ptr_matches_stripe(c, v, ptr); - if (idx >= 0) - return idx; + extent_for_each_ptr(e, ptr) { + idx = ptr_matches_stripe(c, v, ptr); + if (idx >= 0) + return idx; + } + break; + } } return -1; @@ -182,19 +183,20 @@ static int extent_matches_stripe(struct bch_fs *c, static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) { - struct bkey_s_c_extent e; - const union bch_extent_entry *entry; - - if (!bkey_extent_is_data(k.k)) - return false; + switch (k.k->type) { + case KEY_TYPE_extent: { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const union bch_extent_entry *entry; - e = bkey_s_c_to_extent(k); + extent_for_each_entry(e, entry) + if (extent_entry_type(entry) == + BCH_EXTENT_ENTRY_stripe_ptr && + entry->stripe_ptr.idx == idx) + return true; - extent_for_each_entry(e, entry) - if (extent_entry_type(entry) == - BCH_EXTENT_ENTRY_stripe_ptr && - entry->stripe_ptr.idx == idx) - return true; + break; + } + } return false; } @@ -399,11 +401,10 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, bio_set_op_attrs(&ec_bio->bio, rw, 0); ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9); - ec_bio->bio.bi_iter.bi_size = b; ec_bio->bio.bi_end_io = ec_block_endio; ec_bio->bio.bi_private = cl; - bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset); + bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b); closure_get(cl); 
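Most of the checksum.c/checksum.h churn above is mechanical (crypto/chacha20.h became crypto/chacha.h upstream, and synchronous users move to crypto_sync_skcipher), but nonce_add() encodes a real property worth spelling out: ChaCha is a counter-mode stream cipher with 64-byte blocks, so encrypting or decrypting at a byte offset only requires bumping the 32-bit counter word by offset / CHACHA_BLOCK_SIZE, provided the offset is block-aligned. A standalone version:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define CHACHA_BLOCK_SIZE 64

    struct nonce { uint32_t d[4]; };

    /* Seek within the keystream by advancing the block counter;
     * mirrors nonce_add(), which uses le32_add_cpu on word 0. */
    static struct nonce nonce_add(struct nonce n, unsigned offset)
    {
            assert(!(offset & (CHACHA_BLOCK_SIZE - 1)));
            n.d[0] += offset / CHACHA_BLOCK_SIZE;
            return n;
    }

    int main(void)
    {
            struct nonce n = { { 0, 1, 2, 3 } };

            n = nonce_add(n, 4 * CHACHA_BLOCK_SIZE);
            printf("counter word now %u\n", n.d[0]);
            return 0;
    }

As the original comment says, this is for skipping ahead and encrypting or decrypting at an offset, without streaming from the start of the encrypted region.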
percpu_ref_get(&ca->io_ref); @@ -576,7 +577,8 @@ static ssize_t stripe_idx_to_delete(struct bch_fs *c) { ec_stripes_heap *h = &c->ec_stripes_heap; - return h->data[0].blocks_nonempty == 0 ? h->data[0].idx : -1; + return h->used && h->data[0].blocks_nonempty == 0 + ? h->data[0].idx : -1; } static inline int ec_stripes_heap_cmp(ec_stripes_heap *h, @@ -627,7 +629,8 @@ void bch2_stripes_heap_update(struct bch_fs *c, bch2_stripes_heap_insert(c, m, idx); } - if (stripe_idx_to_delete(c) >= 0) + if (stripe_idx_to_delete(c) >= 0 && + !percpu_ref_is_dying(&c->writes)) schedule_work(&c->ec_stripe_delete_work); } @@ -685,7 +688,8 @@ static void ec_stripe_delete_work(struct work_struct *work) if (idx < 0) break; - ec_stripe_delete(c, idx); + if (ec_stripe_delete(c, idx)) + break; } mutex_unlock(&c->ec_stripe_create_lock); @@ -700,26 +704,34 @@ static int ec_stripe_bkey_insert(struct bch_fs *c, struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; + struct bpos start_pos = POS(0, c->ec_stripe_hint); int ret; bch2_trans_init(&trans, c, 0, 0); retry: bch2_trans_begin(&trans); - /* XXX: start pos hint */ - for_each_btree_key(&trans, iter, BTREE_ID_EC, POS_MIN, + for_each_btree_key(&trans, iter, BTREE_ID_EC, start_pos, BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { - if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) + if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) { + if (start_pos.offset) { + start_pos = POS_MIN; + bch2_btree_iter_set_pos(iter, start_pos); + continue; + } + + ret = -ENOSPC; break; + } if (bkey_deleted(k.k)) goto found_slot; } - if (!ret) - ret = -ENOSPC; goto err; found_slot: + start_pos = iter->pos; + ret = ec_stripe_mem_alloc(c, iter); if (ret) goto err; @@ -734,6 +746,8 @@ found_slot: err: if (ret == -EINTR) goto retry; + + c->ec_stripe_hint = ret ? 
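ec_stripe_bkey_insert() replaces the old "XXX: start pos hint" with a working allocation cursor: scanning starts at c->ec_stripe_hint, wraps around to POS_MIN once if the tail of the keyspace is full, and only then reports -ENOSPC; on success the hint is advanced past the slot just used, per the assignment completed just below (`ret ? start_pos.offset : start_pos.offset + 1`). The search pattern in miniature (hypothetical fixed slot array in place of the EC btree):

    #include <stdio.h>

    #define NSLOTS 8

    static int find_free(const int used[NSLOTS], unsigned hint)
    {
            unsigned i = hint;
            int wrapped = 0;

            for (;;) {
                    if (i >= NSLOTS) {
                            if (wrapped++)
                                    return -1;      /* like -ENOSPC */
                            i = 0;                  /* wrap to start once */
                            continue;
                    }
                    if (!used[i])
                            return (int) i;
                    i++;
            }
    }

    int main(void)
    {
            int used[NSLOTS] = { 1, 1, 0, 1, 1, 1, 1, 1 };

            /* hint points past the free slot; the search wraps, finds 2 */
            printf("slot %d\n", find_free(used, 5));
            return 0;
    }

Keeping the hint in place on error means a failed insert retries the same slot instead of leaking keyspace.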
start_pos.offset : start_pos.offset + 1; bch2_trans_exit(&trans); return ret; @@ -1159,12 +1173,8 @@ void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) struct ec_stripe_new *s = NULL; mutex_lock(&h->lock); - bch2_open_buckets_stop_dev(c, ca, - &h->blocks, - BCH_DATA_USER); - bch2_open_buckets_stop_dev(c, ca, - &h->parity, - BCH_DATA_USER); + bch2_open_buckets_stop_dev(c, ca, &h->blocks); + bch2_open_buckets_stop_dev(c, ca, &h->parity); if (!h->s) goto unlock; @@ -1265,10 +1275,10 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote) int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) { - struct journal_key *i; struct btree_trans trans; - struct btree_iter *iter; - struct bkey_s_c k; + struct btree_iter *btree_iter; + struct journal_iter journal_iter; + struct bkey_s_c btree_k, journal_k, k; int ret; ret = bch2_fs_ec_start(c); @@ -1277,10 +1287,41 @@ int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_EC, POS_MIN, 0, k, ret) - bch2_mark_key(c, k, 0, NULL, 0, + btree_iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN, 0); + journal_iter = bch2_journal_iter_init(journal_keys, BTREE_ID_EC); + + btree_k = bch2_btree_iter_peek(btree_iter); + journal_k = bch2_journal_iter_peek(&journal_iter); + + while (1) { + if (btree_k.k && journal_k.k) { + int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p); + + if (cmp < 0) { + k = btree_k; + btree_k = bch2_btree_iter_next(btree_iter); + } else if (cmp == 0) { + btree_k = bch2_btree_iter_next(btree_iter); + k = journal_k; + journal_k = bch2_journal_iter_next(&journal_iter); + } else { + k = journal_k; + journal_k = bch2_journal_iter_next(&journal_iter); + } + } else if (btree_k.k) { + k = btree_k; + btree_k = bch2_btree_iter_next(btree_iter); + } else if (journal_k.k) { + k = journal_k; + journal_k = bch2_journal_iter_next(&journal_iter); + } else { + break; + } + + bch2_mark_key(c, k, 0, 0, NULL, 0, BCH_BUCKET_MARK_ALLOC_READ| BCH_BUCKET_MARK_NOATOMIC); + } ret = bch2_trans_exit(&trans) ?: ret; if (ret) { @@ -1288,13 +1329,6 @@ int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) return ret; } - for_each_journal_key(*journal_keys, i) - if (i->btree_id == BTREE_ID_EC) - bch2_mark_key(c, bkey_i_to_s_c(i->k), - 0, NULL, 0, - BCH_BUCKET_MARK_ALLOC_READ| - BCH_BUCKET_MARK_NOATOMIC); - return 0; } diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 1aaff44e18cf..304ff92500be 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -4,6 +4,8 @@ #include "io.h" #include "super.h" +#define FSCK_ERR_RATELIMIT_NR 10 + bool bch2_inconsistent_error(struct bch_fs *c) { set_bit(BCH_FS_ERROR, &c->flags); @@ -97,8 +99,8 @@ enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags, found: list_move(&s->list, &c->fsck_errors); s->nr++; - suppressing = s->nr == 10; - print = s->nr <= 10; + suppressing = s->nr == FSCK_ERR_RATELIMIT_NR; + print = s->nr <= FSCK_ERR_RATELIMIT_NR; buf = s->buf; print: va_start(args, fmt); @@ -152,10 +154,9 @@ void bch2_flush_fsck_errs(struct bch_fs *c) struct fsck_err_state *s, *n; mutex_lock(&c->fsck_error_lock); - set_bit(BCH_FS_FSCK_DONE, &c->flags); list_for_each_entry_safe(s, n, &c->fsck_errors, list) { - if (s->nr > 10) + if (s->nr > FSCK_ERR_RATELIMIT_NR) bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->buf); list_del(&s->list); diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index e286048b5bf8..4b1c652cdbce 100644 --- 
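bch2_stripes_read() above drops the separate replay pass over journal keys; instead the btree iterator and the journal iterator advance together as two sorted streams, and when both hold a key at the same position the journal entry wins (the btree side is advanced and discarded). That way each stripe is marked exactly once, with its newest version. The merge skeleton over plain ints:

    #include <stdio.h>

    /* Two-way sorted merge; equal positions take the journal entry. */
    static void merge(const int *btree, int nb, const int *journal, int nj)
    {
            int i = 0, j = 0;

            while (i < nb || j < nj) {
                    if (i < nb && j < nj && btree[i] == journal[j])
                            i++;    /* same position: journal wins */

                    if (j < nj && (i >= nb || journal[j] <= btree[i]))
                            printf("journal %d\n", journal[j++]);
                    else
                            printf("btree %d\n", btree[i++]);
            }
    }

    int main(void)
    {
            int b[] = { 1, 3, 5 }, j[] = { 3, 4 };

            merge(b, 3, j, 2);      /* 1, 3 (journal), 4 (journal), 5 */
            return 0;
    }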
a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -46,7 +46,8 @@ unsigned bch2_bkey_nr_dirty_ptrs(struct bkey_s_c k) switch (k.k->type) { case KEY_TYPE_btree_ptr: - case KEY_TYPE_extent: { + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: { struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); const struct bch_extent_ptr *ptr; @@ -250,6 +251,33 @@ void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); } +const struct bch_extent_ptr * +bch2_bkey_has_device(struct bkey_s_c k, unsigned dev) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + + bkey_for_each_ptr(ptrs, ptr) + if (ptr->dev == dev) + return ptr; + + return NULL; +} + +bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + + bkey_for_each_ptr(ptrs, ptr) + if (bch2_dev_in_target(c, ptr->dev, target) && + (!ptr->cached || + !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) + return true; + + return false; +} + /* extent specific utility code */ const struct bch_extent_ptr * @@ -280,50 +308,32 @@ bch2_extent_has_group(struct bch_fs *c, struct bkey_s_c_extent e, unsigned group return NULL; } -const struct bch_extent_ptr * -bch2_extent_has_target(struct bch_fs *c, struct bkey_s_c_extent e, unsigned target) -{ - const struct bch_extent_ptr *ptr; - - extent_for_each_ptr(e, ptr) - if (bch2_dev_in_target(c, ptr->dev, target) && - (!ptr->cached || - !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) - return ptr; - - return NULL; -} - unsigned bch2_extent_is_compressed(struct bkey_s_c k) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; unsigned ret = 0; - switch (k.k->type) { - case KEY_TYPE_extent: { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - - extent_for_each_ptr_decode(e, p, entry) - if (!p.ptr.cached && - p.crc.compression_type != BCH_COMPRESSION_NONE) - ret += p.crc.compressed_size; - } - } + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (!p.ptr.cached && + p.crc.compression_type != BCH_COMPRESSION_NONE) + ret += p.crc.compressed_size; return ret; } -bool bch2_extent_matches_ptr(struct bch_fs *c, struct bkey_s_c_extent e, - struct bch_extent_ptr m, u64 offset) +bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, + struct bch_extent_ptr m, u64 offset) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; - extent_for_each_ptr_decode(e, p, entry) + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) if (p.ptr.dev == m.dev && p.ptr.gen == m.gen && - (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(e.k) == + (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) == (s64) m.offset - offset) return true; @@ -390,16 +400,17 @@ static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u, bch2_csum_type_is_encryption(n.csum_type); } -bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent e, +bool bch2_can_narrow_extent_crcs(struct bkey_s_c k, struct bch_extent_crc_unpacked n) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); struct bch_extent_crc_unpacked crc; const union bch_extent_entry *i; if (!n.csum_type) return false; - extent_for_each_crc(e, crc, i) + bkey_for_each_crc(k.k, ptrs, crc, i) if (can_narrow_crc(crc, n)) return true; @@ -415,9 +426,9 @@ bool bch2_can_narrow_extent_crcs(struct 
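The extents.c changes around this point share one theme: helpers that used to require a bkey_s_c_extent (has_target, is_compressed, matches_ptr, narrow_crcs) are rewritten over bch2_bkey_ptrs_c(), which yields the pointer list for any pointer-carrying key type, so the new KEY_TYPE_reflink_v inherits all of them without duplication. The idea as a toy tagged union (types and names illustrative):

    #include <stdio.h>

    enum ktype { KEY_EXTENT, KEY_REFLINK_V, KEY_OTHER };

    struct key {
            enum ktype type;
            const int *ptrs;     /* device ids, standing in for bch_extent_ptr */
            unsigned nr_ptrs;
    };

    /* One accessor hides the per-type layout from every helper. */
    static const int *key_ptrs(const struct key *k, unsigned *nr)
    {
            switch (k->type) {
            case KEY_EXTENT:
            case KEY_REFLINK_V:
                    *nr = k->nr_ptrs;
                    return k->ptrs;
            default:
                    *nr = 0;
                    return NULL;
            }
    }

    static int has_device(const struct key *k, int dev)
    {
            unsigned nr, i;
            const int *p = key_ptrs(k, &nr);

            for (i = 0; i < nr; i++)
                    if (p[i] == dev)
                            return 1;
            return 0;
    }

    int main(void)
    {
            int devs[] = { 0, 2 };
            struct key k = { KEY_REFLINK_V, devs, 2 };

            printf("has dev 2: %d\n", has_device(&k, 2));
            return 0;
    }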
bkey_s_c_extent e, * currently live (so that readers won't have to bounce) while we've got the * checksum we need: */ -bool bch2_extent_narrow_crcs(struct bkey_i_extent *e, - struct bch_extent_crc_unpacked n) +bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n) { + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); struct bch_extent_crc_unpacked u; struct extent_ptr_decoded p; union bch_extent_entry *i; @@ -425,7 +436,7 @@ bool bch2_extent_narrow_crcs(struct bkey_i_extent *e, /* Find a checksum entry that covers only live data: */ if (!n.csum_type) { - extent_for_each_crc(extent_i_to_s(e), u, i) + bkey_for_each_crc(&k->k, ptrs, u, i) if (!u.compression_type && u.csum_type && u.live_size == u.uncompressed_size) { @@ -437,15 +448,17 @@ bool bch2_extent_narrow_crcs(struct bkey_i_extent *e, found: BUG_ON(n.compression_type); BUG_ON(n.offset); - BUG_ON(n.live_size != e->k.size); + BUG_ON(n.live_size != k->k.size); restart_narrow_pointers: - extent_for_each_ptr_decode(extent_i_to_s(e), p, i) + ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); + + bkey_for_each_ptr_decode(&k->k, ptrs, p, i) if (can_narrow_crc(p.crc, n)) { - bch2_bkey_drop_ptr(extent_i_to_s(e).s, &i->ptr); + bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr); p.ptr.offset += p.crc.offset; p.crc = n; - bch2_extent_ptr_decoded_append(e, &p); + bch2_extent_ptr_decoded_append(k, &p); ret = true; goto restart_narrow_pointers; } @@ -659,8 +672,7 @@ const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) return bch2_bkey_ptrs_invalid(c, k); } -void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct btree *b, - struct bkey_s_c k) +void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const struct bch_extent_ptr *ptr; @@ -708,44 +720,48 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, /* Extents */ -bool __bch2_cut_front(struct bpos where, struct bkey_s k) +void __bch2_cut_front(struct bpos where, struct bkey_s k) { - u64 len = 0; + u64 sub; if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0) - return false; + return; EBUG_ON(bkey_cmp(where, k.k->p) > 0); - len = k.k->p.offset - where.offset; + sub = where.offset - bkey_start_offset(k.k); - BUG_ON(len > k.k->size); + k.k->size -= sub; - /* - * Don't readjust offset if the key size is now 0, because that could - * cause offset to point to the next bucket: - */ - if (!len) + if (!k.k->size) k.k->type = KEY_TYPE_deleted; - else if (bkey_extent_is_data(k.k)) { - struct bkey_s_extent e = bkey_s_to_extent(k); + + switch (k.k->type) { + case KEY_TYPE_deleted: + case KEY_TYPE_discard: + case KEY_TYPE_error: + case KEY_TYPE_cookie: + break; + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: { + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); union bch_extent_entry *entry; bool seen_crc = false; - extent_for_each_entry(e, entry) { + bkey_extent_entry_for_each(ptrs, entry) { switch (extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_ptr: if (!seen_crc) - entry->ptr.offset += e.k->size - len; + entry->ptr.offset += sub; break; case BCH_EXTENT_ENTRY_crc32: - entry->crc32.offset += e.k->size - len; + entry->crc32.offset += sub; break; case BCH_EXTENT_ENTRY_crc64: - entry->crc64.offset += e.k->size - len; + entry->crc64.offset += sub; break; case BCH_EXTENT_ENTRY_crc128: - entry->crc128.offset += e.k->size - len; + entry->crc128.offset += sub; break; case BCH_EXTENT_ENTRY_stripe_ptr: break; @@ -754,11 +770,20 @@ bool __bch2_cut_front(struct bpos where, struct bkey_s k) if 
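The rewritten __bch2_cut_front(), completed just below, computes one quantity, `sub`, the number of sectors trimmed from the front, and applies it uniformly: the key shrinks by sub, un-checksummed pointers and crc entry offsets advance by sub, and a reflink_p advances its indirect index by sub instead, since its data lives in the REFLINK btree. It also returns void now, as no caller used the old bool. The core bookkeeping, isolated:

    #include <stdio.h>

    struct toy_extent  { unsigned size; unsigned long ptr_offset; };
    struct toy_reflink { unsigned size; unsigned long idx; };

    /* Trim `sub` sectors from the front: whatever addresses the data
     * must advance by the same amount the size shrinks. */
    static void cut_front_extent(struct toy_extent *e, unsigned sub)
    {
            e->size -= sub;
            e->ptr_offset += sub;
    }

    static void cut_front_reflink(struct toy_reflink *r, unsigned sub)
    {
            r->size -= sub;
            r->idx += sub;          /* like le64_add_cpu(&p.v->idx, sub) */
    }

    int main(void)
    {
            struct toy_extent  e = { 16, 100 };
            struct toy_reflink r = { 16, 4096 };

            cut_front_extent(&e, 4);
            cut_front_reflink(&r, 4);
            printf("extent size=%u ptr=%lu; reflink size=%u idx=%lu\n",
                   e.size, e.ptr_offset, r.size, r.idx);
            return 0;
    }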
(extent_entry_is_crc(entry)) seen_crc = true; } - } - k.k->size = len; + break; + } + case KEY_TYPE_reflink_p: { + struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k); - return true; + le64_add_cpu(&p.v->idx, sub); + break; + } + case KEY_TYPE_reservation: + break; + default: + BUG(); + } } bool bch2_cut_back(struct bpos where, struct bkey *k) @@ -772,8 +797,6 @@ bool bch2_cut_back(struct bpos where, struct bkey *k) len = where.offset - bkey_start_offset(k); - BUG_ON(len > k->size); - k->p = where; k->size = len; @@ -783,19 +806,6 @@ bool bch2_cut_back(struct bpos where, struct bkey *k) return true; } -/** - * bch_key_resize - adjust size of @k - * - * bkey_start_offset(k) will be preserved, modifies where the extent ends - */ -void bch2_key_resize(struct bkey *k, - unsigned new_size) -{ - k->p.offset -= k->size; - k->p.offset += new_size; - k->size = new_size; -} - static bool extent_i_save(struct btree *b, struct bkey_packed *dst, struct bkey_i *src) { @@ -866,13 +876,6 @@ static void verify_extent_nonoverlapping(struct bch_fs *c, #endif } -static void verify_modified_extent(struct btree_iter *iter, - struct bkey_packed *k) -{ - bch2_btree_iter_verify(iter, iter->l[0].b); - bch2_verify_insert_pos(iter->l[0].b, k, k, k->u64s); -} - static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, struct bkey_i *insert) { @@ -885,6 +888,9 @@ static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); verify_extent_nonoverlapping(c, l->b, &l->iter, insert); + if (debug_check_bkeys(c)) + bch2_bkey_debugcheck(c, l->b, bkey_i_to_s_c(insert)); + node_iter = l->iter; k = bch2_btree_node_iter_prev_filter(&node_iter, l->b, KEY_TYPE_discard); if (k && !bkey_written(l->b, k) && @@ -897,11 +903,20 @@ static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, bch2_extent_merge_inline(c, iter, bkey_to_packed(insert), k, false)) return; + /* + * may have skipped past some deleted extents greater than the insert + * key, before we got to a non deleted extent and knew we could bail out + * rewind the iterator a bit if necessary: + */ + node_iter = l->iter; + while ((k = bch2_btree_node_iter_prev_all(&node_iter, l->b)) && + bkey_cmp_left_packed(l->b, k, &insert->k.p) > 0) + l->iter = node_iter; + k = bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b)); bch2_bset_insert(l->b, &l->iter, k, insert, 0); bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s); - bch2_btree_iter_verify(iter, l->b); } static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) @@ -921,47 +936,132 @@ static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) return ret; } -static inline struct bpos -bch2_extent_atomic_end(struct bkey_i *insert, struct btree_iter *iter) +static int __bch2_extent_atomic_end(struct btree_trans *trans, + struct bkey_s_c k, + unsigned offset, + struct bpos *end, + unsigned *nr_iters, + unsigned max_iters) +{ + int ret = 0; + + switch (k.k->type) { + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: + *nr_iters += bch2_bkey_nr_alloc_ptrs(k); + + if (*nr_iters >= max_iters) { + *end = bpos_min(*end, k.k->p); + return 0; + } + + break; + case KEY_TYPE_reflink_p: { + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + u64 idx = le64_to_cpu(p.v->idx); + unsigned sectors = end->offset - bkey_start_offset(p.k); + struct btree_iter *iter; + struct bkey_s_c r_k; + + for_each_btree_key(trans, iter, + BTREE_ID_REFLINK, POS(0, idx + offset), + BTREE_ITER_SLOTS, r_k, ret) { + if 
(bkey_cmp(bkey_start_pos(r_k.k), + POS(0, idx + sectors)) >= 0) + break; + + *nr_iters += 1; + if (*nr_iters >= max_iters) { + struct bpos pos = bkey_start_pos(k.k); + pos.offset += r_k.k->p.offset - idx; + + *end = bpos_min(*end, pos); + break; + } + } + + bch2_trans_iter_put(trans, iter); + break; + } + } + + return ret; +} + +int bch2_extent_atomic_end(struct btree_iter *iter, + struct bkey_i *insert, + struct bpos *end) { + struct btree_trans *trans = iter->trans; struct btree *b = iter->l[0].b; struct btree_node_iter node_iter = iter->l[0].iter; struct bkey_packed *_k; - unsigned nr_alloc_ptrs = + unsigned nr_iters = bch2_bkey_nr_alloc_ptrs(bkey_i_to_s_c(insert)); + int ret = 0; BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0); - while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, + *end = bpos_min(insert->k.p, b->key.k.p); + + ret = __bch2_extent_atomic_end(trans, bkey_i_to_s_c(insert), + 0, end, &nr_iters, 10); + if (ret) + return ret; + + while (nr_iters < 20 && + (_k = bch2_btree_node_iter_peek_filter(&node_iter, b, KEY_TYPE_discard))) { struct bkey unpacked; struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); + unsigned offset = 0; - if (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) <= 0) + if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0) break; - nr_alloc_ptrs += bch2_bkey_nr_alloc_ptrs(k); + if (bkey_cmp(bkey_start_pos(&insert->k), + bkey_start_pos(k.k)) > 0) + offset = bkey_start_offset(&insert->k) - + bkey_start_offset(k.k); - if (nr_alloc_ptrs > 20) { - BUG_ON(bkey_cmp(k.k->p, bkey_start_pos(&insert->k)) <= 0); - return bpos_min(insert->k.p, k.k->p); - } + ret = __bch2_extent_atomic_end(trans, k, offset, + end, &nr_iters, 20); + if (ret) + return ret; + + if (nr_iters >= 20) + break; bch2_btree_node_iter_advance(&node_iter, b); } - return bpos_min(insert->k.p, b->key.k.p); + return 0; } -void bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) +int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) { - bch2_cut_back(bch2_extent_atomic_end(k, iter), &k->k); + struct bpos end; + int ret; + + ret = bch2_extent_atomic_end(iter, k, &end); + if (ret) + return ret; + + bch2_cut_back(end, &k->k); + return 0; } -bool bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) +int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) { - return !bkey_cmp(bch2_extent_atomic_end(k, iter), k->k.p); + struct bpos end; + int ret; + + ret = bch2_extent_atomic_end(iter, k, &end); + if (ret) + return ret; + + return !bkey_cmp(end, k->k.p); } enum btree_insert_ret @@ -1031,15 +1131,16 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, case BCH_EXTENT_OVERLAP_FRONT: /* insert overlaps with start of k: */ __bch2_cut_front(insert->k.p, k); - BUG_ON(bkey_deleted(k.k)); + EBUG_ON(bkey_deleted(k.k)); extent_save(l->b, _k, k.k); - verify_modified_extent(iter, _k); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, + _k, _k->u64s, _k->u64s); break; case BCH_EXTENT_OVERLAP_BACK: /* insert overlaps with end of k: */ bch2_cut_back(bkey_start_pos(&insert->k), k.k); - BUG_ON(bkey_deleted(k.k)); + EBUG_ON(bkey_deleted(k.k)); extent_save(l->b, _k, k.k); /* @@ -1050,7 +1151,6 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, bch2_bset_fix_invalidated_key(l->b, _k); bch2_btree_node_iter_fix(iter, l->b, &l->iter, _k, _k->u64s, _k->u64s); - verify_modified_extent(iter, _k); break; case BCH_EXTENT_OVERLAP_ALL: { @@ -1067,12 +1167,10 @@ extent_squash(struct bch_fs *c, struct 
btree_iter *iter, bch2_bset_delete(l->b, _k, _k->u64s); bch2_btree_node_iter_fix(iter, l->b, &l->iter, _k, u64s, 0); - bch2_btree_iter_verify(iter, l->b); } else { extent_save(l->b, _k, k.k); bch2_btree_node_iter_fix(iter, l->b, &l->iter, _k, _k->u64s, _k->u64s); - verify_modified_extent(iter, _k); } break; @@ -1102,7 +1200,8 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, __bch2_cut_front(insert->k.p, k); BUG_ON(bkey_deleted(k.k)); extent_save(l->b, _k, k.k); - verify_modified_extent(iter, _k); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, + _k, _k->u64s, _k->u64s); extent_bset_insert(c, iter, &split.k); break; @@ -1159,6 +1258,8 @@ static void __bch2_insert_fixup_extent(struct bch_fs *c, btree_account_key_drop(l->b, _k); _k->type = KEY_TYPE_discard; reserve_whiteout(l->b, _k); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, + _k, _k->u64s, _k->u64s); } break; } @@ -1185,19 +1286,6 @@ next: overlap == BCH_EXTENT_OVERLAP_MIDDLE) break; } - - /* - * may have skipped past some deleted extents greater than the insert - * key, before we got to a non deleted extent and knew we could bail out - * rewind the iterator a bit if necessary: - */ - { - struct btree_node_iter node_iter = l->iter; - - while ((_k = bch2_btree_node_iter_prev_all(&node_iter, l->b)) && - bkey_cmp_left_packed(l->b, _k, &insert->k.p) > 0) - l->iter = node_iter; - } } /** @@ -1265,12 +1353,7 @@ void bch2_insert_fixup_extent(struct btree_trans *trans, if (s.deleting) tmp.k.k.type = KEY_TYPE_discard; -#if 0 - /* disabled due to lock recursion - mark_lock: */ - if (debug_check_bkeys(c)) - bch2_bkey_debugcheck(c, iter->l[0].b, - bkey_i_to_s_c(&tmp.k)); -#endif + EBUG_ON(bkey_deleted(&tmp.k.k) || !tmp.k.k.size); extent_bset_insert(c, iter, &tmp.k); @@ -1295,8 +1378,7 @@ const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) return bch2_bkey_ptrs_invalid(c, k); } -void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, - struct bkey_s_c k) +void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k) { struct bkey_s_c_extent e = bkey_s_c_to_extent(k); const union bch_extent_entry *entry; @@ -1312,11 +1394,13 @@ void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, * going to get overwritten during replay) */ - bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && - !bch2_bkey_replicas_marked(c, e.s_c, false), c, - "extent key bad (replicas not marked in superblock):\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf)); - + if (percpu_down_read_trylock(&c->mark_lock)) { + bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && + !bch2_bkey_replicas_marked_locked(c, e.s_c, false), c, + "extent key bad (replicas not marked in superblock):\n%s", + (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf)); + percpu_up_read(&c->mark_lock); + } /* * If journal replay hasn't finished, we might be seeing keys * that will be overwritten by the time journal replay is done: @@ -1394,9 +1478,12 @@ static void bch2_extent_crc_pack(union bch_extent_crc *dst, #undef set_common_fields } -static void bch2_extent_crc_init(union bch_extent_crc *crc, - struct bch_extent_crc_unpacked new) +static void bch2_extent_crc_append(struct bkey_i *k, + struct bch_extent_crc_unpacked new) { + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); + union bch_extent_crc *crc = (void *) ptrs.end; + if (bch_crc_bytes[new.csum_type] <= 4 && new.uncompressed_size - 1 <= CRC32_SIZE_MAX && new.nonce <= CRC32_NONCE_MAX) @@ -1413,54 +1500,53 @@ static void bch2_extent_crc_init(union bch_extent_crc 
*crc, BUG(); bch2_extent_crc_pack(crc, new); -} -void bch2_extent_crc_append(struct bkey_i_extent *e, - struct bch_extent_crc_unpacked new) -{ - bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), new); - __extent_entry_push(e); + k->k.u64s += extent_entry_u64s(ptrs.end); + + EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX); } -static inline void __extent_entry_insert(struct bkey_i_extent *e, +static inline void __extent_entry_insert(struct bkey_i *k, union bch_extent_entry *dst, union bch_extent_entry *new) { - union bch_extent_entry *end = extent_entry_last(extent_i_to_s(e)); + union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); memmove_u64s_up((u64 *) dst + extent_entry_u64s(new), dst, (u64 *) end - (u64 *) dst); - e->k.u64s += extent_entry_u64s(new); + k->k.u64s += extent_entry_u64s(new); memcpy(dst, new, extent_entry_bytes(new)); } -void bch2_extent_ptr_decoded_append(struct bkey_i_extent *e, +void bch2_extent_ptr_decoded_append(struct bkey_i *k, struct extent_ptr_decoded *p) { - struct bch_extent_crc_unpacked crc = bch2_extent_crc_unpack(&e->k, NULL); + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); + struct bch_extent_crc_unpacked crc = + bch2_extent_crc_unpack(&k->k, NULL); union bch_extent_entry *pos; unsigned i; if (!bch2_crc_unpacked_cmp(crc, p->crc)) { - pos = e->v.start; + pos = ptrs.start; goto found; } - extent_for_each_crc(extent_i_to_s(e), crc, pos) + bkey_for_each_crc(&k->k, ptrs, crc, pos) if (!bch2_crc_unpacked_cmp(crc, p->crc)) { pos = extent_entry_next(pos); goto found; } - bch2_extent_crc_append(e, p->crc); - pos = extent_entry_last(extent_i_to_s(e)); + bch2_extent_crc_append(k, p->crc); + pos = bkey_val_end(bkey_i_to_s(k)); found: p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; - __extent_entry_insert(e, pos, to_entry(&p->ptr)); + __extent_entry_insert(k, pos, to_entry(&p->ptr)); for (i = 0; i < p->ec_nr; i++) { p->ec[i].type = 1 << BCH_EXTENT_ENTRY_stripe_ptr; - __extent_entry_insert(e, pos, to_entry(&p->ec[i])); + __extent_entry_insert(k, pos, to_entry(&p->ec[i])); } } @@ -1482,22 +1568,22 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) /* will only happen if all pointers were cached: */ if (!bkey_val_u64s(k.k)) - k.k->type = KEY_TYPE_deleted; + k.k->type = KEY_TYPE_discard; - return false; + return bkey_whiteout(k.k); } -void bch2_extent_mark_replicas_cached(struct bch_fs *c, - struct bkey_s_extent e, - unsigned target, - unsigned nr_desired_replicas) +void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k, + unsigned target, + unsigned nr_desired_replicas) { + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); union bch_extent_entry *entry; struct extent_ptr_decoded p; - int extra = bch2_bkey_durability(c, e.s_c) - nr_desired_replicas; + int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas; if (target && extra > 0) - extent_for_each_ptr_decode(e, p, entry) { + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { int n = bch2_extent_ptr_durability(c, p); if (n && n <= extra && @@ -1508,7 +1594,7 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c, } if (extra > 0) - extent_for_each_ptr_decode(e, p, entry) { + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { int n = bch2_extent_ptr_durability(c, p); if (n && n <= extra) { @@ -1666,6 +1752,12 @@ static bool bch2_extent_merge_inline(struct bch_fs *c, if (ret == BCH_MERGE_NOMERGE) return false; + if (debug_check_bkeys(c)) + bch2_bkey_debugcheck(c, b, bkey_i_to_s_c(&li.k)); + if (debug_check_bkeys(c) && + ret == BCH_MERGE_PARTIAL) + 
bch2_bkey_debugcheck(c, b, bkey_i_to_s_c(&ri.k)); + /* * check if we overlap with deleted extents - would break the sort * order: @@ -1702,7 +1794,6 @@ static bool bch2_extent_merge_inline(struct bch_fs *c, bch2_bset_fix_invalidated_key(b, m); bch2_btree_node_iter_fix(iter, b, node_iter, m, m->u64s, m->u64s); - verify_modified_extent(iter, m); return ret == BCH_MERGE_MERGE; } diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index fe92737354bd..613d76af69d9 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -12,7 +12,8 @@ struct btree_insert_entry; /* extent entries: */ -#define extent_entry_last(_e) bkey_val_end(_e) +#define extent_entry_last(_e) \ + ((typeof(&(_e).v->start[0])) bkey_val_end(_e)) #define entry_to_ptr(_entry) \ ({ \ @@ -258,6 +259,27 @@ out: \ __bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \ _ptr, _entry) +#define bkey_crc_next(_k, _start, _end, _crc, _iter) \ +({ \ + __bkey_extent_entry_for_each_from(_iter, _end, _iter) \ + if (extent_entry_is_crc(_iter)) { \ + (_crc) = bch2_extent_crc_unpack(_k, \ + entry_to_crc(_iter)); \ + break; \ + } \ + \ + (_iter) < (_end); \ +}) + +#define __bkey_for_each_crc(_k, _start, _end, _crc, _iter) \ + for ((_crc) = bch2_extent_crc_unpack(_k, NULL), \ + (_iter) = (_start); \ + bkey_crc_next(_k, _start, _end, _crc, _iter); \ + (_iter) = extent_entry_next(_iter)) + +#define bkey_for_each_crc(_k, _p, _crc, _iter) \ + __bkey_for_each_crc(_k, (_p).start, (_p).end, _crc, _iter) + /* utility code common to all keys with pointers: */ static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) @@ -267,7 +289,7 @@ static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k); return (struct bkey_ptrs_c) { to_entry(&e.v->start[0]), - to_entry(bkey_val_end(e)) + to_entry(extent_entry_last(e)) }; } case KEY_TYPE_extent: { @@ -284,6 +306,14 @@ static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) to_entry(&s.v->ptrs[s.v->nr_blocks]), }; } + case KEY_TYPE_reflink_v: { + struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); + + return (struct bkey_ptrs_c) { + r.v->start, + bkey_val_end(r), + }; + } default: return (struct bkey_ptrs_c) { NULL, NULL }; } @@ -337,18 +367,6 @@ static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) return ret; } -static inline bool bch2_bkey_has_device(struct bkey_s_c k, unsigned dev) -{ - struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; - - bkey_for_each_ptr(p, ptr) - if (ptr->dev == dev) - return ptr; - - return NULL; -} - unsigned bch2_bkey_nr_ptrs(struct bkey_s_c); unsigned bch2_bkey_nr_dirty_ptrs(struct bkey_s_c); unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); @@ -359,6 +377,11 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, struct bch_io_failures *, struct extent_ptr_decoded *); +void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr); +void bch2_bkey_drop_device(struct bkey_s, unsigned); +const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned); +bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned); + void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c); @@ -366,8 +389,7 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c); /* bch_btree_ptr: */ const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c); -void 
bch2_btree_ptr_debugcheck(struct bch_fs *, struct btree *, - struct bkey_s_c); +void bch2_btree_ptr_debugcheck(struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *); @@ -382,7 +404,7 @@ void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *); /* bch_extent: */ const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c); -void bch2_extent_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c); +void bch2_extent_debugcheck(struct bch_fs *, struct bkey_s_c); void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); enum merge_result bch2_extent_merge(struct bch_fs *, @@ -410,8 +432,10 @@ enum merge_result bch2_reservation_merge(struct bch_fs *, .key_merge = bch2_reservation_merge, \ } -void bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *); -bool bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *); +int bch2_extent_atomic_end(struct btree_iter *, struct bkey_i *, + struct bpos *); +int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *); +int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *); enum btree_insert_ret bch2_extent_can_insert(struct btree_trans *, struct btree_insert_entry *, @@ -419,52 +443,51 @@ bch2_extent_can_insert(struct btree_trans *, struct btree_insert_entry *, void bch2_insert_fixup_extent(struct btree_trans *, struct btree_insert_entry *); -void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent, - unsigned, unsigned); +void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s, + unsigned, unsigned); const struct bch_extent_ptr * bch2_extent_has_device(struct bkey_s_c_extent, unsigned); -const struct bch_extent_ptr * -bch2_extent_has_group(struct bch_fs *, struct bkey_s_c_extent, unsigned); -const struct bch_extent_ptr * -bch2_extent_has_target(struct bch_fs *, struct bkey_s_c_extent, unsigned); unsigned bch2_extent_is_compressed(struct bkey_s_c); -bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent, - struct bch_extent_ptr, u64); +bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, + struct bch_extent_ptr, u64); -static inline bool bkey_extent_is_data(const struct bkey *k) +static inline bool bkey_extent_is_direct_data(const struct bkey *k) { switch (k->type) { case KEY_TYPE_btree_ptr: case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: return true; default: return false; } } +static inline bool bkey_extent_is_data(const struct bkey *k) +{ + return bkey_extent_is_direct_data(k) || + k->type == KEY_TYPE_reflink_p; +} + +/* + * Should extent be counted under inode->i_sectors? 
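+ * Editor's note (not in the original patch): reservations and reflink keys
+ * count here because they occupy logical space in the file, so fallocate'd
+ * and reflinked ranges still contribute to i_blocks.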
+ */ static inline bool bkey_extent_is_allocation(const struct bkey *k) { switch (k->type) { case KEY_TYPE_extent: case KEY_TYPE_reservation: + case KEY_TYPE_reflink_p: + case KEY_TYPE_reflink_v: return true; default: return false; } } -static inline bool bch2_extent_is_fully_allocated(struct bkey_s_c k) -{ - return bkey_extent_is_allocation(k.k) && - !bch2_extent_is_compressed(k); -} - -void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr); -void bch2_bkey_drop_device(struct bkey_s, unsigned); - /* Extent entry iteration: */ #define extent_for_each_entry_from(_e, _entry, _start) \ @@ -480,45 +503,16 @@ void bch2_bkey_drop_device(struct bkey_s, unsigned); #define extent_for_each_ptr(_e, _ptr) \ __bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr) -#define extent_crc_next(_e, _crc, _iter) \ -({ \ - extent_for_each_entry_from(_e, _iter, _iter) \ - if (extent_entry_is_crc(_iter)) { \ - (_crc) = bch2_extent_crc_unpack((_e).k, entry_to_crc(_iter));\ - break; \ - } \ - \ - (_iter) < extent_entry_last(_e); \ -}) - -#define extent_for_each_crc(_e, _crc, _iter) \ - for ((_crc) = bch2_extent_crc_unpack((_e).k, NULL), \ - (_iter) = (_e).v->start; \ - extent_crc_next(_e, _crc, _iter); \ - (_iter) = extent_entry_next(_iter)) - #define extent_for_each_ptr_decode(_e, _ptr, _entry) \ __bkey_for_each_ptr_decode((_e).k, (_e).v->start, \ extent_entry_last(_e), _ptr, _entry) -void bch2_extent_crc_append(struct bkey_i_extent *, - struct bch_extent_crc_unpacked); -void bch2_extent_ptr_decoded_append(struct bkey_i_extent *, +void bch2_extent_ptr_decoded_append(struct bkey_i *, struct extent_ptr_decoded *); -static inline void __extent_entry_push(struct bkey_i_extent *e) -{ - union bch_extent_entry *entry = extent_entry_last(extent_i_to_s(e)); - - EBUG_ON(bkey_val_u64s(&e->k) + extent_entry_u64s(entry) > - BKEY_EXTENT_VAL_U64s_MAX); - - e->k.u64s += extent_entry_u64s(entry); -} - -bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent, +bool bch2_can_narrow_extent_crcs(struct bkey_s_c, struct bch_extent_crc_unpacked); -bool bch2_extent_narrow_crcs(struct bkey_i_extent *, struct bch_extent_crc_unpacked); +bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked); union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *); @@ -540,15 +534,26 @@ do { \ } \ } while (0) -bool __bch2_cut_front(struct bpos, struct bkey_s); +void __bch2_cut_front(struct bpos, struct bkey_s); -static inline bool bch2_cut_front(struct bpos where, struct bkey_i *k) +static inline void bch2_cut_front(struct bpos where, struct bkey_i *k) { - return __bch2_cut_front(where, bkey_i_to_s(k)); + __bch2_cut_front(where, bkey_i_to_s(k)); } bool bch2_cut_back(struct bpos, struct bkey *); -void bch2_key_resize(struct bkey *, unsigned); + +/** + * bch_key_resize - adjust size of @k + * + * bkey_start_offset(k) will be preserved, modifies where the extent ends + */ +static inline void bch2_key_resize(struct bkey *k, unsigned new_size) +{ + k->p.offset -= k->size; + k->p.offset += new_size; + k->size = new_size; +} /* * In extent_sort_fix_overlapping(), insert_fixup_extent(), diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 1c4caa6b3a98..16b79f371853 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -16,6 +16,7 @@ #include "io.h" #include "keylist.h" #include "quota.h" +#include "reflink.h" #include <linux/aio.h> #include <linux/backing-dev.h> @@ -193,9 +194,9 @@ static int inode_set_size(struct bch_inode_info *inode, return 0; } -static int __must_check 
bch2_write_inode_size(struct bch_fs *c, - struct bch_inode_info *inode, - loff_t new_size, unsigned fields) +int __must_check bch2_write_inode_size(struct bch_fs *c, + struct bch_inode_info *inode, + loff_t new_size, unsigned fields) { struct inode_new_size s = { .new_size = new_size, @@ -277,16 +278,16 @@ static int sum_sector_overwrites(struct btree_trans *trans, return 0; } -static int bch2_extent_update(struct btree_trans *trans, - struct bch_inode_info *inode, - struct disk_reservation *disk_res, - struct quota_res *quota_res, - struct btree_iter *extent_iter, - struct bkey_i *k, - u64 new_i_size, - bool may_allocate, - bool direct, - s64 *total_delta) +int bch2_extent_update(struct btree_trans *trans, + struct bch_inode_info *inode, + struct disk_reservation *disk_res, + struct quota_res *quota_res, + struct btree_iter *extent_iter, + struct bkey_i *k, + u64 new_i_size, + bool may_allocate, + bool direct, + s64 *total_delta) { struct bch_fs *c = trans->c; struct btree_iter *inode_iter = NULL; @@ -298,13 +299,13 @@ static int bch2_extent_update(struct btree_trans *trans, s64 i_sectors_delta; int ret; - bch2_trans_begin_updates(trans); - ret = bch2_btree_iter_traverse(extent_iter); if (ret) return ret; - bch2_extent_trim_atomic(k, extent_iter); + ret = bch2_extent_trim_atomic(k, extent_iter); + if (ret) + return ret; ret = sum_sector_overwrites(trans, extent_iter, k, &allocating, @@ -448,6 +449,8 @@ static int bchfs_write_index_update(struct bch_write_op *wop) bkey_copy(&tmp.k, bch2_keylist_front(keys)); + bch2_trans_begin_updates(&trans); + ret = bch2_extent_update(&trans, inode, &wop->res, quota_res, iter, &tmp.k, @@ -501,181 +504,272 @@ static inline struct bch_io_opts io_opts(struct bch_fs *c, struct bch_inode_info /* stored in page->private: */ -/* - * bch_page_state has to (unfortunately) be manipulated with cmpxchg - we could - * almost protected it with the page lock, except that bch2_writepage_io_done has - * to update the sector counts (and from interrupt/bottom half context). - */ -struct bch_page_state { -union { struct { - /* existing data: */ - unsigned sectors:PAGE_SECTOR_SHIFT + 1; - +struct bch_page_sector { /* Uncompressed, fully allocated replicas: */ - unsigned nr_replicas:4; + unsigned nr_replicas:3; /* Owns PAGE_SECTORS * replicas_reserved sized reservation: */ - unsigned replicas_reserved:4; - - /* Owns PAGE_SECTORS sized quota reservation: */ - unsigned quota_reserved:1; - - /* - * Number of sectors on disk - for i_blocks - * Uncompressed size, not compressed size: - */ - unsigned dirty_sectors:PAGE_SECTOR_SHIFT + 1; -}; - /* for cmpxchg: */ - unsigned long v; -}; + unsigned replicas_reserved:3; + + /* i_sectors: */ + enum { + SECTOR_UNALLOCATED, + SECTOR_RESERVED, + SECTOR_DIRTY, + SECTOR_ALLOCATED, + } state:2; }; -#define page_state_cmpxchg(_ptr, _new, _expr) \ -({ \ - unsigned long _v = READ_ONCE((_ptr)->v); \ - struct bch_page_state _old; \ - \ - do { \ - _old.v = _new.v = _v; \ - _expr; \ - \ - EBUG_ON(_new.sectors + _new.dirty_sectors > PAGE_SECTORS);\ - } while (_old.v != _new.v && \ - (_v = cmpxchg(&(_ptr)->v, _old.v, _new.v)) != _old.v); \ - \ - _old; \ -}) +struct bch_page_state { + atomic_t write_count; + struct bch_page_sector s[PAGE_SECTORS]; +}; -static inline struct bch_page_state *page_state(struct page *page) +static inline struct bch_page_state *__bch2_page_state(struct page *page) { - struct bch_page_state *s = (void *) &page->private; - - BUILD_BUG_ON(sizeof(*s) > sizeof(page->private)); + return page_has_private(page) + ? 
(struct bch_page_state *) page_private(page) + : NULL; +} - if (!PagePrivate(page)) - SetPagePrivate(page); +static inline struct bch_page_state *bch2_page_state(struct page *page) +{ + EBUG_ON(!PageLocked(page)); - return s; + return __bch2_page_state(page); } -static inline unsigned page_res_sectors(struct bch_page_state s) +/* for newly allocated pages: */ +static void __bch2_page_state_release(struct page *page) { + struct bch_page_state *s = __bch2_page_state(page); + + if (!s) + return; - return s.replicas_reserved * PAGE_SECTORS; + ClearPagePrivate(page); + set_page_private(page, 0); + put_page(page); + kfree(s); } -static void __bch2_put_page_reservation(struct bch_fs *c, struct bch_inode_info *inode, - struct bch_page_state s) +static void bch2_page_state_release(struct page *page) { - struct disk_reservation res = { .sectors = page_res_sectors(s) }; - struct quota_res quota_res = { .sectors = s.quota_reserved ? PAGE_SECTORS : 0 }; + struct bch_page_state *s = bch2_page_state(page); - bch2_quota_reservation_put(c, inode, "a_res); - bch2_disk_reservation_put(c, &res); + if (!s) + return; + + ClearPagePrivate(page); + set_page_private(page, 0); + put_page(page); + kfree(s); } -static void bch2_put_page_reservation(struct bch_fs *c, struct bch_inode_info *inode, - struct page *page) +/* for newly allocated pages: */ +static struct bch_page_state *__bch2_page_state_create(struct page *page, + gfp_t gfp) { - struct bch_page_state s; - - EBUG_ON(!PageLocked(page)); + struct bch_page_state *s; - s = page_state_cmpxchg(page_state(page), s, { - s.replicas_reserved = 0; - s.quota_reserved = 0; - }); + s = kzalloc(sizeof(*s), GFP_NOFS|gfp); + if (!s) + return NULL; - __bch2_put_page_reservation(c, inode, s); + /* + * migrate_page_move_mapping() assumes that pages with private data + * have their count elevated by 1. + */ + get_page(page); + set_page_private(page, (unsigned long) s); + SetPagePrivate(page); + return s; } -static int bch2_get_page_reservation(struct bch_fs *c, struct bch_inode_info *inode, - struct page *page, bool check_enospc) +static struct bch_page_state *bch2_page_state_create(struct page *page, + gfp_t gfp) { - struct bch_page_state *s = page_state(page), new; + return bch2_page_state(page) ?: __bch2_page_state_create(page, gfp); +} +static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) +{ /* XXX: this should not be open coded */ - unsigned nr_replicas = inode->ei_inode.bi_data_replicas + return inode->ei_inode.bi_data_replicas ? inode->ei_inode.bi_data_replicas - 1 : c->opts.data_replicas; - struct disk_reservation disk_res; - struct quota_res quota_res = { 0 }; +} + +static inline unsigned sectors_to_reserve(struct bch_page_sector *s, + unsigned nr_replicas) +{ + return max(0, (int) nr_replicas - + s->nr_replicas - + s->replicas_reserved); +} + +static int bch2_get_page_disk_reservation(struct bch_fs *c, + struct bch_inode_info *inode, + struct page *page, bool check_enospc) +{ + struct bch_page_state *s = bch2_page_state_create(page, 0); + unsigned nr_replicas = inode_nr_replicas(c, inode); + struct disk_reservation disk_res = { 0 }; + unsigned i, disk_res_sectors = 0; int ret; - EBUG_ON(!PageLocked(page)); + if (!s) + return -ENOMEM; - if (s->replicas_reserved < nr_replicas) { - ret = bch2_disk_reservation_get(c, &disk_res, PAGE_SECTORS, - nr_replicas - s->replicas_reserved, - !check_enospc ? 
BCH_DISK_RESERVATION_NOFAIL : 0); - if (unlikely(ret)) - return ret; + for (i = 0; i < ARRAY_SIZE(s->s); i++) + disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas); + + if (!disk_res_sectors) + return 0; + + ret = bch2_disk_reservation_get(c, &disk_res, + disk_res_sectors, 1, + !check_enospc + ? BCH_DISK_RESERVATION_NOFAIL + : 0); + if (unlikely(ret)) + return ret; + + for (i = 0; i < ARRAY_SIZE(s->s); i++) + s->s[i].replicas_reserved += + sectors_to_reserve(&s->s[i], nr_replicas); + + return 0; +} + +struct bch2_page_reservation { + struct disk_reservation disk; + struct quota_res quota; +}; + +static void bch2_page_reservation_init(struct bch_fs *c, + struct bch_inode_info *inode, + struct bch2_page_reservation *res) +{ + memset(res, 0, sizeof(*res)); + + res->disk.nr_replicas = inode_nr_replicas(c, inode); +} + +static void bch2_page_reservation_put(struct bch_fs *c, + struct bch_inode_info *inode, + struct bch2_page_reservation *res) +{ + bch2_disk_reservation_put(c, &res->disk); + bch2_quota_reservation_put(c, inode, &res->quota); +} + +static int bch2_page_reservation_get(struct bch_fs *c, + struct bch_inode_info *inode, struct page *page, + struct bch2_page_reservation *res, + unsigned offset, unsigned len, bool check_enospc) +{ + struct bch_page_state *s = bch2_page_state_create(page, 0); + unsigned i, disk_sectors = 0, quota_sectors = 0; + int ret; + + if (!s) + return -ENOMEM; - page_state_cmpxchg(s, new, ({ - BUG_ON(new.replicas_reserved + - disk_res.nr_replicas != nr_replicas); - new.replicas_reserved += disk_res.nr_replicas; - })); + for (i = round_down(offset, block_bytes(c)) >> 9; + i < round_up(offset + len, block_bytes(c)) >> 9; + i++) { + disk_sectors += sectors_to_reserve(&s->s[i], + res->disk.nr_replicas); + quota_sectors += s->s[i].state == SECTOR_UNALLOCATED; } - if (!s->quota_reserved && - s->sectors + s->dirty_sectors < PAGE_SECTORS) { - ret = bch2_quota_reservation_add(c, inode, "a_res, - PAGE_SECTORS, - check_enospc); + if (disk_sectors) { + ret = bch2_disk_reservation_add(c, &res->disk, + disk_sectors, + !check_enospc + ? 
BCH_DISK_RESERVATION_NOFAIL + : 0); if (unlikely(ret)) return ret; + } + + if (quota_sectors) { + ret = bch2_quota_reservation_add(c, inode, &res->quota, + quota_sectors, + check_enospc); + if (unlikely(ret)) { + struct disk_reservation tmp = { + .sectors = disk_sectors + }; - page_state_cmpxchg(s, new, ({ - BUG_ON(new.quota_reserved); - new.quota_reserved = 1; - })); + bch2_disk_reservation_put(c, &tmp); + res->disk.sectors -= disk_sectors; + return ret; + } } - return ret; + return 0; } static void bch2_clear_page_bits(struct page *page) { struct bch_inode_info *inode = to_bch_ei(page->mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_page_state s; - - EBUG_ON(!PageLocked(page)); + struct bch_page_state *s = bch2_page_state(page); + struct disk_reservation disk_res = { 0 }; + int i, dirty_sectors = 0; - if (!PagePrivate(page)) + if (!s) return; - s.v = xchg(&page_state(page)->v, 0); - ClearPagePrivate(page); + for (i = 0; i < ARRAY_SIZE(s->s); i++) { + disk_res.sectors += s->s[i].replicas_reserved; + s->s[i].replicas_reserved = 0; + + if (s->s[i].state == SECTOR_DIRTY) { + dirty_sectors++; + s->s[i].state = SECTOR_UNALLOCATED; + } + } + + bch2_disk_reservation_put(c, &disk_res); - if (s.dirty_sectors) - i_sectors_acct(c, inode, NULL, -s.dirty_sectors); + if (dirty_sectors) + i_sectors_acct(c, inode, NULL, -dirty_sectors); - __bch2_put_page_reservation(c, inode, s); + bch2_page_state_release(page); } -int bch2_set_page_dirty(struct page *page) +static void bch2_set_page_dirty(struct bch_fs *c, + struct bch_inode_info *inode, struct page *page, + struct bch2_page_reservation *res, + unsigned offset, unsigned len) { - struct bch_inode_info *inode = to_bch_ei(page->mapping->host); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct quota_res quota_res = { 0 }; - struct bch_page_state old, new; + struct bch_page_state *s = bch2_page_state(page); + unsigned i, dirty_sectors = 0; + + for (i = round_down(offset, block_bytes(c)) >> 9; + i < round_up(offset + len, block_bytes(c)) >> 9; + i++) { + unsigned sectors = sectors_to_reserve(&s->s[i], + res->disk.nr_replicas); - old = page_state_cmpxchg(page_state(page), new, - new.dirty_sectors = PAGE_SECTORS - new.sectors; - new.quota_reserved = 0; - ); + BUG_ON(sectors > res->disk.sectors); + s->s[i].replicas_reserved += sectors; + res->disk.sectors -= sectors; - quota_res.sectors += old.quota_reserved * PAGE_SECTORS; + if (s->s[i].state == SECTOR_UNALLOCATED) + dirty_sectors++; - if (old.dirty_sectors != new.dirty_sectors) - i_sectors_acct(c, inode, "a_res, - new.dirty_sectors - old.dirty_sectors); - bch2_quota_reservation_put(c, inode, "a_res); + s->s[i].state = max_t(unsigned, s->s[i].state, SECTOR_DIRTY); + } + + if (dirty_sectors) + i_sectors_acct(c, inode, &res->quota, dirty_sectors); - return __set_page_dirty_nobuffers(page); + if (!PageDirty(page)) + __set_page_dirty_nobuffers(page); } vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) @@ -685,8 +779,13 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) struct bch_inode_info *inode = file_bch_inode(file); struct address_space *mapping = inode->v.i_mapping; struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch2_page_reservation res; + unsigned len; + loff_t isize; int ret = VM_FAULT_LOCKED; + bch2_page_reservation_init(c, inode, &res); + sb_start_pagefault(inode->v.i_sb); file_update_time(file); @@ -700,26 +799,35 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) pagecache_add_get(&mapping->add_lock); lock_page(page); - if (page->mapping != mapping || - 
page_offset(page) > i_size_read(&inode->v)) { + isize = i_size_read(&inode->v); + + if (page->mapping != mapping || page_offset(page) >= isize) { unlock_page(page); ret = VM_FAULT_NOPAGE; goto out; } - if (bch2_get_page_reservation(c, inode, page, true)) { + /* page is wholly or partially inside EOF */ + if (((page->index + 1) << PAGE_SHIFT) <= isize) + len = PAGE_SIZE; + else + len = offset_in_page(isize); + + if (bch2_page_reservation_get(c, inode, page, &res, 0, len, true)) { unlock_page(page); ret = VM_FAULT_SIGBUS; goto out; } - if (!PageDirty(page)) - set_page_dirty(page); + bch2_set_page_dirty(c, inode, page, &res, 0, len); wait_for_stable_page(page); out: if (current->pagecache_lock != &mapping->add_lock) pagecache_add_put(&mapping->add_lock); sb_end_pagefault(inode->v.i_sb); + + bch2_page_reservation_put(c, inode, &res); + return ret; } @@ -757,53 +865,36 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage, EBUG_ON(!PageLocked(page)); EBUG_ON(!PageLocked(newpage)); - ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0); + ret = migrate_page_move_mapping(mapping, newpage, page, mode, 0); if (ret != MIGRATEPAGE_SUCCESS) return ret; if (PagePrivate(page)) { - *page_state(newpage) = *page_state(page); ClearPagePrivate(page); + get_page(newpage); + set_page_private(newpage, page_private(page)); + set_page_private(page, 0); + put_page(page); + SetPagePrivate(newpage); } - migrate_page_copy(newpage, page); + if (mode != MIGRATE_SYNC_NO_COPY) + migrate_page_copy(newpage, page); + else + migrate_page_states(newpage, page); return MIGRATEPAGE_SUCCESS; } #endif -/* readpages/writepages: */ - -static bool bio_can_add_page_contig(struct bio *bio, struct page *page) -{ - sector_t offset = (sector_t) page->index << PAGE_SECTOR_SHIFT; - - return bio->bi_vcnt < bio->bi_max_vecs && - bio_end_sector(bio) == offset; -} - -static int bio_add_page_contig(struct bio *bio, struct page *page) -{ - sector_t offset = (sector_t) page->index << PAGE_SECTOR_SHIFT; - - EBUG_ON(!bio->bi_max_vecs); - - if (!bio->bi_vcnt) - bio->bi_iter.bi_sector = offset; - else if (!bio_can_add_page_contig(bio, page)) - return -1; - - __bio_add_page(bio, page, PAGE_SIZE, 0); - return 0; -} - /* readpage(s): */ static void bch2_readpages_end_io(struct bio *bio) { + struct bvec_iter_all iter; struct bio_vec *bv; int i; - bio_for_each_segment_all(bv, bio, i) { + bio_for_each_segment_all(bv, bio, i, iter) { struct page *page = bv->bv_page; if (!bio->bi_status) { @@ -849,7 +940,8 @@ static int readpages_iter_init(struct readpages_iter *iter, while (!list_empty(pages)) { struct page *page = list_last_entry(pages, struct page, lru); - prefetchw(&page->flags); + __bch2_page_state_create(page, __GFP_NOFAIL); + iter->pages[iter->nr_pages++] = page; list_del(&page->lru); } @@ -885,6 +977,7 @@ static inline struct page *readpage_iter_next(struct readpages_iter *iter) iter->idx++; iter->nr_added++; + __bch2_page_state_release(page); put_page(page); } @@ -895,7 +988,6 @@ static inline struct page *readpage_iter_next(struct readpages_iter *iter) out: EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx); - page_state_init_for_read(iter->pages[iter->idx]); return iter->pages[iter->idx]; } @@ -903,31 +995,31 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k) { struct bvec_iter iter; struct bio_vec bv; - unsigned nr_ptrs = bch2_bkey_nr_ptrs_allocated(k); + unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v + ? 
0 : bch2_bkey_nr_ptrs_allocated(k); + unsigned state = k.k->type == KEY_TYPE_reservation + ? SECTOR_RESERVED + : SECTOR_ALLOCATED; bio_for_each_segment(bv, bio, iter) { - /* brand new pages, don't need to be locked: */ - - struct bch_page_state *s = page_state(bv.bv_page); - - /* sectors in @k from the start of this page: */ - unsigned k_sectors = k.k->size - (iter.bi_sector - k.k->p.offset); - - unsigned page_sectors = min(bv.bv_len >> 9, k_sectors); - - s->nr_replicas = page_sectors == PAGE_SECTORS - ? nr_ptrs : 0; - - BUG_ON(s->sectors + page_sectors > PAGE_SECTORS); - s->sectors += page_sectors; + struct bch_page_state *s = bch2_page_state(bv.bv_page); + unsigned i; + + for (i = bv.bv_offset >> 9; + i < (bv.bv_offset + bv.bv_len) >> 9; + i++) { + s->s[i].nr_replicas = nr_ptrs; + s->s[i].state = state; + } } } static void readpage_bio_extend(struct readpages_iter *iter, - struct bio *bio, u64 offset, + struct bio *bio, + unsigned sectors_this_extent, bool get_more) { - while (bio_end_sector(bio) < offset && + while (bio_sectors(bio) < sectors_this_extent && bio->bi_vcnt < bio->bi_max_vecs) { pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTOR_SHIFT; struct page *page = readpage_iter_next(iter); @@ -942,23 +1034,23 @@ static void readpage_bio_extend(struct readpages_iter *iter, if (!get_more) break; - rcu_read_lock(); - page = radix_tree_lookup(&iter->mapping->i_pages, page_offset); - rcu_read_unlock(); - - if (page && !radix_tree_exceptional_entry(page)) + page = xa_load(&iter->mapping->i_pages, page_offset); + if (page && !xa_is_value(page)) break; page = __page_cache_alloc(readahead_gfp_mask(iter->mapping)); if (!page) break; - page_state_init_for_read(page); + if (!__bch2_page_state_create(page, 0)) { + put_page(page); + break; + } ret = add_to_page_cache_lru(page, iter->mapping, page_offset, GFP_NOFS); if (ret) { - ClearPagePrivate(page); + __bch2_page_state_release(page); put_page(page); break; } @@ -966,7 +1058,7 @@ static void readpage_bio_extend(struct readpages_iter *iter, put_page(page); } - __bio_add_page(bio, page, PAGE_SIZE, 0); + BUG_ON(!bio_add_page(bio, page, PAGE_SIZE, 0)); } } @@ -975,71 +1067,82 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter, struct readpages_iter *readpages_iter) { struct bch_fs *c = trans->c; - struct bio *bio = &rbio->bio; int flags = BCH_READ_RETRY_IF_STALE| BCH_READ_MAY_PROMOTE; + int ret = 0; rbio->c = c; rbio->start_time = local_clock(); - +retry: while (1) { BKEY_PADDED(k) tmp; struct bkey_s_c k; - unsigned bytes; + unsigned bytes, sectors, offset_into_extent; - bch2_btree_iter_set_pos(iter, POS(inum, bio->bi_iter.bi_sector)); + bch2_btree_iter_set_pos(iter, + POS(inum, rbio->bio.bi_iter.bi_sector)); k = bch2_btree_iter_peek_slot(iter); - BUG_ON(!k.k); - - if (IS_ERR(k.k)) { - int ret = btree_iter_err(iter); - BUG_ON(!ret); - bcache_io_error(c, bio, "btree IO error %i", ret); - bio_endio(bio); - return; - } + ret = bkey_err(k); + if (ret) + break; bkey_reassemble(&tmp.k, k); - bch2_trans_unlock(trans); k = bkey_i_to_s_c(&tmp.k); + offset_into_extent = iter->pos.offset - + bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + + ret = bch2_read_indirect_extent(trans, + &offset_into_extent, &tmp.k); + if (ret) + break; + + sectors = min(sectors, k.k->size - offset_into_extent); + + bch2_trans_unlock(trans); + if (readpages_iter) { bool want_full_extent = false; if (bkey_extent_is_data(k.k)) { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 
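+			/*
+			 * Editor's note (not in the original patch): a nonzero
+			 * csum_type or compression_type below forces
+			 * want_full_extent, since checksum verification and
+			 * decompression need the whole checksummed region in a
+			 * single read rather than piecemeal pages.
+			 */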
const union bch_extent_entry *i; struct extent_ptr_decoded p; - extent_for_each_ptr_decode(e, p, i) + bkey_for_each_ptr_decode(k.k, ptrs, p, i) want_full_extent |= ((p.crc.csum_type != 0) | (p.crc.compression_type != 0)); } - readpage_bio_extend(readpages_iter, - bio, k.k->p.offset, - want_full_extent); + readpage_bio_extend(readpages_iter, &rbio->bio, + sectors, want_full_extent); } - bytes = (min_t(u64, k.k->p.offset, bio_end_sector(bio)) - - bio->bi_iter.bi_sector) << 9; - swap(bio->bi_iter.bi_size, bytes); + bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; + swap(rbio->bio.bi_iter.bi_size, bytes); - if (bytes == bio->bi_iter.bi_size) + if (rbio->bio.bi_iter.bi_size == bytes) flags |= BCH_READ_LAST_FRAGMENT; if (bkey_extent_is_allocation(k.k)) - bch2_add_page_sectors(bio, k); + bch2_add_page_sectors(&rbio->bio, k); - bch2_read_extent(c, rbio, k, flags); + bch2_read_extent(c, rbio, k, offset_into_extent, flags); if (flags & BCH_READ_LAST_FRAGMENT) return; - swap(bio->bi_iter.bi_size, bytes); - bio_advance(bio, bytes); + swap(rbio->bio.bi_iter.bi_size, bytes); + bio_advance(&rbio->bio, bytes); } + + if (ret == -EINTR) + goto retry; + + bcache_io_error(c, &rbio->bio, "btree IO error %i", ret); + bio_endio(&rbio->bio); } int bch2_readpages(struct file *file, struct address_space *mapping, @@ -1080,7 +1183,7 @@ int bch2_readpages(struct file *file, struct address_space *mapping, bio_set_op_attrs(&rbio->bio, REQ_OP_READ, 0); rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTOR_SHIFT; rbio->bio.bi_end_io = bch2_readpages_end_io; - __bio_add_page(&rbio->bio, page, PAGE_SIZE, 0); + BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); bchfs_read(&trans, iter, rbio, inode->v.i_ino, &readpages_iter); @@ -1101,10 +1204,12 @@ static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, struct btree_trans trans; struct btree_iter *iter; - page_state_init_for_read(page); + bch2_page_state_create(page, __GFP_NOFAIL); bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC); - bio_add_page_contig(&rbio->bio, page); + rbio->bio.bi_iter.bi_sector = + (sector_t) page->index << PAGE_SECTOR_SHIFT; + BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, @@ -1188,13 +1293,22 @@ static void bch2_writepage_io_done(struct closure *cl) struct bch_writepage_io, cl); struct bch_fs *c = io->op.op.c; struct bio *bio = &io->op.op.wbio.bio; + struct bvec_iter_all iter; struct bio_vec *bvec; - unsigned i; + unsigned i, j; if (io->op.op.error) { - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, i, iter) { + struct bch_page_state *s; + SetPageError(bvec->bv_page); mapping_set_error(bvec->bv_page->mapping, -EIO); + + lock_page(bvec->bv_page); + s = bch2_page_state(bvec->bv_page); + for (j = 0; j < PAGE_SECTORS; j++) + s->s[j].nr_replicas = 0; + unlock_page(bvec->bv_page); } } @@ -1219,8 +1333,12 @@ static void bch2_writepage_io_done(struct closure *cl) i_sectors_acct(c, io->op.inode, NULL, io->op.sectors_added - (s64) io->new_sectors); - bio_for_each_segment_all(bvec, bio, i) - end_page_writeback(bvec->bv_page); + bio_for_each_segment_all(bvec, bio, i, iter) { + struct bch_page_state *s = __bch2_page_state(bvec->bv_page); + + if (atomic_dec_and_test(&s->write_count)) + end_page_writeback(bvec->bv_page); + } closure_return_with_destructor(&io->cl, bch2_writepage_io_free); } @@ -1241,11 +1359,10 @@ static void bch2_writepage_do_io(struct bch_writepage_state *w) static void 
bch2_writepage_io_alloc(struct bch_fs *c, struct bch_writepage_state *w, struct bch_inode_info *inode, - struct page *page, + u64 sector, unsigned nr_replicas) { struct bch_write_op *op; - u64 offset = (u64) page->index << PAGE_SECTOR_SHIFT; w->io = container_of(bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, @@ -1259,8 +1376,8 @@ static void bch2_writepage_io_alloc(struct bch_fs *c, op->nr_replicas = nr_replicas; op->res.nr_replicas = nr_replicas; op->write_point = writepoint_hashed(inode->ei_last_dirtied); - op->pos = POS(inode->v.i_ino, offset); - op->wbio.bio.bi_iter.bi_sector = offset; + op->pos = POS(inode->v.i_ino, sector); + op->wbio.bio.bi_iter.bi_sector = sector; } static int __bch2_writepage(struct page *page, @@ -1270,10 +1387,11 @@ static int __bch2_writepage(struct page *page, struct bch_inode_info *inode = to_bch_ei(page->mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_writepage_state *w = data; - struct bch_page_state new, old; - unsigned offset, nr_replicas_this_write; + struct bch_page_state *s, orig; + unsigned i, offset, nr_replicas_this_write = U32_MAX; loff_t i_size = i_size_read(&inode->v); pgoff_t end_index = i_size >> PAGE_SHIFT; + int ret; EBUG_ON(!PageUptodate(page)); @@ -1297,57 +1415,104 @@ static int __bch2_writepage(struct page *page, */ zero_user_segment(page, offset, PAGE_SIZE); do_io: - EBUG_ON(!PageLocked(page)); + s = bch2_page_state_create(page, __GFP_NOFAIL); + + ret = bch2_get_page_disk_reservation(c, inode, page, true); + if (ret) { + SetPageError(page); + mapping_set_error(page->mapping, ret); + unlock_page(page); + return 0; + } + + /* Before unlocking the page, get copy of reservations: */ + orig = *s; + + for (i = 0; i < PAGE_SECTORS; i++) { + if (s->s[i].state < SECTOR_DIRTY) + continue; - /* Before unlocking the page, transfer reservation to w->io: */ - old = page_state_cmpxchg(page_state(page), new, { - /* - * If we didn't get a reservation, we can only write out the - * number of (fully allocated) replicas that currently exist, - * and only if the entire page has been written: - */ nr_replicas_this_write = - max_t(unsigned, - new.replicas_reserved, - (new.sectors == PAGE_SECTORS - ? new.nr_replicas : 0)); + min_t(unsigned, nr_replicas_this_write, + s->s[i].nr_replicas + + s->s[i].replicas_reserved); + } - BUG_ON(!nr_replicas_this_write); + for (i = 0; i < PAGE_SECTORS; i++) { + if (s->s[i].state < SECTOR_DIRTY) + continue; - new.nr_replicas = w->opts.compression - ? 0 - : nr_replicas_this_write; + s->s[i].nr_replicas = w->opts.compression + ? 
0 : nr_replicas_this_write; - new.replicas_reserved = 0; + s->s[i].replicas_reserved = 0; + s->s[i].state = SECTOR_ALLOCATED; + } - new.sectors += new.dirty_sectors; - BUG_ON(new.sectors != PAGE_SECTORS); - new.dirty_sectors = 0; - }); + BUG_ON(atomic_read(&s->write_count)); + atomic_set(&s->write_count, 1); BUG_ON(PageWriteback(page)); set_page_writeback(page); + unlock_page(page); - if (w->io && - (w->io->op.op.res.nr_replicas != nr_replicas_this_write || - !bio_can_add_page_contig(&w->io->op.op.wbio.bio, page))) - bch2_writepage_do_io(w); + offset = 0; + while (1) { + unsigned sectors = 1, dirty_sectors = 0, reserved_sectors = 0; + u64 sector; + + while (offset < PAGE_SECTORS && + orig.s[offset].state < SECTOR_DIRTY) + offset++; - if (!w->io) - bch2_writepage_io_alloc(c, w, inode, page, - nr_replicas_this_write); + if (offset == PAGE_SECTORS) + break; + + sector = ((u64) page->index << PAGE_SECTOR_SHIFT) + offset; - w->io->new_sectors += new.sectors - old.sectors; + while (offset + sectors < PAGE_SECTORS && + orig.s[offset + sectors].state >= SECTOR_DIRTY) + sectors++; - BUG_ON(inode != w->io->op.inode); - BUG_ON(bio_add_page_contig(&w->io->op.op.wbio.bio, page)); + for (i = offset; i < offset + sectors; i++) { + reserved_sectors += orig.s[i].replicas_reserved; + dirty_sectors += orig.s[i].state == SECTOR_DIRTY; + } - w->io->op.op.res.sectors += old.replicas_reserved * PAGE_SECTORS; - w->io->op.new_i_size = i_size; + if (w->io && + (w->io->op.op.res.nr_replicas != nr_replicas_this_write || + bio_full(&w->io->op.op.wbio.bio) || + bio_end_sector(&w->io->op.op.wbio.bio) != sector)) + bch2_writepage_do_io(w); - if (wbc->sync_mode == WB_SYNC_ALL) - w->io->op.op.wbio.bio.bi_opf |= REQ_SYNC; + if (!w->io) + bch2_writepage_io_alloc(c, w, inode, sector, + nr_replicas_this_write); + + w->io->new_sectors += dirty_sectors; + + atomic_inc(&s->write_count); + + BUG_ON(inode != w->io->op.inode); + BUG_ON(!bio_add_page(&w->io->op.op.wbio.bio, page, + sectors << 9, offset << 9)); + + /* Check for writing past i_size: */ + BUG_ON((bio_end_sector(&w->io->op.op.wbio.bio) << 9) > + round_up(i_size, block_bytes(c))); + + w->io->op.op.res.sectors += reserved_sectors; + w->io->op.new_i_size = i_size; + + if (wbc->sync_mode == WB_SYNC_ALL) + w->io->op.op.wbio.bio.bi_opf |= REQ_SYNC; + + offset += sectors; + } + + if (atomic_dec_and_test(&s->write_count)) + end_page_writeback(page); return 0; } @@ -1390,12 +1555,18 @@ int bch2_write_begin(struct file *file, struct address_space *mapping, { struct bch_inode_info *inode = to_bch_ei(mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch2_page_reservation *res; pgoff_t index = pos >> PAGE_SHIFT; unsigned offset = pos & (PAGE_SIZE - 1); struct page *page; int ret = -ENOMEM; - BUG_ON(inode_unhashed(&inode->v)); + res = kmalloc(sizeof(*res), GFP_KERNEL); + if (!res) + return -ENOMEM; + + bch2_page_reservation_init(c, inode, res); + *fsdata = res; /* Not strictly necessary - same reason as mkwrite(): */ pagecache_add_get(&mapping->add_lock); @@ -1427,7 +1598,8 @@ readpage: if (ret) goto err; out: - ret = bch2_get_page_reservation(c, inode, page, true); + ret = bch2_page_reservation_get(c, inode, page, res, + offset, len, true); if (ret) { if (!PageUptodate(page)) { /* @@ -1450,6 +1622,8 @@ err: *pagep = NULL; err_unlock: pagecache_add_put(&mapping->add_lock); + kfree(res); + *fsdata = NULL; return ret; } @@ -1459,6 +1633,8 @@ int bch2_write_end(struct file *file, struct address_space *mapping, { struct bch_inode_info *inode = to_bch_ei(mapping->host); 
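+	/*
+	 * Editor's note (not in the original patch): the bch2_page_reservation
+	 * arriving via fsdata was allocated in bch2_write_begin();
+	 * bch2_set_page_dirty() below charges the copied range against it, and
+	 * bch2_page_reservation_put() drops whatever is left before the
+	 * struct is freed.
+	 */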
struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch2_page_reservation *res = fsdata; + unsigned offset = pos & (PAGE_SIZE - 1); lockdep_assert_held(&inode->v.i_rwsem); @@ -1481,18 +1657,19 @@ int bch2_write_end(struct file *file, struct address_space *mapping, if (copied) { if (!PageUptodate(page)) SetPageUptodate(page); - if (!PageDirty(page)) - set_page_dirty(page); + + bch2_set_page_dirty(c, inode, page, res, offset, copied); inode->ei_last_dirtied = (unsigned long) current; - } else { - bch2_put_page_reservation(c, inode, page); } unlock_page(page); put_page(page); pagecache_add_put(&mapping->add_lock); + bch2_page_reservation_put(c, inode, res); + kfree(res); + return copied; } @@ -1505,15 +1682,19 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct page *pages[WRITE_BATCH_PAGES]; + struct bch2_page_reservation res; unsigned long index = pos >> PAGE_SHIFT; unsigned offset = pos & (PAGE_SIZE - 1); unsigned nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); - unsigned i, copied = 0, nr_pages_copied = 0; + unsigned i, reserved = 0, set_dirty = 0; + unsigned copied = 0, nr_pages_copied = 0; int ret = 0; BUG_ON(!len); BUG_ON(nr_pages > ARRAY_SIZE(pages)); + bch2_page_reservation_init(c, inode, &res); + for (i = 0; i < nr_pages; i++) { pages[i] = grab_cache_page_write_begin(mapping, index + i, 0); if (!pages[i]) { @@ -1540,19 +1721,25 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, } } - for (i = 0; i < nr_pages; i++) { - ret = bch2_get_page_reservation(c, inode, pages[i], true); - - if (ret && !PageUptodate(pages[i])) { - ret = bch2_read_single_page(pages[i], mapping); - if (ret) - goto out; - - ret = bch2_get_page_reservation(c, inode, pages[i], true); + while (reserved < len) { + struct page *page = pages[(offset + reserved) >> PAGE_SHIFT]; + unsigned pg_offset = (offset + reserved) & (PAGE_SIZE - 1); + unsigned pg_len = min_t(unsigned, len - reserved, + PAGE_SIZE - pg_offset); +retry_reservation: + ret = bch2_page_reservation_get(c, inode, page, &res, + pg_offset, pg_len, true); + + if (ret && !PageUptodate(page)) { + ret = bch2_read_single_page(page, mapping); + if (!ret) + goto retry_reservation; } if (ret) goto out; + + reserved += pg_len; } if (mapping_writably_mapped(mapping)) @@ -1562,10 +1749,10 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, while (copied < len) { struct page *page = pages[(offset + copied) >> PAGE_SHIFT]; unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1); - unsigned pg_bytes = min_t(unsigned, len - copied, - PAGE_SIZE - pg_offset); + unsigned pg_len = min_t(unsigned, len - copied, + PAGE_SIZE - pg_offset); unsigned pg_copied = iov_iter_copy_from_user_atomic(page, - iter, pg_offset, pg_bytes); + iter, pg_offset, pg_len); if (!pg_copied) break; @@ -1595,23 +1782,30 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, copied -= (offset + copied) & (PAGE_SIZE - 1); } } -out: - for (i = 0; i < nr_pages_copied; i++) { - if (!PageUptodate(pages[i])) - SetPageUptodate(pages[i]); - if (!PageDirty(pages[i])) - set_page_dirty(pages[i]); - unlock_page(pages[i]); - put_page(pages[i]); - } + while (set_dirty < copied) { + struct page *page = pages[(offset + set_dirty) >> PAGE_SHIFT]; + unsigned pg_offset = (offset + set_dirty) & (PAGE_SIZE - 1); + unsigned pg_len = min_t(unsigned, copied - set_dirty, + PAGE_SIZE - pg_offset); + + if (!PageUptodate(page)) + SetPageUptodate(page); + + bch2_set_page_dirty(c, inode, page, &res, pg_offset, pg_len); + 
unlock_page(page); + put_page(page); + + set_dirty += pg_len; + } +out: for (i = nr_pages_copied; i < nr_pages; i++) { - if (!PageDirty(pages[i])) - bch2_put_page_reservation(c, inode, pages[i]); unlock_page(pages[i]); put_page(pages[i]); } + bch2_page_reservation_put(c, inode, &res); + return copied ?: ret; } @@ -1816,6 +2010,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) struct address_space *mapping = req->ki_filp->f_mapping; struct bch_inode_info *inode = dio->iop.inode; struct bio *bio = &dio->iop.op.wbio.bio; + struct bvec_iter_all iter; struct bio_vec *bv; loff_t offset; bool sync; @@ -1893,7 +2088,7 @@ err_wait_io: closure_sync(&dio->cl); loop: - bio_for_each_segment_all(bv, bio, i) + bio_for_each_segment_all(bv, bio, i, iter) put_page(bv->bv_page); if (!dio->iter.count || dio->iop.op.error) break; @@ -2093,29 +2288,25 @@ out: /* truncate: */ -static int __bch2_fpunch(struct bch_fs *c, struct bch_inode_info *inode, - u64 start_offset, u64 end_offset, u64 *journal_seq) +int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, + struct bpos end, struct bch_inode_info *inode, + u64 new_i_size) { - struct bpos start = POS(inode->v.i_ino, start_offset); - struct bpos end = POS(inode->v.i_ino, end_offset); + struct bch_fs *c = trans->c; unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); - struct btree_trans trans; - struct btree_iter *iter; struct bkey_s_c k; - int ret = 0; - - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, start, - BTREE_ITER_INTENT); + int ret = 0, ret2 = 0; while ((k = bch2_btree_iter_peek(iter)).k && - !(ret = bkey_err(k)) && bkey_cmp(iter->pos, end) < 0) { struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0); struct bkey_i delete; + ret = bkey_err(k); + if (ret) + goto btree_err; + bkey_init(&delete.k); delete.k.p = iter->pos; @@ -2123,21 +2314,51 @@ static int __bch2_fpunch(struct bch_fs *c, struct bch_inode_info *inode, bch2_key_resize(&delete.k, max_sectors); bch2_cut_back(end, &delete.k); - ret = bch2_extent_update(&trans, inode, + bch2_trans_begin_updates(trans); + + ret = bch2_extent_update(trans, inode, &disk_res, NULL, iter, &delete, - 0, true, true, NULL); + new_i_size, false, true, NULL); bch2_disk_reservation_put(c, &disk_res); - - if (ret == -EINTR) +btree_err: + if (ret == -EINTR) { + ret2 = ret; ret = 0; + } if (ret) break; + } - bch2_trans_cond_resched(&trans); + if (bkey_cmp(iter->pos, end) > 0) { + bch2_btree_iter_set_pos(iter, end); + ret = bch2_btree_iter_traverse(iter); } + return ret ?: ret2; +} + +static int __bch2_fpunch(struct bch_fs *c, struct bch_inode_info *inode, + u64 start_offset, u64 end_offset) +{ + struct btree_trans trans; + struct btree_iter *iter; + int ret = 0; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + POS(inode->v.i_ino, start_offset), + BTREE_ITER_INTENT); + + ret = bch2_fpunch_at(&trans, iter, + POS(inode->v.i_ino, end_offset), + inode, 0); + bch2_trans_exit(&trans); + if (ret == -EINTR) + ret = 0; + return ret; } @@ -2170,8 +2391,10 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; + struct bch_page_state *s; unsigned start_offset = start & (PAGE_SIZE - 1); unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1; + unsigned i; struct page *page; int ret = 0; @@ -2203,31 +2426,42 @@ static int __bch2_truncate_page(struct bch_inode_info 
*inode, } } + s = bch2_page_state_create(page, 0); + if (!s) { + ret = -ENOMEM; + goto unlock; + } + if (!PageUptodate(page)) { ret = bch2_read_single_page(page, mapping); if (ret) goto unlock; } + if (index != start >> PAGE_SHIFT) + start_offset = 0; + if (index != end >> PAGE_SHIFT) + end_offset = PAGE_SIZE; + + for (i = round_up(start_offset, block_bytes(c)) >> 9; + i < round_down(end_offset, block_bytes(c)) >> 9; + i++) { + s->s[i].nr_replicas = 0; + s->s[i].state = SECTOR_UNALLOCATED; + } + + zero_user_segment(page, start_offset, end_offset); + /* * Bit of a hack - we don't want truncate to fail due to -ENOSPC. * * XXX: because we aren't currently tracking whether the page has actual * data in it (vs. just 0s, or only partially written) this wrong. ick. */ - ret = bch2_get_page_reservation(c, inode, page, false); + ret = bch2_get_page_disk_reservation(c, inode, page, false); BUG_ON(ret); - if (index == start >> PAGE_SHIFT && - index == end >> PAGE_SHIFT) - zero_user_segment(page, start_offset, end_offset); - else if (index == start >> PAGE_SHIFT) - zero_user_segment(page, start_offset, PAGE_SIZE); - else if (index == end >> PAGE_SHIFT) - zero_user_segment(page, 0, end_offset); - - if (!PageDirty(page)) - set_page_dirty(page); + __set_page_dirty_nobuffers(page); unlock: unlock_page(page); put_page(page); @@ -2238,7 +2472,7 @@ out: static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from) { return __bch2_truncate_page(inode, from >> PAGE_SHIFT, - from, from + PAGE_SIZE); + from, round_up(from, PAGE_SIZE)); } static int bch2_extend(struct bch_inode_info *inode, struct iattr *iattr) @@ -2308,6 +2542,16 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) if (unlikely(ret)) goto err; + /* + * When extending, we're going to write the new i_size to disk + * immediately so we need to flush anything above the current on disk + * i_size first: + * + * Also, when extending we need to flush the page that i_size currently + * straddles - if it's mapped to userspace, we need to ensure that + * userspace has to redirty it and call .mkwrite -> set_page_dirty + * again to allocate the part of the page that was extended. 
+ */ if (iattr->ia_size > inode->ei_inode.bi_size) ret = filemap_write_and_wait_range(mapping, inode->ei_inode.bi_size, @@ -2329,13 +2573,9 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) truncate_setsize(&inode->v, iattr->ia_size); - /* - * XXX: need a comment explaining why PAGE_SIZE and not block_bytes() - * here: - */ ret = __bch2_fpunch(c, inode, - round_up(iattr->ia_size, PAGE_SIZE) >> 9, - U64_MAX, &inode->ei_journal_seq); + round_up(iattr->ia_size, block_bytes(c)) >> 9, + U64_MAX); if (unlikely(ret)) goto err; @@ -2356,8 +2596,8 @@ static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; - u64 discard_start = round_up(offset, PAGE_SIZE) >> 9; - u64 discard_end = round_down(offset + len, PAGE_SIZE) >> 9; + u64 discard_start = round_up(offset, block_bytes(c)) >> 9; + u64 discard_end = round_down(offset + len, block_bytes(c)) >> 9; int ret = 0; inode_lock(&inode->v); @@ -2382,8 +2622,7 @@ static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) truncate_pagecache_range(&inode->v, offset, offset + len - 1); if (discard_start < discard_end) - ret = __bch2_fpunch(c, inode, discard_start, discard_end, - &inode->ei_journal_seq); + ret = __bch2_fpunch(c, inode, discard_start, discard_end); err: pagecache_block_put(&mapping->add_lock); inode_unlock(&inode->v); @@ -2391,16 +2630,16 @@ err: return ret; } -static long bch2_fcollapse(struct bch_inode_info *inode, - loff_t offset, loff_t len) +static long bch2_fcollapse_finsert(struct bch_inode_info *inode, + loff_t offset, loff_t len, + bool insert) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; struct btree_trans trans; - struct btree_iter *src, *dst; - BKEY_PADDED(k) copy; - struct bkey_s_c k; - loff_t new_size; + struct btree_iter *src, *dst, *del = NULL; + loff_t shift, new_size; + u64 src_start; int ret; if ((offset | len) & (block_bytes(c) - 1)) @@ -2418,88 +2657,188 @@ static long bch2_fcollapse(struct bch_inode_info *inode, inode_dio_wait(&inode->v); pagecache_block_get(&mapping->add_lock); - ret = -EINVAL; - if (offset + len >= inode->v.i_size) - goto err; + if (insert) { + ret = -EFBIG; + if (inode->v.i_sb->s_maxbytes - inode->v.i_size < len) + goto err; - if (inode->v.i_size < len) - goto err; + ret = -EINVAL; + if (offset >= inode->v.i_size) + goto err; + + src_start = U64_MAX; + shift = len; + } else { + ret = -EINVAL; + if (offset + len >= inode->v.i_size) + goto err; + + src_start = offset + len; + shift = -len; + } - new_size = inode->v.i_size - len; + new_size = inode->v.i_size + shift; ret = write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX); if (ret) goto err; - dst = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, - POS(inode->v.i_ino, offset >> 9), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - BUG_ON(IS_ERR_OR_NULL(dst)); + if (insert) { + i_size_write(&inode->v, new_size); + mutex_lock(&inode->ei_update_lock); + ret = bch2_write_inode_size(c, inode, new_size, + ATTR_MTIME|ATTR_CTIME); + mutex_unlock(&inode->ei_update_lock); + } else { + ret = __bch2_fpunch(c, inode, offset >> 9, + (offset + len) >> 9); + if (ret) + goto err; + } src = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, - POS_MIN, BTREE_ITER_SLOTS); + POS(inode->v.i_ino, src_start >> 9), + BTREE_ITER_INTENT); BUG_ON(IS_ERR_OR_NULL(src)); - while (bkey_cmp(dst->pos, - POS(inode->v.i_ino, - round_up(new_size, PAGE_SIZE) >> 9)) < 0) { - struct 
disk_reservation disk_res; + dst = bch2_trans_copy_iter(&trans, src); + BUG_ON(IS_ERR_OR_NULL(dst)); + + while (1) { + struct disk_reservation disk_res = + bch2_disk_reservation_init(c, 0); + BKEY_PADDED(k) copy; + struct bkey_i delete; + struct bkey_s_c k; + struct bpos next_pos; + struct bpos move_pos = POS(inode->v.i_ino, offset >> 9); + struct bpos atomic_end; + unsigned commit_flags = BTREE_INSERT_NOFAIL| + BTREE_INSERT_ATOMIC| + BTREE_INSERT_USE_RESERVE; + + k = insert + ? bch2_btree_iter_peek_prev(src) + : bch2_btree_iter_peek(src); + if ((ret = bkey_err(k))) + goto bkey_err; + + if (!k.k || k.k->p.inode != inode->v.i_ino) + break; + + BUG_ON(bkey_cmp(src->pos, bkey_start_pos(k.k))); + + if (insert && + bkey_cmp(k.k->p, POS(inode->v.i_ino, offset >> 9)) <= 0) + break; +reassemble: + bkey_reassemble(&copy.k, k); + + if (insert && + bkey_cmp(bkey_start_pos(k.k), move_pos) < 0) { + bch2_cut_front(move_pos, &copy.k); + bch2_btree_iter_set_pos(src, bkey_start_pos(&copy.k.k)); + } + + copy.k.k.p.offset += shift >> 9; + bch2_btree_iter_set_pos(dst, bkey_start_pos(&copy.k.k)); ret = bch2_btree_iter_traverse(dst); if (ret) goto bkey_err; - bch2_btree_iter_set_pos(src, - POS(dst->pos.inode, dst->pos.offset + (len >> 9))); - - k = bch2_btree_iter_peek_slot(src); - if ((ret = bkey_err(k))) + ret = bch2_extent_atomic_end(dst, &copy.k, &atomic_end); + if (ret) goto bkey_err; - bkey_reassemble(&copy.k, k); + if (bkey_cmp(atomic_end, copy.k.k.p)) { + if (insert) { + move_pos = atomic_end; + move_pos.offset -= shift >> 9; + goto reassemble; + } else { + bch2_cut_back(atomic_end, &copy.k.k); + } + } - bch2_cut_front(src->pos, &copy.k); - copy.k.k.p.offset -= len >> 9; + bkey_init(&delete.k); + delete.k.p = src->pos; + bch2_key_resize(&delete.k, copy.k.k.size); + + next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p; - bch2_extent_trim_atomic(&copy.k, dst); + /* + * If the new and old keys overlap (because we're moving an + * extent that's bigger than the amount we're collapsing by), + * we need to trim the delete key here so they don't overlap + * because overlaps on insertions aren't handled before + * triggers are run, so the overwrite will get double counted + * by the triggers machinery: + */ + if (insert && + bkey_cmp(bkey_start_pos(&copy.k.k), delete.k.p) < 0) { + bch2_cut_back(bkey_start_pos(&copy.k.k), &delete.k); + } else if (!insert && + bkey_cmp(copy.k.k.p, + bkey_start_pos(&delete.k)) > 0) { + bch2_cut_front(copy.k.k.p, &delete); + + del = bch2_trans_copy_iter(&trans, src); + BUG_ON(IS_ERR_OR_NULL(del)); + + bch2_btree_iter_set_pos(del, + bkey_start_pos(&delete.k)); + } - BUG_ON(bkey_cmp(dst->pos, bkey_start_pos(&copy.k.k))); + bch2_trans_update(&trans, BTREE_INSERT_ENTRY(dst, &copy.k)); + bch2_trans_update(&trans, + BTREE_INSERT_ENTRY(del ?: src, &delete)); - ret = bch2_disk_reservation_get(c, &disk_res, copy.k.k.size, - bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&copy.k)), - BCH_DISK_RESERVATION_NOFAIL); - BUG_ON(ret); + if (copy.k.k.size == k.k->size) { + /* + * If we're moving the entire extent, we can skip + * running triggers: + */ + commit_flags |= BTREE_INSERT_NOMARK; + } else { + /* We might end up splitting compressed extents: */ + unsigned nr_ptrs = + bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&copy.k)); + + ret = bch2_disk_reservation_get(c, &disk_res, + copy.k.k.size, nr_ptrs, + BCH_DISK_RESERVATION_NOFAIL); + BUG_ON(ret); + } - ret = bch2_extent_update(&trans, inode, - &disk_res, NULL, - dst, &copy.k, - 0, true, true, NULL); + ret = bch2_trans_commit(&trans, &disk_res, + &inode->ei_journal_seq, + commit_flags); bch2_disk_reservation_put(c, 
&disk_res); bkey_err: + if (del) + bch2_trans_iter_free(&trans, del); + del = NULL; + + if (!ret) + bch2_btree_iter_set_pos(src, next_pos); + if (ret == -EINTR) ret = 0; if (ret) goto err; - /* - * XXX: if we error here we've left data with multiple - * pointers... which isn't a _super_ serious problem... - */ bch2_trans_cond_resched(&trans); } bch2_trans_unlock(&trans); - ret = __bch2_fpunch(c, inode, - round_up(new_size, block_bytes(c)) >> 9, - U64_MAX, &inode->ei_journal_seq); - if (ret) - goto err; - - i_size_write(&inode->v, new_size); - mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode_size(c, inode, new_size, - ATTR_MTIME|ATTR_CTIME); - mutex_unlock(&inode->ei_update_lock); + if (!insert) { + i_size_write(&inode->v, new_size); + mutex_lock(&inode->ei_update_lock); + ret = bch2_write_inode_size(c, inode, new_size, + ATTR_MTIME|ATTR_CTIME); + mutex_unlock(&inode->ei_update_lock); + } err: bch2_trans_exit(&trans); pagecache_block_put(&mapping->add_lock); @@ -2515,8 +2854,9 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, struct btree_trans trans; struct btree_iter *iter; struct bpos end_pos; - loff_t block_start, block_end; - loff_t end = offset + len; + loff_t end = offset + len; + loff_t block_start = round_down(offset, block_bytes(c)); + loff_t block_end = round_up(end, block_bytes(c)); unsigned sectors; unsigned replicas = io_opts(c, inode).data_replicas; int ret; @@ -2548,12 +2888,6 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, goto err; truncate_pagecache_range(&inode->v, offset, end - 1); - - block_start = round_up(offset, PAGE_SIZE); - block_end = round_down(end, PAGE_SIZE); - } else { - block_start = round_down(offset, PAGE_SIZE); - block_end = round_up(end, PAGE_SIZE); } iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, @@ -2613,6 +2947,8 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, reservation.v.nr_replicas = disk_res.nr_replicas; } + bch2_trans_begin_updates(&trans); + ret = bch2_extent_update(&trans, inode, &disk_res, &quota_res, iter, &reservation.k_i, @@ -2671,43 +3007,157 @@ long bch2_fallocate_dispatch(struct file *file, int mode, if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) return bch2_fpunch(inode, offset, len); + if (mode == FALLOC_FL_INSERT_RANGE) + return bch2_fcollapse_finsert(inode, offset, len, true); + if (mode == FALLOC_FL_COLLAPSE_RANGE) - return bch2_fcollapse(inode, offset, len); + return bch2_fcollapse_finsert(inode, offset, len, false); return -EOPNOTSUPP; } +static void mark_range_unallocated(struct bch_inode_info *inode, + loff_t start, loff_t end) +{ + pgoff_t index = start >> PAGE_SHIFT; + pgoff_t end_index = (end - 1) >> PAGE_SHIFT; + struct pagevec pvec; + + pagevec_init(&pvec); + + do { + unsigned nr_pages, i, j; + + nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping, + &index, end_index); + if (nr_pages == 0) + break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + struct bch_page_state *s; + + lock_page(page); + s = bch2_page_state(page); + + if (s) + for (j = 0; j < PAGE_SECTORS; j++) + s->s[j].nr_replicas = 0; + + unlock_page(page); + } + pagevec_release(&pvec); + } while (index <= end_index); +} + +loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, + struct file *file_dst, loff_t pos_dst, + loff_t len, unsigned remap_flags) +{ + struct bch_inode_info *src = file_bch_inode(file_src); + struct bch_inode_info *dst = file_bch_inode(file_dst); + struct bch_fs *c = src->v.i_sb->s_fs_info; + loff_t ret = 0; + 
loff_t aligned_len; + + if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY)) + return -EINVAL; + + if (remap_flags & REMAP_FILE_DEDUP) + return -EOPNOTSUPP; + + if ((pos_src & (block_bytes(c) - 1)) || + (pos_dst & (block_bytes(c) - 1))) + return -EINVAL; + + if (src == dst && + abs(pos_src - pos_dst) < len) + return -EINVAL; + + bch2_lock_inodes(INODE_LOCK, src, dst); + + inode_dio_wait(&src->v); + inode_dio_wait(&dst->v); + + __pagecache_block_get(&src->v.i_mapping->add_lock); + __pagecache_block_get(&dst->v.i_mapping->add_lock); + + ret = generic_remap_file_range_prep(file_src, pos_src, + file_dst, pos_dst, + &len, remap_flags); + if (ret < 0 || len == 0) + goto out_unlock; + + aligned_len = round_up(len, block_bytes(c)); + + ret = write_invalidate_inode_pages_range(dst->v.i_mapping, + pos_dst, pos_dst + aligned_len); + if (ret) + goto out_unlock; + + mark_range_unallocated(src, pos_src, pos_src + aligned_len); + + ret = bch2_remap_range(c, dst, + POS(dst->v.i_ino, pos_dst >> 9), + POS(src->v.i_ino, pos_src >> 9), + aligned_len >> 9, + pos_dst + len); + if (ret > 0) + ret = min(ret << 9, len); + +out_unlock: + __pagecache_block_put(&dst->v.i_mapping->add_lock); + __pagecache_block_put(&src->v.i_mapping->add_lock); + + bch2_unlock_inodes(INODE_LOCK, src, dst); + + return ret; +} + /* fseek: */ -static bool page_is_data(struct page *page) +static int page_data_offset(struct page *page, unsigned offset) { - EBUG_ON(!PageLocked(page)); + struct bch_page_state *s = bch2_page_state(page); + unsigned i; + + if (s) + for (i = offset >> 9; i < PAGE_SECTORS; i++) + if (s->s[i].state >= SECTOR_DIRTY) + return i << 9; - /* XXX: should only have to check PageDirty */ - return PagePrivate(page) && - (page_state(page)->sectors || - page_state(page)->dirty_sectors); + return -1; } -static loff_t bch2_next_pagecache_data(struct inode *vinode, +static loff_t bch2_seek_pagecache_data(struct inode *vinode, loff_t start_offset, loff_t end_offset) { struct address_space *mapping = vinode->i_mapping; struct page *page; - pgoff_t index; - - for (index = start_offset >> PAGE_SHIFT; - index < end_offset >> PAGE_SHIFT; - index++) { - if (find_get_pages(mapping, &index, 1, &page)) { + pgoff_t start_index = start_offset >> PAGE_SHIFT; + pgoff_t end_index = end_offset >> PAGE_SHIFT; + pgoff_t index = start_index; + loff_t ret; + int offset; + + while (index <= end_index) { + if (find_get_pages_range(mapping, &index, end_index, 1, &page)) { lock_page(page); - if (page_is_data(page)) - end_offset = - min(end_offset, - max(start_offset, - ((loff_t) index) << PAGE_SHIFT)); + offset = page_data_offset(page, + page->index == start_index + ? 
start_offset & (PAGE_SIZE - 1) + : 0); + if (offset >= 0) { + ret = clamp(((loff_t) page->index << PAGE_SHIFT) + + offset, + start_offset, end_offset); + unlock_page(page); + put_page(page); + return ret; + } + unlock_page(page); put_page(page); } else { @@ -2750,43 +3200,65 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) return ret; if (next_data > offset) - next_data = bch2_next_pagecache_data(&inode->v, + next_data = bch2_seek_pagecache_data(&inode->v, offset, next_data); - if (next_data > isize) + if (next_data >= isize) return -ENXIO; return vfs_setpos(file, next_data, MAX_LFS_FILESIZE); } -static bool page_slot_is_data(struct address_space *mapping, pgoff_t index) +static int __page_hole_offset(struct page *page, unsigned offset) { + struct bch_page_state *s = bch2_page_state(page); + unsigned i; + + if (!s) + return 0; + + for (i = offset >> 9; i < PAGE_SECTORS; i++) + if (s->s[i].state < SECTOR_DIRTY) + return i << 9; + + return -1; +} + +static loff_t page_hole_offset(struct address_space *mapping, loff_t offset) +{ + pgoff_t index = offset >> PAGE_SHIFT; struct page *page; - bool ret; + int pg_offset; + loff_t ret = -1; page = find_lock_entry(mapping, index); - if (!page || radix_tree_exception(page)) - return false; + if (!page || xa_is_value(page)) + return offset; + + pg_offset = __page_hole_offset(page, offset & (PAGE_SIZE - 1)); + if (pg_offset >= 0) + ret = ((loff_t) index << PAGE_SHIFT) + pg_offset; - ret = page_is_data(page); unlock_page(page); return ret; } -static loff_t bch2_next_pagecache_hole(struct inode *vinode, +static loff_t bch2_seek_pagecache_hole(struct inode *vinode, loff_t start_offset, loff_t end_offset) { struct address_space *mapping = vinode->i_mapping; - pgoff_t index; + loff_t offset = start_offset, hole; - for (index = start_offset >> PAGE_SHIFT; - index < end_offset >> PAGE_SHIFT; - index++) - if (!page_slot_is_data(mapping, index)) - end_offset = max(start_offset, - ((loff_t) index) << PAGE_SHIFT); + while (offset < end_offset) { + hole = page_hole_offset(mapping, offset); + if (hole >= 0 && hole <= end_offset) + return max(start_offset, hole); + + offset += PAGE_SIZE; + offset &= PAGE_MASK; + } return end_offset; } @@ -2811,11 +3283,11 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) POS(inode->v.i_ino, offset >> 9), BTREE_ITER_SLOTS, k, ret) { if (k.k->p.inode != inode->v.i_ino) { - next_hole = bch2_next_pagecache_hole(&inode->v, + next_hole = bch2_seek_pagecache_hole(&inode->v, offset, MAX_LFS_FILESIZE); break; } else if (!bkey_extent_is_data(k.k)) { - next_hole = bch2_next_pagecache_hole(&inode->v, + next_hole = bch2_seek_pagecache_hole(&inode->v, max(offset, bkey_start_offset(k.k) << 9), k.k->p.offset << 9); diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h index 88060b8785c3..a35732327e91 100644 --- a/fs/bcachefs/fs-io.h +++ b/fs/bcachefs/fs-io.h @@ -9,7 +9,21 @@ #include <linux/uio.h> -int bch2_set_page_dirty(struct page *); +struct quota_res; + +int bch2_extent_update(struct btree_trans *, + struct bch_inode_info *, + struct disk_reservation *, + struct quota_res *, + struct btree_iter *, + struct bkey_i *, + u64, bool, bool, s64 *); +int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, + struct bpos, struct bch_inode_info *, u64); + +int __must_check bch2_write_inode_size(struct bch_fs *, + struct bch_inode_info *, + loff_t, unsigned); int bch2_writepage(struct page *, struct writeback_control *); int bch2_readpage(struct file *, struct page *); @@ -32,6 +46,9 @@ int bch2_fsync(struct file *, 
loff_t, loff_t, int); int bch2_truncate(struct bch_inode_info *, struct iattr *); long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t); +loff_t bch2_remap_file_range(struct file *, loff_t, struct file *, + loff_t, loff_t, unsigned); + loff_t bch2_llseek(struct file *, loff_t, int); vm_fault_t bch2_page_mkwrite(struct vm_fault *); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 615b0be8b468..16017079157f 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1068,16 +1068,20 @@ static int bch2_tmpfile(struct inode *vdir, struct dentry *dentry, umode_t mode) return 0; } -static int bch2_fill_extent(struct fiemap_extent_info *info, - const struct bkey_i *k, unsigned flags) +static int bch2_fill_extent(struct bch_fs *c, + struct fiemap_extent_info *info, + struct bkey_s_c k, unsigned flags) { - if (bkey_extent_is_data(&k->k)) { - struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k); + if (bkey_extent_is_data(k.k)) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; int ret; - extent_for_each_ptr_decode(e, p, entry) { + if (k.k->type == KEY_TYPE_reflink_v) + flags |= FIEMAP_EXTENT_SHARED; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { int flags2 = 0; u64 offset = p.ptr.offset; @@ -1086,23 +1090,23 @@ static int bch2_fill_extent(struct fiemap_extent_info *info, else offset += p.crc.offset; - if ((offset & (PAGE_SECTORS - 1)) || - (e.k->size & (PAGE_SECTORS - 1))) + if ((offset & (c->opts.block_size - 1)) || + (k.k->size & (c->opts.block_size - 1))) flags2 |= FIEMAP_EXTENT_NOT_ALIGNED; ret = fiemap_fill_next_extent(info, - bkey_start_offset(e.k) << 9, + bkey_start_offset(k.k) << 9, offset << 9, - e.k->size << 9, flags|flags2); + k.k->size << 9, flags|flags2); if (ret) return ret; } return 0; - } else if (k->k.type == KEY_TYPE_reservation) { + } else if (k.k->type == KEY_TYPE_reservation) { return fiemap_fill_next_extent(info, - bkey_start_offset(&k->k) << 9, - 0, k->k.size << 9, + bkey_start_offset(k.k) << 9, + 0, k.k->size << 9, flags| FIEMAP_EXTENT_DELALLOC| FIEMAP_EXTENT_UNWRITTEN); @@ -1119,7 +1123,9 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - BKEY_PADDED(k) tmp; + BKEY_PADDED(k) cur, prev; + struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); + unsigned offset_into_extent, sectors; bool have_extent = false; int ret = 0; @@ -1128,26 +1134,63 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, - POS(ei->v.i_ino, start >> 9), 0, k, ret) - if (bkey_extent_is_data(k.k) || - k.k->type == KEY_TYPE_reservation) { - if (bkey_cmp(bkey_start_pos(k.k), - POS(ei->v.i_ino, (start + len) >> 9)) >= 0) - break; + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + POS(ei->v.i_ino, start >> 9), 0); +retry: + while ((k = bch2_btree_iter_peek(iter)).k && + !(ret = bkey_err(k)) && + bkey_cmp(iter->pos, end) < 0) { + if (!bkey_extent_is_data(k.k) && + k.k->type != KEY_TYPE_reservation) { + bch2_btree_iter_next(iter); + continue; + } - if (have_extent) { - ret = bch2_fill_extent(info, &tmp.k, 0); - if (ret) - break; - } + bkey_reassemble(&cur.k, k); + k = bkey_i_to_s_c(&cur.k); + + offset_into_extent = iter->pos.offset - + bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; - bkey_reassemble(&tmp.k, k); - have_extent = true; + ret = bch2_read_indirect_extent(&trans, + 
&offset_into_extent, &cur.k); + if (ret) + break; + + sectors = min(sectors, k.k->size - offset_into_extent); + + if (offset_into_extent) + bch2_cut_front(POS(k.k->p.inode, + bkey_start_offset(k.k) + + offset_into_extent), + &cur.k); + bch2_key_resize(&cur.k.k, sectors); + cur.k.k.p = iter->pos; + cur.k.k.p.offset += cur.k.k.size; + + if (have_extent) { + ret = bch2_fill_extent(c, info, + bkey_i_to_s_c(&prev.k), 0); + if (ret) + break; } + bkey_copy(&prev.k, &cur.k); + have_extent = true; + + if (k.k->type == KEY_TYPE_reflink_v) + bch2_btree_iter_set_pos(iter, k.k->p); + else + bch2_btree_iter_next(iter); + } + + if (ret == -EINTR) + goto retry; + if (!ret && have_extent) - ret = bch2_fill_extent(info, &tmp.k, FIEMAP_EXTENT_LAST); + ret = bch2_fill_extent(c, info, bkey_i_to_s_c(&prev.k), + FIEMAP_EXTENT_LAST); ret = bch2_trans_exit(&trans) ?: ret; return ret < 0 ? ret : 0; @@ -1196,6 +1239,7 @@ static const struct file_operations bch_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = bch2_compat_fs_ioctl, #endif + .remap_file_range = bch2_remap_file_range, }; static const struct inode_operations bch_file_inode_operations = { @@ -1266,7 +1310,7 @@ static const struct address_space_operations bch_address_space_operations = { .readpage = bch2_readpage, .writepages = bch2_writepages, .readpages = bch2_readpages, - .set_page_dirty = bch2_set_page_dirty, + .set_page_dirty = __set_page_dirty_nobuffers, .write_begin = bch2_write_begin, .write_end = bch2_write_end, .invalidatepage = bch2_invalidatepage, @@ -1412,12 +1456,6 @@ static int bch2_vfs_write_inode(struct inode *vinode, ATTR_ATIME|ATTR_MTIME|ATTR_CTIME); mutex_unlock(&inode->ei_update_lock); - if (c->opts.journal_flush_disabled) - return ret; - - if (!ret && wbc->sync_mode == WB_SYNC_ALL) - ret = bch2_journal_flush_seq(&c->journal, inode->ei_journal_seq); - return ret; } @@ -1474,6 +1512,9 @@ static int bch2_sync_fs(struct super_block *sb, int wait) { struct bch_fs *c = sb->s_fs_info; + if (c->opts.journal_flush_disabled) + return 0; + if (!wait) { bch2_journal_flush_async(&c->journal, NULL); return 0; @@ -1712,9 +1753,8 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, goto out; } - /* XXX: blocksize */ - sb->s_blocksize = PAGE_SIZE; - sb->s_blocksize_bits = PAGE_SHIFT; + sb->s_blocksize = block_bytes(c); + sb->s_blocksize_bits = ilog2(block_bytes(c)); sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_op = &bch_super_operations; sb->s_export_op = &bch_export_ops; @@ -1734,7 +1774,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, sb->s_bdi->congested_fn = bch2_congested; sb->s_bdi->congested_data = c; - sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; + sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; for_each_online_member(ca, c, i) { struct block_device *bdev = ca->disk_sb.bdev; diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index e3738757b6a0..50a7d8c1faba 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -509,7 +509,7 @@ retry: if (fsck_err_on(w.have_inode && !(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && k.k->type != KEY_TYPE_reservation && - k.k->p.offset > round_up(w.inode.bi_size, PAGE_SIZE) >> 9, c, + k.k->p.offset > round_up(w.inode.bi_size, block_bytes(c)) >> 9, c, "extent type %u offset %llu past end of inode %llu, i_size %llu", k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) { bch2_trans_unlock(&trans); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 841261b79f43..a9eda1b92b01 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -122,23 +122,23 @@ 
void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) { + struct bvec_iter_all iter; struct bio_vec *bv; unsigned i; - bio_for_each_segment_all(bv, bio, i) + bio_for_each_segment_all(bv, bio, i, iter) if (bv->bv_page != ZERO_PAGE(0)) mempool_free(bv->bv_page, &c->bio_bounce_pages); bio->bi_vcnt = 0; } -static void bch2_bio_alloc_page_pool(struct bch_fs *c, struct bio *bio, - bool *using_mempool) +static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool) { - struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt++]; + struct page *page; if (likely(!*using_mempool)) { - bv->bv_page = alloc_page(GFP_NOIO); - if (unlikely(!bv->bv_page)) { + page = alloc_page(GFP_NOIO); + if (unlikely(!page)) { mutex_lock(&c->bio_bounce_pages_lock); *using_mempool = true; goto pool_alloc; @@ -146,57 +146,29 @@ static void bch2_bio_alloc_page_pool(struct bch_fs *c, struct bio *bio, } } else { pool_alloc: - bv->bv_page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO); + page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO); } - bv->bv_len = PAGE_SIZE; - bv->bv_offset = 0; + return page; } void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, - size_t bytes) + size_t size) { bool using_mempool = false; - BUG_ON(DIV_ROUND_UP(bytes, PAGE_SIZE) > bio->bi_max_vecs); + while (size) { + struct page *page = __bio_alloc_page_pool(c, &using_mempool); + unsigned len = min(PAGE_SIZE, size); - bio->bi_iter.bi_size = bytes; - - while (bio->bi_vcnt < DIV_ROUND_UP(bytes, PAGE_SIZE)) - bch2_bio_alloc_page_pool(c, bio, &using_mempool); + BUG_ON(!bio_add_page(bio, page, len, 0)); + size -= len; + } if (using_mempool) mutex_unlock(&c->bio_bounce_pages_lock); } -void bch2_bio_alloc_more_pages_pool(struct bch_fs *c, struct bio *bio, - size_t bytes) -{ - while (bio->bi_vcnt < DIV_ROUND_UP(bytes, PAGE_SIZE)) { - struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt]; - - BUG_ON(bio->bi_vcnt >= bio->bi_max_vecs); - - bv->bv_page = alloc_page(GFP_NOIO); - if (!bv->bv_page) { - /* - * We already allocated from mempool, we can't allocate from it again - * without freeing the pages we already allocated or else we could - * deadlock: - */ - bch2_bio_free_pages_pool(c, bio); - bch2_bio_alloc_pages_pool(c, bio, bytes); - return; - } - - bv->bv_len = PAGE_SIZE; - bv->bv_offset = 0; - bio->bi_vcnt++; - } - - bio->bi_iter.bi_size = bytes; -} - /* Writes */ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, @@ -287,6 +259,8 @@ int bch2_write_index_default(struct bch_write_op *op) bch2_verify_keylist_sorted(keys); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256); +retry: + bch2_trans_begin(&trans); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, bkey_start_pos(&bch2_keylist_front(keys)->k), @@ -297,7 +271,9 @@ int bch2_write_index_default(struct bch_write_op *op) bkey_copy(&split.k, bch2_keylist_front(keys)); - bch2_extent_trim_atomic(&split.k, iter); + ret = bch2_extent_trim_atomic(&split.k, iter); + if (ret) + break; bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &split.k)); @@ -314,6 +290,11 @@ int bch2_write_index_default(struct bch_write_op *op) bch2_keylist_pop_front(keys); } while (!bch2_keylist_empty(keys)); + if (ret == -EINTR) { + ret = 0; + goto retry; + } + bch2_trans_exit(&trans); return ret; @@ -454,7 +435,7 @@ static void init_append_extent(struct bch_write_op *op, p.ptr.cached = !ca->mi.durability || (op->flags & BCH_WRITE_CACHED) != 0; p.ptr.offset += ca->mi.bucket_size - ob->sectors_free; - 
bch2_extent_ptr_decoded_append(e, &p); + bch2_extent_ptr_decoded_append(&e->k_i, &p); BUG_ON(crc.compressed_size > ob->sectors_free); ob->sectors_free -= crc.compressed_size; @@ -473,7 +454,10 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c, struct bio *bio; unsigned output_available = min(wp->sectors_free << 9, src->bi_iter.bi_size); - unsigned pages = DIV_ROUND_UP(output_available, PAGE_SIZE); + unsigned pages = DIV_ROUND_UP(output_available + + (buf + ? ((unsigned long) buf & (PAGE_SIZE - 1)) + : 0), PAGE_SIZE); bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write); wbio = wbio_init(bio); @@ -482,8 +466,7 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c, wbio->bio.bi_opf = src->bi_opf; if (buf) { - bio->bi_iter.bi_size = output_available; - bch2_bio_map(bio, buf); + bch2_bio_map(bio, buf, output_available); return bio; } @@ -493,31 +476,17 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c, * We can't use mempool for more than c->sb.encoded_extent_max * worth of pages, but we'd like to allocate more if we can: */ - while (bio->bi_iter.bi_size < output_available) { - unsigned len = min_t(unsigned, PAGE_SIZE, - output_available - bio->bi_iter.bi_size); - struct page *p; - - p = alloc_page(GFP_NOIO); - if (!p) { - unsigned pool_max = - min_t(unsigned, output_available, - c->sb.encoded_extent_max << 9); - - if (bio_sectors(bio) < pool_max) - bch2_bio_alloc_pages_pool(c, bio, pool_max); - break; - } + bch2_bio_alloc_pages_pool(c, bio, + min_t(unsigned, output_available, + c->sb.encoded_extent_max << 9)); - bio->bi_io_vec[bio->bi_vcnt++] = (struct bio_vec) { - .bv_page = p, - .bv_len = len, - .bv_offset = 0, - }; - bio->bi_iter.bi_size += len; - } + if (bio->bi_iter.bi_size < output_available) + *page_alloc_failed = + bch2_bio_alloc_pages(bio, + output_available - + bio->bi_iter.bi_size, + GFP_NOFS) != 0; - *page_alloc_failed = bio->bi_vcnt < pages; return bio; } @@ -821,12 +790,6 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) } dst->bi_iter.bi_size = total_output; - - /* Free unneeded pages after compressing: */ - if (to_wbio(dst)->bounce) - while (dst->bi_vcnt > DIV_ROUND_UP(dst->bi_iter.bi_size, PAGE_SIZE)) - mempool_free(dst->bi_io_vec[--dst->bi_vcnt].bv_page, - &c->bio_bounce_pages); do_write: /* might have done a realloc... 
*/ @@ -952,30 +915,39 @@ flush_io: void bch2_write(struct closure *cl) { struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct bio *bio = &op->wbio.bio; struct bch_fs *c = op->c; BUG_ON(!op->nr_replicas); BUG_ON(!op->write_point.v); BUG_ON(!bkey_cmp(op->pos, POS_MAX)); + if (bio_sectors(bio) & (c->opts.block_size - 1)) { + __bcache_io_error(c, "misaligned write"); + op->error = -EIO; + goto err; + } + op->start_time = local_clock(); bch2_keylist_init(&op->insert_keys, op->inline_keys); - wbio_init(&op->wbio.bio)->put_bio = false; + wbio_init(bio)->put_bio = false; if (c->opts.nochanges || !percpu_ref_tryget(&c->writes)) { __bcache_io_error(c, "read only"); op->error = -EROFS; - if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION)) - bch2_disk_reservation_put(c, &op->res); - closure_return(cl); - return; + goto err; } - bch2_increment_clock(c, bio_sectors(&op->wbio.bio), WRITE); + bch2_increment_clock(c, bio_sectors(bio), WRITE); continue_at_nobarrier(cl, __bch2_write, NULL); + return; +err: + if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION)) + bch2_disk_reservation_put(c, &op->res); + closure_return(cl); } /* Cache promotion on read */ @@ -1003,17 +975,13 @@ static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k, struct bch_io_opts opts, unsigned flags) { - if (!bkey_extent_is_data(k.k)) - return false; - if (!(flags & BCH_READ_MAY_PROMOTE)) return false; if (!opts.promote_target) return false; - if (bch2_extent_has_target(c, bkey_s_c_to_extent(k), - opts.promote_target)) + if (bch2_bkey_has_target(c, k, opts.promote_target)) return false; if (bch2_target_congested(c, opts.promote_target)) { @@ -1077,25 +1045,22 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) noinline static struct promote_op *__promote_alloc(struct bch_fs *c, + enum btree_id btree_id, struct bpos pos, struct extent_ptr_decoded *pick, struct bch_io_opts opts, - unsigned rbio_sectors, + unsigned sectors, struct bch_read_bio **rbio) { struct promote_op *op = NULL; struct bio *bio; - unsigned rbio_pages = DIV_ROUND_UP(rbio_sectors, PAGE_SECTORS); - /* data might have to be decompressed in the write path: */ - unsigned wbio_pages = DIV_ROUND_UP(pick->crc.uncompressed_size, - PAGE_SECTORS); + unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); int ret; if (!percpu_ref_tryget(&c->writes)) return NULL; - op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * wbio_pages, - GFP_NOIO); + op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO); if (!op) goto err; @@ -1103,37 +1068,32 @@ static struct promote_op *__promote_alloc(struct bch_fs *c, op->pos = pos; /* - * promotes require bouncing, but if the extent isn't - * checksummed/compressed it might be too big for the mempool: + * We don't use the mempool here because extents that aren't + * checksummed or compressed can be too big for the mempool: */ - if (rbio_sectors > c->sb.encoded_extent_max) { - *rbio = kzalloc(sizeof(struct bch_read_bio) + - sizeof(struct bio_vec) * rbio_pages, - GFP_NOIO); - if (!*rbio) - goto err; - - rbio_init(&(*rbio)->bio, opts); - bio_init(&(*rbio)->bio, (*rbio)->bio.bi_inline_vecs, - rbio_pages); + *rbio = kzalloc(sizeof(struct bch_read_bio) + + sizeof(struct bio_vec) * pages, + GFP_NOIO); + if (!*rbio) + goto err; - (*rbio)->bio.bi_iter.bi_size = rbio_sectors << 9; - bch2_bio_map(&(*rbio)->bio, NULL); + rbio_init(&(*rbio)->bio, opts); + bio_init(&(*rbio)->bio, (*rbio)->bio.bi_inline_vecs, pages); - if (bch2_bio_alloc_pages(&(*rbio)->bio, GFP_NOIO)) - goto err; + if 
(bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, + GFP_NOIO)) + goto err; - (*rbio)->bounce = true; - (*rbio)->split = true; - (*rbio)->kmalloc = true; - } + (*rbio)->bounce = true; + (*rbio)->split = true; + (*rbio)->kmalloc = true; if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, bch_promote_params)) goto err; bio = &op->write.op.wbio.bio; - bio_init(bio, bio->bi_inline_vecs, wbio_pages); + bio_init(bio, bio->bi_inline_vecs, pages); ret = bch2_migrate_write_init(c, &op->write, writepoint_hashed((unsigned long) current), @@ -1142,6 +1102,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c, (struct data_opts) { .target = opts.promote_target }, + btree_id, bkey_s_c_null); BUG_ON(ret); @@ -1167,8 +1128,9 @@ static inline struct promote_op *promote_alloc(struct bch_fs *c, bool *read_full) { bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents); + /* data might have to be decompressed in the write path: */ unsigned sectors = promote_full - ? pick->crc.compressed_size + ? max(pick->crc.compressed_size, pick->crc.live_size) : bvec_iter_sectors(iter); struct bpos pos = promote_full ? bkey_start_pos(k.k) @@ -1178,7 +1140,11 @@ static inline struct promote_op *promote_alloc(struct bch_fs *c, if (!should_promote(c, k, pos, opts, flags)) return NULL; - promote = __promote_alloc(c, pos, pick, opts, sectors, rbio); + promote = __promote_alloc(c, + k.k->type == KEY_TYPE_reflink_v + ? BTREE_ID_REFLINK + : BTREE_ID_EXTENTS, + pos, pick, opts, sectors, rbio); if (!promote) return NULL; @@ -1244,10 +1210,15 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) return rbio; } +/* + * Only called on a top level bch_read_bio to complete an entire read request, + * not a split: + */ static void bch2_rbio_done(struct bch_read_bio *rbio) { - bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read], - rbio->start_time); + if (rbio->start_time) + bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read], + rbio->start_time); bio_endio(&rbio->bio); } @@ -1279,17 +1250,16 @@ retry: k = bkey_i_to_s_c(&tmp.k); bch2_trans_unlock(&trans); - if (!bkey_extent_is_data(k.k) || - !bch2_extent_matches_ptr(c, bkey_i_to_s_c_extent(&tmp.k), - rbio->pick.ptr, - rbio->pos.offset - - rbio->pick.crc.offset)) { + if (!bch2_bkey_matches_ptr(c, bkey_i_to_s_c(&tmp.k), + rbio->pick.ptr, + rbio->pos.offset - + rbio->pick.crc.offset)) { /* extent we wanted to read no longer exists: */ rbio->hole = true; goto out; } - ret = __bch2_read_extent(c, rbio, bvec_iter, k, failed, flags); + ret = __bch2_read_extent(c, rbio, bvec_iter, k, 0, failed, flags); if (ret == READ_RETRY) goto retry; if (ret) @@ -1312,26 +1282,40 @@ static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio, struct bkey_s_c k; int ret; - bch2_trans_init(&trans, c, 0, 0); - flags &= ~BCH_READ_LAST_FRAGMENT; flags |= BCH_READ_MUST_CLONE; + + bch2_trans_init(&trans, c, 0, 0); retry: + bch2_trans_begin(&trans); + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS(inode, bvec_iter.bi_sector), BTREE_ITER_SLOTS, k, ret) { BKEY_PADDED(k) tmp; - unsigned bytes; + unsigned bytes, sectors, offset_into_extent; bkey_reassemble(&tmp.k, k); k = bkey_i_to_s_c(&tmp.k); + + offset_into_extent = iter->pos.offset - + bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + + ret = bch2_read_indirect_extent(&trans, + &offset_into_extent, &tmp.k); + if (ret) + break; + + sectors = min(sectors, k.k->size - offset_into_extent); + bch2_trans_unlock(&trans); - bytes = min_t(unsigned, 
bvec_iter.bi_size, - (k.k->p.offset - bvec_iter.bi_sector) << 9); + bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; swap(bvec_iter.bi_size, bytes); - ret = __bch2_read_extent(c, rbio, bvec_iter, k, failed, flags); + ret = __bch2_read_extent(c, rbio, bvec_iter, k, + offset_into_extent, failed, flags); switch (ret) { case READ_RETRY: goto retry; @@ -1412,7 +1396,6 @@ static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - struct bkey_i_extent *e; BKEY_PADDED(k) new; struct bch_extent_crc_unpacked new_crc; u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset; @@ -1431,34 +1414,30 @@ retry: if (IS_ERR_OR_NULL(k.k)) goto out; - if (!bkey_extent_is_data(k.k)) - goto out; - bkey_reassemble(&new.k, k); - e = bkey_i_to_extent(&new.k); + k = bkey_i_to_s_c(&new.k); - if (!bch2_extent_matches_ptr(c, extent_i_to_s_c(e), - rbio->pick.ptr, data_offset) || - bversion_cmp(e->k.version, rbio->version)) + if (bversion_cmp(k.k->version, rbio->version) || + !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) goto out; /* Extent was merged? */ - if (bkey_start_offset(&e->k) < data_offset || - e->k.p.offset > data_offset + rbio->pick.crc.uncompressed_size) + if (bkey_start_offset(k.k) < data_offset || + k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size) goto out; if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version, rbio->pick.crc, NULL, &new_crc, - bkey_start_offset(&e->k) - data_offset, e->k.size, + bkey_start_offset(k.k) - data_offset, k.k->size, rbio->pick.crc.csum_type)) { bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); goto out; } - if (!bch2_extent_narrow_crcs(e, new_crc)) + if (!bch2_bkey_narrow_crcs(&new.k, new_crc)) goto out; - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &e->k_i)); + bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &new.k)); ret = bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| @@ -1469,15 +1448,6 @@ out: bch2_trans_exit(&trans); } -static bool should_narrow_crcs(struct bkey_s_c k, - struct extent_ptr_decoded *pick, - unsigned flags) -{ - return !(flags & BCH_READ_IN_RETRY) && - bkey_extent_is_data(k.k) && - bch2_can_narrow_extent_crcs(bkey_s_c_to_extent(k), pick->crc); -} - /* Inner part that may run in process context */ static void __bch2_read_endio(struct work_struct *work) { @@ -1512,7 +1482,7 @@ static void __bch2_read_endio(struct work_struct *work) goto nodecode; /* Adjust crc to point to subset of data we want: */ - crc.offset += rbio->bvec_iter.bi_sector - rbio->pos.offset; + crc.offset += rbio->offset_into_extent; crc.live_size = bvec_iter_sectors(rbio->bvec_iter); if (crc.compression_type != BCH_COMPRESSION_NONE) { @@ -1621,8 +1591,47 @@ static void bch2_read_endio(struct bio *bio) bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); } +int __bch2_read_indirect_extent(struct btree_trans *trans, + unsigned *offset_into_extent, + struct bkey_i *orig_k) +{ + struct btree_iter *iter; + struct bkey_s_c k; + u64 reflink_offset; + int ret; + + reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k)->v.idx) + + *offset_into_extent; + + iter = __bch2_trans_get_iter(trans, BTREE_ID_REFLINK, + POS(0, reflink_offset), + BTREE_ITER_SLOTS, 1); + ret = PTR_ERR_OR_ZERO(iter); + if (ret) + return ret; + + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_reflink_v) { + __bcache_io_error(trans->c, + "pointer to nonexistent indirect 
extent"); + ret = -EIO; + goto err; + } + + *offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); + bkey_reassemble(orig_k, k); +err: + bch2_trans_iter_put(trans, iter); + return ret; +} + int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, struct bvec_iter iter, struct bkey_s_c k, + unsigned offset_into_extent, struct bch_io_failures *failed, unsigned flags) { struct extent_ptr_decoded pick; @@ -1655,7 +1664,6 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) goto hole; - iter.bi_sector = pos.offset; iter.bi_size = pick.crc.compressed_size << 9; goto noclone; } @@ -1664,13 +1672,13 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, bio_flagged(&orig->bio, BIO_CHAIN)) flags |= BCH_READ_MUST_CLONE; - narrow_crcs = should_narrow_crcs(k, &pick, flags); + narrow_crcs = !(flags & BCH_READ_IN_RETRY) && + bch2_can_narrow_extent_crcs(k, pick.crc); if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) flags |= BCH_READ_MUST_BOUNCE; - EBUG_ON(bkey_start_offset(k.k) > iter.bi_sector || - k.k->p.offset < bvec_iter_end_sector(iter)); + BUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); if (pick.crc.compression_type != BCH_COMPRESSION_NONE || (pick.crc.csum_type != BCH_CSUM_NONE && @@ -1691,19 +1699,30 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || bvec_iter_sectors(iter) != pick.crc.live_size || pick.crc.offset || - iter.bi_sector != pos.offset)); + offset_into_extent)); + pos.offset += offset_into_extent; pick.ptr.offset += pick.crc.offset + - (iter.bi_sector - pos.offset); + offset_into_extent; + offset_into_extent = 0; pick.crc.compressed_size = bvec_iter_sectors(iter); pick.crc.uncompressed_size = bvec_iter_sectors(iter); pick.crc.offset = 0; pick.crc.live_size = bvec_iter_sectors(iter); - pos.offset = iter.bi_sector; + offset_into_extent = 0; } if (rbio) { - /* promote already allocated bounce rbio */ + /* + * promote already allocated bounce rbio: + * promote needs to allocate a bio big enough for uncompressing + * data in the write path, but we're not going to use it all + * here: + */ + BUG_ON(rbio->bio.bi_iter.bi_size < + pick.crc.compressed_size << 9); + rbio->bio.bi_iter.bi_size = + pick.crc.compressed_size << 9; } else if (bounce) { unsigned sectors = pick.crc.compressed_size; @@ -1745,6 +1764,7 @@ noclone: else rbio->end_io = orig->bio.bi_end_io; rbio->bvec_iter = iter; + rbio->offset_into_extent= offset_into_extent; rbio->flags = flags; rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ); rbio->narrow_crcs = narrow_crcs; @@ -1863,45 +1883,67 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) rbio->c = c; rbio->start_time = local_clock(); - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, - POS(inode, rbio->bio.bi_iter.bi_sector), - BTREE_ITER_SLOTS, k, ret) { + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + POS(inode, rbio->bio.bi_iter.bi_sector), + BTREE_ITER_SLOTS); + + while (1) { BKEY_PADDED(k) tmp; - unsigned bytes; + unsigned bytes, sectors, offset_into_extent; + + bch2_btree_iter_set_pos(iter, + POS(inode, rbio->bio.bi_iter.bi_sector)); + + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; + + bkey_reassemble(&tmp.k, k); + k = bkey_i_to_s_c(&tmp.k); + + offset_into_extent = iter->pos.offset - + bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + + ret = 
bch2_read_indirect_extent(&trans, + &offset_into_extent, &tmp.k); + if (ret) + goto err; + + /* + * With indirect extents, the amount of data to read is the min + * of the original extent and the indirect extent: + */ + sectors = min(sectors, k.k->size - offset_into_extent); /* * Unlock the iterator while the btree node's lock is still in * cache, before doing the IO: */ - bkey_reassemble(&tmp.k, k); - k = bkey_i_to_s_c(&tmp.k); bch2_trans_unlock(&trans); - bytes = min_t(unsigned, rbio->bio.bi_iter.bi_size, - (k.k->p.offset - rbio->bio.bi_iter.bi_sector) << 9); + bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; swap(rbio->bio.bi_iter.bi_size, bytes); if (rbio->bio.bi_iter.bi_size == bytes) flags |= BCH_READ_LAST_FRAGMENT; - bch2_read_extent(c, rbio, k, flags); + bch2_read_extent(c, rbio, k, offset_into_extent, flags); if (flags & BCH_READ_LAST_FRAGMENT) - return; + break; swap(rbio->bio.bi_iter.bi_size, bytes); bio_advance(&rbio->bio, bytes); } - - /* - * If we get here, it better have been because there was an error - * reading a btree node - */ - BUG_ON(!ret); - bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret); - +out: bch2_trans_exit(&trans); + return; +err: + bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret); bch2_rbio_done(rbio); + goto out; } void bch2_fs_io_exit(struct bch_fs *c) diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index fe82c8b81ca5..80b72dbf1a0c 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -13,7 +13,6 @@ void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); -void bch2_bio_alloc_more_pages_pool(struct bch_fs *, struct bio *, size_t); void bch2_latency_acct(struct bch_dev *, u64, int); @@ -96,9 +95,17 @@ struct bch_devs_mask; struct cache_promote_op; struct extent_ptr_decoded; -int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, - struct bkey_s_c, struct bch_io_failures *, unsigned); -void bch2_read(struct bch_fs *, struct bch_read_bio *, u64); +int __bch2_read_indirect_extent(struct btree_trans *, unsigned *, + struct bkey_i *); + +static inline int bch2_read_indirect_extent(struct btree_trans *trans, + unsigned *offset_into_extent, + struct bkey_i *k) +{ + return k->k.type == KEY_TYPE_reflink_p + ? 
__bch2_read_indirect_extent(trans, offset_into_extent, k) + : 0; +} enum bch_read_flags { BCH_READ_RETRY_IF_STALE = 1 << 0, @@ -113,14 +120,22 @@ enum bch_read_flags { BCH_READ_IN_RETRY = 1 << 7, }; +int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, + struct bvec_iter, struct bkey_s_c, unsigned, + struct bch_io_failures *, unsigned); + static inline void bch2_read_extent(struct bch_fs *c, struct bch_read_bio *rbio, struct bkey_s_c k, + unsigned offset_into_extent, unsigned flags) { - __bch2_read_extent(c, rbio, rbio->bio.bi_iter, k, NULL, flags); + __bch2_read_extent(c, rbio, rbio->bio.bi_iter, k, + offset_into_extent, NULL, flags); } +void bch2_read(struct bch_fs *, struct bch_read_bio *, u64); + static inline struct bch_read_bio *rbio_init(struct bio *bio, struct bch_io_opts opts) { diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h index 04f6d9a7c9a2..2d397e5e5b9e 100644 --- a/fs/bcachefs/io_types.h +++ b/fs/bcachefs/io_types.h @@ -38,6 +38,8 @@ struct bch_read_bio { */ struct bvec_iter bvec_iter; + unsigned offset_into_extent; + u16 flags; union { struct { diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index af135e263a3f..387377dadab5 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -495,9 +495,8 @@ reread: sectors_read << 9)); bio_set_dev(bio, ca->disk_sb.bdev); bio->bi_iter.bi_sector = offset; - bio->bi_iter.bi_size = sectors_read << 9; bio_set_op_attrs(bio, REQ_OP_READ, 0); - bch2_bio_map(bio, buf->data); + bch2_bio_map(bio, buf->data, sectors_read << 9); ret = submit_bio_wait(bio); bio_put(bio); @@ -1087,12 +1086,11 @@ void bch2_journal_write(struct closure *cl) bio_reset(bio); bio_set_dev(bio, ca->disk_sb.bdev); bio->bi_iter.bi_sector = ptr->offset; - bio->bi_iter.bi_size = sectors << 9; bio->bi_end_io = journal_write_endio; bio->bi_private = ca; bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA); - bch2_bio_map(bio, jset); + bch2_bio_map(bio, jset, sectors << 9); trace_journal_write(bio); closure_bio_submit(bio, cl); diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index ad41f5e36a7c..dc3b03d6e627 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -34,7 +34,8 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, return 0; } -static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) +static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags, + enum btree_id btree_id) { struct btree_trans trans; struct btree_iter *iter; @@ -44,13 +45,12 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, - POS_MIN, BTREE_ITER_PREFETCH); + iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN, + BTREE_ITER_PREFETCH); while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k))) { - if (!bkey_extent_is_data(k.k) || - !bch2_extent_has_device(bkey_s_c_to_extent(k), dev_idx)) { + if (!bch2_bkey_has_device(k, dev_idx)) { ret = bch2_mark_bkey_replicas(c, k); if (ret) break; @@ -99,6 +99,12 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) return ret; } +static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) +{ + return __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_EXTENTS) ?: + __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_REFLINK); +} + static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) { struct btree_trans 
trans; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 4c82b345b350..0429341ef6fb 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -64,13 +64,14 @@ static int bch2_migrate_index_update(struct bch_write_op *op) bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + iter = bch2_trans_get_iter(&trans, m->btree_id, bkey_start_pos(&bch2_keylist_front(keys)->k), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); while (1) { struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); - struct bkey_i_extent *insert, *new = + struct bkey_i *insert; + struct bkey_i_extent *new = bkey_i_to_extent(bch2_keylist_front(keys)); BKEY_PADDED(k) _new, _insert; const union bch_extent_entry *entry; @@ -83,32 +84,29 @@ static int bch2_migrate_index_update(struct bch_write_op *op) break; if (bversion_cmp(k.k->version, new->k.version) || - !bkey_extent_is_data(k.k) || - !bch2_extent_matches_ptr(c, bkey_s_c_to_extent(k), - m->ptr, m->offset)) + !bch2_bkey_matches_ptr(c, k, m->ptr, m->offset)) goto nomatch; if (m->data_cmd == DATA_REWRITE && - !bch2_extent_has_device(bkey_s_c_to_extent(k), - m->data_opts.rewrite_dev)) + !bch2_bkey_has_device(k, m->data_opts.rewrite_dev)) goto nomatch; bkey_reassemble(&_insert.k, k); - insert = bkey_i_to_extent(&_insert.k); + insert = &_insert.k; bkey_copy(&_new.k, bch2_keylist_front(keys)); new = bkey_i_to_extent(&_new.k); - bch2_cut_front(iter->pos, &insert->k_i); + bch2_cut_front(iter->pos, insert); bch2_cut_back(new->k.p, &insert->k); bch2_cut_back(insert->k.p, &new->k); if (m->data_cmd == DATA_REWRITE) - bch2_bkey_drop_device(extent_i_to_s(insert).s, + bch2_bkey_drop_device(bkey_i_to_s(insert), m->data_opts.rewrite_dev); extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) { - if (bch2_extent_has_device(extent_i_to_s_c(insert), p.ptr.dev)) { + if (bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev)) { /* * raced with another move op? 
				 * raced with another move op? extent already
				 * has a pointer to the device we just wrote
@@ -124,18 +122,18 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
 		if (!did_work)
 			goto nomatch;
 
-		bch2_extent_narrow_crcs(insert,
+		bch2_bkey_narrow_crcs(insert,
 				(struct bch_extent_crc_unpacked) { 0 });
-		bch2_extent_normalize(c, extent_i_to_s(insert).s);
-		bch2_extent_mark_replicas_cached(c, extent_i_to_s(insert),
-						 op->opts.background_target,
-						 op->opts.data_replicas);
+		bch2_extent_normalize(c, bkey_i_to_s(insert));
+		bch2_bkey_mark_replicas_cached(c, bkey_i_to_s(insert),
+					       op->opts.background_target,
+					       op->opts.data_replicas);
 
 		/*
 		 * If we're not fully overwriting @k, and it's compressed, we
 		 * need a reservation for all the pointers in @insert
 		 */
-		nr = bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&insert->k_i)) -
+		nr = bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(insert)) -
 			m->nr_ptrs_reserved;
 
 		if (insert->k.size < k.k->size &&
@@ -151,7 +149,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
 		}
 
 		bch2_trans_update(&trans,
-				BTREE_INSERT_ENTRY(iter, &insert->k_i));
+				BTREE_INSERT_ENTRY(iter, insert));
 
 		ret = bch2_trans_commit(&trans, &op->res,
 				op_journal_seq(op),
@@ -216,10 +214,12 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
 			    struct bch_io_opts io_opts,
 			    enum data_cmd data_cmd,
 			    struct data_opts data_opts,
+			    enum btree_id btree_id,
 			    struct bkey_s_c k)
 {
 	int ret;
 
+	m->btree_id	= btree_id;
 	m->data_cmd	= data_cmd;
 	m->data_opts	= data_opts;
 	m->nr_ptrs_reserved = 0;
@@ -267,11 +267,12 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
 		break;
 	}
 	case DATA_REWRITE: {
+		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
 		const union bch_extent_entry *entry;
 		struct extent_ptr_decoded p;
 		unsigned compressed_sectors = 0;
 
-		extent_for_each_ptr_decode(bkey_s_c_to_extent(k), p, entry)
+		bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
 			if (!p.ptr.cached &&
 			    p.crc.compression_type != BCH_COMPRESSION_NONE &&
 			    bch2_dev_in_target(c, p.ptr.dev, data_opts.target))
@@ -301,12 +302,13 @@ static void move_free(struct closure *cl)
 {
 	struct moving_io *io = container_of(cl, struct moving_io, cl);
 	struct moving_context *ctxt = io->write.ctxt;
+	struct bvec_iter_all iter;
 	struct bio_vec *bv;
 	int i;
 
 	bch2_disk_reservation_put(io->write.op.c, &io->write.op.res);
 
-	bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i)
+	bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i, iter)
 		if (bv->bv_page)
 			__free_page(bv->bv_page);
 
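A pattern running through this whole series: extent-only iterators (extent_for_each_ptr_decode on a bkey_s_c_extent) are replaced by generic ones (bkey_for_each_ptr_decode via bch2_bkey_ptrs_c) that work on any key type carrying pointers, which is what lets reflink_v values reuse the extent machinery. The underlying shape is just a begin/end pair over a variable-length value. A toy standalone model of that shape, with invented types that are much simpler than the real bkey layout:

#include <stdio.h>

struct ptr_entry { unsigned dev, offset; };

struct ptrs {			/* cf. struct bkey_ptrs_c */
	const struct ptr_entry *start;
	const struct ptr_entry *end;
};

/* iterate [start, end) without caring which key type owns the value */
#define for_each_ptr(_p, _entry)				\
	for ((_entry) = (_p).start; (_entry) != (_p).end; (_entry)++)

int main(void)
{
	const struct ptr_entry val[] = { { 0, 16 }, { 1, 4096 } };
	struct ptrs p = { val, val + 2 };
	const struct ptr_entry *entry;

	for_each_ptr(p, entry)
		printf("dev %u offset %u\n", entry->dev, entry->offset);
	return 0;
}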
@@ -394,14 +396,16 @@ static int bch2_move_extent(struct bch_fs *c,
 			    struct moving_context *ctxt,
 			    struct write_point_specifier wp,
 			    struct bch_io_opts io_opts,
-			    struct bkey_s_c_extent e,
+			    enum btree_id btree_id,
+			    struct bkey_s_c k,
 			    enum data_cmd data_cmd,
 			    struct data_opts data_opts)
 {
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
 	struct moving_io *io;
 	const union bch_extent_entry *entry;
 	struct extent_ptr_decoded p;
-	unsigned sectors = e.k->size, pages;
+	unsigned sectors = k.k->size, pages;
 	int ret = -ENOMEM;
 
 	move_ctxt_wait_event(ctxt,
@@ -413,7 +417,7 @@ static int bch2_move_extent(struct bch_fs *c,
 			     SECTORS_IN_FLIGHT_PER_DEVICE);
 
 	/* write path might have to decompress data: */
-	extent_for_each_ptr_decode(e, p, entry)
+	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
 		sectors = max_t(unsigned, sectors, p.crc.uncompressed_size);
 
 	pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
@@ -423,37 +427,37 @@ static int bch2_move_extent(struct bch_fs *c,
 		goto err;
 
 	io->write.ctxt = ctxt;
-	io->read_sectors	= e.k->size;
-	io->write_sectors	= e.k->size;
+	io->read_sectors	= k.k->size;
+	io->write_sectors	= k.k->size;
 
 	bio_init(&io->write.op.wbio.bio, io->bi_inline_vecs, pages);
 	bio_set_prio(&io->write.op.wbio.bio,
 		     IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
-	io->write.op.wbio.bio.bi_iter.bi_size = sectors << 9;
 
-	bch2_bio_map(&io->write.op.wbio.bio, NULL);
-	if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, GFP_KERNEL))
+	if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9,
+				 GFP_KERNEL))
 		goto err_free;
 
-	io->rbio.opts = io_opts;
+	io->rbio.c		= c;
+	io->rbio.opts		= io_opts;
 	bio_init(&io->rbio.bio, io->bi_inline_vecs, pages);
 	io->rbio.bio.bi_vcnt = pages;
 	bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
 	io->rbio.bio.bi_iter.bi_size = sectors << 9;
 	bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0);
-	io->rbio.bio.bi_iter.bi_sector	= bkey_start_offset(e.k);
+	io->rbio.bio.bi_iter.bi_sector	= bkey_start_offset(k.k);
 	io->rbio.bio.bi_end_io		= move_read_endio;
 
 	ret = bch2_migrate_write_init(c, &io->write, wp, io_opts,
-				      data_cmd, data_opts, e.s_c);
+				      data_cmd, data_opts, btree_id, k);
 	if (ret)
 		goto err_free_pages;
 
 	atomic64_inc(&ctxt->stats->keys_moved);
-	atomic64_add(e.k->size, &ctxt->stats->sectors_moved);
+	atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
 
-	trace_move_extent(e.k);
+	trace_move_extent(k.k);
 
 	atomic_add(io->read_sectors, &ctxt->read_sectors);
 	list_add_tail(&io->list, &ctxt->reads);
@@ -463,7 +467,7 @@ static int bch2_move_extent(struct bch_fs *c,
 	 * ctxt when doing wakeup
 	 */
 	closure_get(&ctxt->cl);
-	bch2_read_extent(c, &io->rbio, e.s_c,
+	bch2_read_extent(c, &io->rbio, k, 0,
 			 BCH_READ_NODECODE|
 			 BCH_READ_LAST_FRAGMENT);
 	return 0;
@@ -472,20 +476,21 @@ err_free_pages:
err_free:
 	kfree(io);
err:
-	trace_move_alloc_fail(e.k);
+	trace_move_alloc_fail(k.k);
 	return ret;
 }
 
-int bch2_move_data(struct bch_fs *c,
-		   struct bch_ratelimit *rate,
-		   struct write_point_specifier wp,
-		   struct bpos start,
-		   struct bpos end,
-		   move_pred_fn pred, void *arg,
-		   struct bch_move_stats *stats)
+static int __bch2_move_data(struct bch_fs *c,
+			    struct moving_context *ctxt,
+			    struct bch_ratelimit *rate,
+			    struct write_point_specifier wp,
+			    struct bpos start,
+			    struct bpos end,
+			    move_pred_fn pred, void *arg,
+			    struct bch_move_stats *stats,
+			    enum btree_id btree_id)
 {
 	bool kthread = (current->flags & PF_KTHREAD) != 0;
-	struct moving_context ctxt = { .stats = stats };
 	struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
 	BKEY_PADDED(k) tmp;
 	struct btree_trans trans;
@@ -496,17 +501,13 @@ int bch2_move_data(struct bch_fs *c,
 	u64 delay, cur_inum = U64_MAX;
 	int ret = 0, ret2;
 
-	closure_init_stack(&ctxt.cl);
-	INIT_LIST_HEAD(&ctxt.reads);
-	init_waitqueue_head(&ctxt.wait);
-
 	bch2_trans_init(&trans, c, 0, 0);
 
 	stats->data_type = BCH_DATA_USER;
-	stats->btree_id	= BTREE_ID_EXTENTS;
+	stats->btree_id	= btree_id;
 	stats->pos	= POS_MIN;
 
-	iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, start,
+	iter = bch2_trans_get_iter(&trans, btree_id, start,
 				   BTREE_ITER_PREFETCH);
 
 	if (rate)
@@ -531,7 +532,7 @@ int bch2_move_data(struct bch_fs *c,
 
 			if (unlikely(freezing(current))) {
 				bch2_trans_unlock(&trans);
-				move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads));
+				move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
 				try_to_freeze();
 			}
 		} while (delay);
@@ -548,7 +549,7 @@ peek:
 		if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
 			break;
 
-		if (!bkey_extent_is_data(k.k))
+		if (!bkey_extent_is_direct_data(k.k))
 			goto next_nondata;
 
 		if (cur_inum != k.k->p.inode) {
@@ -582,13 +583,12 @@ peek:
 		k = bkey_i_to_s_c(&tmp.k);
 		bch2_trans_unlock(&trans);
 
-		ret2 = bch2_move_extent(c, &ctxt, wp, io_opts,
-					bkey_s_c_to_extent(k),
+		ret2 = bch2_move_extent(c, ctxt, wp, io_opts, btree_id, k,
 					data_cmd, data_opts);
 		if (ret2) {
 			if (ret2 == -ENOMEM) {
 				/* memory allocation failure, wait for some IO to finish */
-				bch2_move_ctxt_wait_for_io(&ctxt);
+				bch2_move_ctxt_wait_for_io(ctxt);
 				continue;
 			}
 
@@ -606,7 +606,32 @@ next_nondata:
 		bch2_trans_cond_resched(&trans);
 	}
out:
-	bch2_trans_exit(&trans);
+	ret = bch2_trans_exit(&trans) ?: ret;
+
+	return ret;
+}
+
+int bch2_move_data(struct bch_fs *c,
+		   struct bch_ratelimit *rate,
+		   struct write_point_specifier wp,
+		   struct bpos start,
+		   struct bpos end,
+		   move_pred_fn pred, void *arg,
+		   struct bch_move_stats *stats)
+{
+	struct moving_context ctxt = { .stats = stats };
+	int ret;
+
+	closure_init_stack(&ctxt.cl);
+	INIT_LIST_HEAD(&ctxt.reads);
+	init_waitqueue_head(&ctxt.wait);
+
+	stats->data_type = BCH_DATA_USER;
+
+	ret =   __bch2_move_data(c, &ctxt, rate, wp, start, end,
+				 pred, arg, stats, BTREE_ID_EXTENTS) ?:
+		__bch2_move_data(c, &ctxt, rate, wp, start, end,
+				 pred, arg, stats, BTREE_ID_REFLINK);
 
 	move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads));
 	closure_sync(&ctxt.cl);
diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h
index 71b3d2b2ddb6..0acd1720d4f8 100644
--- a/fs/bcachefs/move.h
+++ b/fs/bcachefs/move.h
@@ -25,6 +25,7 @@ struct data_opts {
 };
 
 struct migrate_write {
+	enum btree_id		btree_id;
 	enum data_cmd		data_cmd;
 	struct data_opts	data_opts;
 
@@ -44,7 +45,7 @@ int bch2_migrate_write_init(struct bch_fs *, struct migrate_write *,
 			    struct write_point_specifier,
 			    struct bch_io_opts,
 			    enum data_cmd, struct data_opts,
-			    struct bkey_s_c);
+			    enum btree_id, struct bkey_s_c);
 
 typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *,
 				      struct bkey_s_c,
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index b13af5662f22..710296044194 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -69,26 +69,19 @@ static bool __copygc_pred(struct bch_dev *ca,
 			  struct bkey_s_c k)
 {
 	copygc_heap *h = &ca->copygc_heap;
+	const struct bch_extent_ptr *ptr =
+		bch2_bkey_has_device(k, ca->dev_idx);
 
-	switch (k.k->type) {
-	case KEY_TYPE_extent: {
-		struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
-		const struct bch_extent_ptr *ptr =
-			bch2_extent_has_device(e, ca->dev_idx);
+	if (ptr) {
+		struct copygc_heap_entry search = { .offset = ptr->offset };
 
-		if (ptr) {
-			struct copygc_heap_entry search = { .offset = ptr->offset };
+		ssize_t i = eytzinger0_find_le(h->data, h->used,
+					       sizeof(h->data[0]),
+					       bucket_offset_cmp, &search);
 
-			ssize_t i = eytzinger0_find_le(h->data, h->used,
-						       sizeof(h->data[0]),
-						       bucket_offset_cmp, &search);
-
-			return (i >= 0 &&
-				ptr->offset < h->data[i].offset + ca->mi.bucket_size &&
-				ptr->gen == h->data[i].gen);
-		}
-		break;
-	}
+		return (i >= 0 &&
+			ptr->offset < h->data[i].offset + ca->mi.bucket_size &&
+			ptr->gen == h->data[i].gen);
 	}
 
 	return false;
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index c6ec9f7effe5..97a782f44f6e 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -258,6 +258,11 @@ enum opt_type {
 	  OPT_BOOL(),							\
 	  NO_SB_OPT,			false,				\
 	  NULL,		"Don\'t start filesystem, only open devices")	\
+	x(reconstruct_alloc,		u8,				\
+	  OPT_MOUNT,							\
+	  OPT_BOOL(),							\
+	  NO_SB_OPT,			false,				\
+	  NULL,		"Reconstruct alloc btree")			\
 	x(version_upgrade,		u8,				\
 	  OPT_MOUNT,							\
 	  OPT_BOOL(),							\
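The new reconstruct_alloc mount option slots into opts.h's x-macro table, where each option is declared once and the table is expanded several times to generate the enum, the parsing glue, and the help text. A reduced standalone illustration of the technique (invented table, only the names above are real; not the actual bcachefs macro, which carries more columns per entry):

#include <stdio.h>

#define BCH_OPTS()		\
	x(degraded)		\
	x(reconstruct_alloc)	\
	x(version_upgrade)

/* expansion 1: an enum of option IDs */
enum opt_id {
#define x(_name) Opt_##_name,
	BCH_OPTS()
#undef x
	Opt_nr
};

/* expansion 2: a matching name table, guaranteed to stay in sync */
static const char * const opt_names[] = {
#define x(_name) #_name,
	BCH_OPTS()
#undef x
};

int main(void)
{
	for (int i = 0; i < Opt_nr; i++)
		printf("%d: %s\n", i, opt_names[i]);
	return 0;
}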
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index 6bdd68177ac9..84b3fb6eb101 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -38,20 +38,15 @@ void bch2_rebalance_add_key(struct bch_fs *c, struct bkey_s_c k,
 			    struct bch_io_opts *io_opts)
 {
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
 	const union bch_extent_entry *entry;
 	struct extent_ptr_decoded p;
-	struct bkey_s_c_extent e;
-
-	if (!bkey_extent_is_data(k.k))
-		return;
 
 	if (!io_opts->background_target &&
 	    !io_opts->background_compression)
 		return;
 
-	e = bkey_s_c_to_extent(k);
-
-	extent_for_each_ptr_decode(e, p, entry)
+	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
 		if (rebalance_ptr_pred(c, p, io_opts)) {
 			struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
 
@@ -74,30 +69,26 @@ static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg,
 				    struct bch_io_opts *io_opts,
 				    struct data_opts *data_opts)
 {
-	switch (k.k->type) {
-	case KEY_TYPE_extent: {
-		struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
-		const union bch_extent_entry *entry;
-		struct extent_ptr_decoded p;
-
-		/* Make sure we have room to add a new pointer: */
-		if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX >
-		    BKEY_EXTENT_VAL_U64s_MAX)
-			return DATA_SKIP;
-
-		extent_for_each_ptr_decode(e, p, entry)
-			if (rebalance_ptr_pred(c, p, io_opts))
-				goto found;
-
-		return DATA_SKIP;
-found:
-		data_opts->target		= io_opts->background_target;
-		data_opts->btree_insert_flags	= 0;
-		return DATA_ADD_REPLICAS;
-	}
-	default:
-		return DATA_SKIP;
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+	unsigned nr_replicas = 0;
+
+	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+		nr_replicas += !p.ptr.cached;
+
+		if (rebalance_ptr_pred(c, p, io_opts))
+			goto found;
 	}
+
+	if (nr_replicas < io_opts->data_replicas)
+		goto found;
+
+	return DATA_SKIP;
found:
+	data_opts->target		= io_opts->background_target;
+	data_opts->btree_insert_flags	= 0;
+	return DATA_ADD_REPLICAS;
 }
 
 struct rebalance_work {
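The rewritten rebalance_pred() now does two jobs in one pass over the pointers: it still selects extents where some pointer wants the background target or compression, but it also counts non-cached replicas and selects under-replicated extents. A simplified standalone model of that control flow (invented struct; the real predicate works on decoded extent pointers and a per-inode options struct):

#include <stdbool.h>
#include <stdio.h>

struct ptr { bool cached; bool wants_move; };

static bool rebalance_pred(const struct ptr *ptrs, unsigned nr,
			   unsigned data_replicas)
{
	unsigned nr_replicas = 0;

	for (unsigned i = 0; i < nr; i++) {
		nr_replicas += !ptrs[i].cached;	/* cached ptrs don't count */
		if (ptrs[i].wants_move)
			return true;		/* cf. goto found */
	}

	/* nothing to move, but the extent is under-replicated */
	return nr_replicas < data_replicas;
}

int main(void)
{
	struct ptr ptrs[] = { { .cached = false }, { .cached = true } };

	/* one real replica, two requested: selected */
	printf("%d\n", rebalance_pred(ptrs, 2, 2));
	return 0;
}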
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index e0df2c0a4fdf..98d9a1432e50 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -24,6 +24,42 @@
 
 #define QSTR(n) { { { .len = strlen(n) } }, .name = n }
 
+/* iterate over keys read from the journal: */
+
+struct journal_iter bch2_journal_iter_init(struct journal_keys *keys,
+					   enum btree_id id)
+{
+	return (struct journal_iter) {
+		.keys		= keys,
+		.k		= keys->d,
+		.btree_id	= id,
+	};
+}
+
+struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
+{
+	while (1) {
+		if (iter->k == iter->keys->d + iter->keys->nr)
+			return bkey_s_c_null;
+
+		if (iter->k->btree_id == iter->btree_id)
+			return bkey_i_to_s_c(iter->k->k);
+
+		iter->k++;
+	}
+
+	return bkey_s_c_null;
+}
+
+struct bkey_s_c bch2_journal_iter_next(struct journal_iter *iter)
+{
+	if (iter->k == iter->keys->d + iter->keys->nr)
+		return bkey_s_c_null;
+
+	iter->k++;
+	return bch2_journal_iter_peek(iter);
+}
+
 /* sort and dedup all keys in the journal: */
 
 static void journal_entries_free(struct list_head *list)
@@ -200,7 +236,8 @@ static void replay_now_at(struct journal *j, u64 seq)
 	bch2_journal_pin_put(j, j->replay_journal_seq++);
 }
 
-static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k)
+static int bch2_extent_replay_key(struct bch_fs *c, enum btree_id btree_id,
+				  struct bkey_i *k)
 {
 	struct btree_trans trans;
 	struct btree_iter *iter, *split_iter;
@@ -211,14 +248,21 @@ static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k)
 	struct disk_reservation disk_res =
 		bch2_disk_reservation_init(c, 0);
 	struct bkey_i *split;
-	bool split_compressed = false;
+	struct bpos atomic_end;
+	/*
+	 * Some extents aren't equivalent - w.r.t. what the triggers do
+	 * - if they're split:
+	 */
+	bool remark_if_split = bch2_extent_is_compressed(bkey_i_to_s_c(k)) ||
+		k->k.type == KEY_TYPE_reflink_p;
+	bool remark = false;
 	int ret;
 
 	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
retry:
 	bch2_trans_begin(&trans);
 
-	iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+	iter = bch2_trans_get_iter(&trans, btree_id,
 				   bkey_start_pos(&k->k),
 				   BTREE_ITER_INTENT);
 
@@ -237,29 +281,33 @@ retry:
 		if (ret)
 			goto err;
 
-		if (!split_compressed &&
-		    bch2_extent_is_compressed(bkey_i_to_s_c(k)) &&
-		    !bch2_extent_is_atomic(k, split_iter)) {
+		ret = bch2_extent_atomic_end(split_iter, k, &atomic_end);
+		if (ret)
+			goto err;
+
+		if (!remark &&
+		    remark_if_split &&
+		    bkey_cmp(atomic_end, k->k.p) < 0) {
 			ret = bch2_disk_reservation_add(c, &disk_res,
 					k->k.size *
 					bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(k)),
 					BCH_DISK_RESERVATION_NOFAIL);
 			BUG_ON(ret);
 
-			split_compressed = true;
+			remark = true;
 		}
 
 		bkey_copy(split, k);
 		bch2_cut_front(split_iter->pos, split);
-		bch2_extent_trim_atomic(split, split_iter);
+		bch2_cut_back(atomic_end, &split->k);
 
 		bch2_trans_update(&trans, BTREE_INSERT_ENTRY(split_iter, split));
 		bch2_btree_iter_set_pos(iter, split->k.p);
 	} while (bkey_cmp(iter->pos, k->k.p) < 0);
 
-	if (split_compressed) {
+	if (remark) {
 		ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(k),
-					  -((s64) k->k.size),
+					  0, -((s64) k->k.size),
 					  BCH_BUCKET_MARK_OVERWRITE) ?:
 		      bch2_trans_commit(&trans, &disk_res, NULL,
 					BTREE_INSERT_ATOMIC|
@@ -299,22 +347,17 @@ static int bch2_journal_replay(struct bch_fs *c,
 	for_each_journal_key(keys, i) {
 		replay_now_at(j, keys.journal_seq_base + i->journal_seq);
 
-		switch (i->btree_id) {
-		case BTREE_ID_ALLOC:
+		if (i->btree_id == BTREE_ID_ALLOC)
 			ret = bch2_alloc_replay_key(c, i->k);
-			break;
-		case BTREE_ID_EXTENTS:
-			ret = bch2_extent_replay_key(c, i->k);
-			break;
-		default:
+		else if (btree_node_type_is_extents(i->btree_id))
+			ret = bch2_extent_replay_key(c, i->btree_id, i->k);
+		else
 			ret = bch2_btree_insert(c, i->btree_id, i->k,
 						NULL, NULL,
 						BTREE_INSERT_NOFAIL|
 						BTREE_INSERT_LAZY_RW|
 						BTREE_INSERT_JOURNAL_REPLAY|
 						BTREE_INSERT_NOMARK);
-			break;
-		}
 
 		if (ret) {
 			bch_err(c, "journal replay: error %d while replaying key",
@@ -615,7 +658,7 @@ static int read_btree_roots(struct bch_fs *c)
 			continue;
 
 		if (i == BTREE_ID_ALLOC &&
-		    test_reconstruct_alloc(c)) {
+		    c->opts.reconstruct_alloc) {
 			c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
 			continue;
 		}
@@ -892,7 +935,9 @@ out:
 	ret = 0;
err:
fsck_err:
+	set_bit(BCH_FS_FSCK_DONE, &c->flags);
 	bch2_flush_fsck_errs(c);
+
 	journal_keys_free(&journal_keys);
 	journal_entries_free(&journal_entries);
 	kfree(clean);
diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h
index a69260d6165a..479ea46f8dcb 100644
--- a/fs/bcachefs/recovery.h
+++ b/fs/bcachefs/recovery.h
@@ -18,6 +18,17 @@ struct journal_keys {
 #define for_each_journal_key(keys, i)					\
 	for (i = (keys).d; i < (keys).d + (keys).nr; (i)++)
 
+struct journal_iter {
+	struct journal_keys	*keys;
+	struct journal_key	*k;
+	enum btree_id		btree_id;
+};
+
+struct journal_iter bch2_journal_iter_init(struct journal_keys *,
+					   enum btree_id);
+struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *);
+struct bkey_s_c bch2_journal_iter_next(struct journal_iter *);
+
 int bch2_fs_recovery(struct bch_fs *);
 int bch2_fs_initialize(struct bch_fs *);
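The journal iterator added above is a plain filtered cursor over the sorted, deduplicated key array: peek() skips entries whose btree_id doesn't match, next() advances and re-peeks. The same shape, modelled standalone with a simplified element type (invented struct names, not the real journal_key layout):

#include <stdio.h>

struct jkey { int btree_id; int key; };

struct jiter {
	const struct jkey *k, *end;
	int btree_id;
};

/* skip forward to the next key belonging to our btree, or NULL */
static const struct jkey *jiter_peek(struct jiter *iter)
{
	while (iter->k != iter->end) {
		if (iter->k->btree_id == iter->btree_id)
			return iter->k;
		iter->k++;
	}
	return NULL;
}

static const struct jkey *jiter_next(struct jiter *iter)
{
	if (iter->k == iter->end)
		return NULL;
	iter->k++;
	return jiter_peek(iter);
}

int main(void)
{
	const struct jkey keys[] = { {0, 1}, {1, 2}, {0, 3} };
	struct jiter iter = { keys, keys + 3, 0 };

	for (const struct jkey *k = jiter_peek(&iter); k; k = jiter_next(&iter))
		printf("key %d\n", k->key);	/* prints 1 then 3 */
	return 0;
}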
"bcachefs.h" +#include "btree_update.h" +#include "extents.h" +#include "fs.h" +#include "fs-io.h" +#include "reflink.h" + +#include <linux/sched/signal.h> + +/* reflink pointers */ + +const char *bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + + if (bkey_val_bytes(p.k) != sizeof(*p.v)) + return "incorrect value size"; + + return NULL; +} + +void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + + pr_buf(out, "idx %llu", le64_to_cpu(p.v->idx)); +} + +enum merge_result bch2_reflink_p_merge(struct bch_fs *c, + struct bkey_s _l, struct bkey_s _r) +{ + struct bkey_s_reflink_p l = bkey_s_to_reflink_p(_l); + struct bkey_s_reflink_p r = bkey_s_to_reflink_p(_r); + + if (le64_to_cpu(l.v->idx) + l.k->size != le64_to_cpu(r.v->idx)) + return BCH_MERGE_NOMERGE; + + if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { + bch2_key_resize(l.k, KEY_SIZE_MAX); + __bch2_cut_front(l.k->p, _r); + return BCH_MERGE_PARTIAL; + } + + bch2_key_resize(l.k, l.k->size + r.k->size); + + return BCH_MERGE_MERGE; +} + +/* indirect extents */ + +const char *bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); + + if (bkey_val_bytes(r.k) < sizeof(*r.v)) + return "incorrect value size"; + + return bch2_bkey_ptrs_invalid(c, k); +} + +void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); + + pr_buf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount)); + + bch2_bkey_ptrs_to_text(out, c, k); +} + +/* + * bch2_remap_range() depends on bch2_extent_update(), which depends on various + * things tied to the linux vfs for inode updates, for now: + */ +#ifndef NO_BCACHEFS_FS + +static int bch2_make_extent_indirect(struct btree_trans *trans, + struct btree_iter *extent_iter, + struct bkey_i_extent *e) +{ + struct bch_fs *c = trans->c; + struct btree_iter *reflink_iter; + struct bkey_s_c k; + struct bkey_i_reflink_v *r_v; + struct bkey_i_reflink_p *r_p; + int ret; + + for_each_btree_key(trans, reflink_iter, BTREE_ID_REFLINK, + POS(0, c->reflink_hint), + BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) { + if (reflink_iter->pos.inode) { + bch2_btree_iter_set_pos(reflink_iter, POS_MIN); + continue; + } + + if (bkey_deleted(k.k) && e->k.size <= k.k->size) + break; + } + + if (ret) + goto err; + + /* rewind iter to start of hole, if necessary: */ + bch2_btree_iter_set_pos(reflink_iter, bkey_start_pos(k.k)); + + r_v = bch2_trans_kmalloc(trans, sizeof(*r_v) + bkey_val_bytes(&e->k)); + ret = PTR_ERR_OR_ZERO(r_v); + if (ret) + goto err; + + bkey_reflink_v_init(&r_v->k_i); + r_v->k.p = reflink_iter->pos; + bch2_key_resize(&r_v->k, e->k.size); + r_v->k.version = e->k.version; + + set_bkey_val_u64s(&r_v->k, bkey_val_u64s(&r_v->k) + + bkey_val_u64s(&e->k)); + r_v->v.refcount = 0; + memcpy(r_v->v.start, e->v.start, bkey_val_bytes(&e->k)); + + bch2_trans_update(trans, BTREE_INSERT_ENTRY(reflink_iter, &r_v->k_i)); + + r_p = bch2_trans_kmalloc(trans, sizeof(*r_p)); + if (IS_ERR(r_p)) + return PTR_ERR(r_p); + + e->k.type = KEY_TYPE_reflink_p; + r_p = bkey_i_to_reflink_p(&e->k_i); + set_bkey_val_bytes(&r_p->k, sizeof(r_p->v)); + r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); + + bch2_trans_update(trans, BTREE_INSERT_ENTRY(extent_iter, &r_p->k_i)); +err: + if (!IS_ERR(reflink_iter)) { + c->reflink_hint = 
+		c->reflink_hint = reflink_iter->pos.offset;
+		bch2_trans_iter_put(trans, reflink_iter);
+	}
+
+	return ret;
+}
+
+static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end)
+{
+	struct bkey_s_c k = bch2_btree_iter_peek(iter);
+
+	while (1) {
+		if (bkey_err(k))
+			return k;
+
+		if (bkey_cmp(iter->pos, end) >= 0)
+			return bkey_s_c_null;
+
+		if (k.k->type == KEY_TYPE_extent ||
+		    k.k->type == KEY_TYPE_reflink_p)
+			return k;
+
+		k = bch2_btree_iter_next(iter);
+	}
+}
+
+s64 bch2_remap_range(struct bch_fs *c,
+		     struct bch_inode_info *dst_inode,
+		     struct bpos dst_start, struct bpos src_start,
+		     u64 remap_sectors, u64 new_i_size)
+{
+	struct btree_trans trans;
+	struct btree_iter *dst_iter, *src_iter;
+	struct bkey_s_c src_k;
+	BKEY_PADDED(k) new_dst, new_src;
+	struct bpos dst_end = dst_start, src_end = src_start;
+	struct bpos dst_want, src_want;
+	u64 src_done, dst_done;
+	int ret = 0;
+
+	if (!(c->sb.features & (1ULL << BCH_FEATURE_REFLINK))) {
+		mutex_lock(&c->sb_lock);
+		if (!(c->sb.features & (1ULL << BCH_FEATURE_REFLINK))) {
+			c->disk_sb.sb->features[0] |=
+				cpu_to_le64(1ULL << BCH_FEATURE_REFLINK);
+
+			bch2_write_super(c);
+		}
+		mutex_unlock(&c->sb_lock);
+	}
+
+	dst_end.offset += remap_sectors;
+	src_end.offset += remap_sectors;
+
+	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096);
+
+	src_iter = __bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, src_start,
+					 BTREE_ITER_INTENT, 1);
+	dst_iter = __bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, dst_start,
+					 BTREE_ITER_INTENT, 2);
+
+	while (1) {
+		bch2_trans_begin_updates(&trans);
+		trans.mem_top = 0;
+
+		if (fatal_signal_pending(current)) {
+			ret = -EINTR;
+			goto err;
+		}
+
+		src_k = get_next_src(src_iter, src_end);
+		ret = bkey_err(src_k);
+		if (ret)
+			goto btree_err;
+
+		src_done = bpos_min(src_iter->pos, src_end).offset -
+			src_start.offset;
+		dst_want = POS(dst_start.inode, dst_start.offset + src_done);
+
+		if (bkey_cmp(dst_iter->pos, dst_want) < 0) {
+			ret = bch2_fpunch_at(&trans, dst_iter, dst_want,
+					     dst_inode, new_i_size);
+			if (ret)
+				goto btree_err;
+			continue;
+		}
+
+		BUG_ON(bkey_cmp(dst_iter->pos, dst_want));
+
+		if (!bkey_cmp(dst_iter->pos, dst_end))
+			break;
+
+		if (src_k.k->type == KEY_TYPE_extent) {
+			bkey_reassemble(&new_src.k, src_k);
+			src_k = bkey_i_to_s_c(&new_src.k);
+
+			bch2_cut_front(src_iter->pos, &new_src.k);
+			bch2_cut_back(src_end, &new_src.k.k);
+
+			ret = bch2_make_extent_indirect(&trans, src_iter,
+						bkey_i_to_extent(&new_src.k));
+			if (ret)
+				goto btree_err;
+
+			BUG_ON(src_k.k->type != KEY_TYPE_reflink_p);
+		}
+
+		if (src_k.k->type == KEY_TYPE_reflink_p) {
+			struct bkey_s_c_reflink_p src_p =
+				bkey_s_c_to_reflink_p(src_k);
+			struct bkey_i_reflink_p *dst_p =
+				bkey_reflink_p_init(&new_dst.k);
+
+			u64 offset = le64_to_cpu(src_p.v->idx) +
+				(src_iter->pos.offset -
+				 bkey_start_offset(src_k.k));
+
+			dst_p->v.idx = cpu_to_le64(offset);
+		} else {
+			BUG();
+		}
+
+		new_dst.k.k.p = dst_iter->pos;
+		bch2_key_resize(&new_dst.k.k,
+				min(src_k.k->p.offset - src_iter->pos.offset,
+				    dst_end.offset - dst_iter->pos.offset));
+
+		ret = bch2_extent_update(&trans, dst_inode, NULL, NULL,
+					 dst_iter, &new_dst.k,
+					 new_i_size, false, true, NULL);
+		if (ret)
+			goto btree_err;
+
+		dst_done = dst_iter->pos.offset - dst_start.offset;
+		src_want = POS(src_start.inode, src_start.offset + dst_done);
+		bch2_btree_iter_set_pos(src_iter, src_want);
+btree_err:
+		if (ret == -EINTR)
+			ret = 0;
+		if (ret)
+			goto err;
+	}
+
+	BUG_ON(bkey_cmp(dst_iter->pos, dst_end));
+err:
+	BUG_ON(bkey_cmp(dst_iter->pos, dst_end) > 0);
+
+	dst_done = dst_iter->pos.offset - dst_start.offset;
+	new_i_size = min(dst_iter->pos.offset << 9, new_i_size);
+
+	ret = bch2_trans_exit(&trans) ?: ret;
+
+	mutex_lock(&dst_inode->ei_update_lock);
+	if (dst_inode->v.i_size < new_i_size) {
+		i_size_write(&dst_inode->v, new_i_size);
+		ret = bch2_write_inode_size(c, dst_inode, new_i_size,
+					    ATTR_MTIME|ATTR_CTIME);
+	}
+	mutex_unlock(&dst_inode->ei_update_lock);
+
+	return dst_done ?: ret;
+}
+
+#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h
new file mode 100644
index 000000000000..327618c36d33
--- /dev/null
+++ b/fs/bcachefs/reflink.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REFLINK_H
+#define _BCACHEFS_REFLINK_H
+
+const char *bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *,
+			    struct bkey_s_c);
+enum merge_result bch2_reflink_p_merge(struct bch_fs *,
+				       struct bkey_s, struct bkey_s);
+
+#define bch2_bkey_ops_reflink_p (struct bkey_ops) {		\
+	.key_invalid	= bch2_reflink_p_invalid,		\
+	.val_to_text	= bch2_reflink_p_to_text,		\
+	.key_merge	= bch2_reflink_p_merge,			\
+}
+
+const char *bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *,
+			    struct bkey_s_c);
+
+
+#define bch2_bkey_ops_reflink_v (struct bkey_ops) {		\
+	.key_invalid	= bch2_reflink_v_invalid,		\
+	.val_to_text	= bch2_reflink_v_to_text,		\
+}
+
+#ifndef NO_BCACHEFS_FS
+s64 bch2_remap_range(struct bch_fs *, struct bch_inode_info *,
+		     struct bpos, struct bpos, u64, u64);
+#endif /* NO_BCACHEFS_FS */
+
+#endif /* _BCACHEFS_REFLINK_H */
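The core of the new reflink code is the indirection: a KEY_TYPE_reflink_p value carries only idx, the starting offset of the shared data in the reflink btree, and reading at some offset inside the pointer adds the same offset to idx. That is the arithmetic bch2_remap_range() performs above when it computes `le64_to_cpu(src_p.v->idx) + (src_iter->pos.offset - bkey_start_offset(src_k.k))`. A toy standalone model of that resolution, with invented field names and plain integers in place of bkeys:

#include <stdio.h>
#include <stdint.h>

struct reflink_p {
	uint64_t pos;	/* end position of the key, in sectors */
	uint64_t size;	/* key size, in sectors */
	uint64_t idx;	/* start of the data in the reflink btree */
};

/* offset of the first sector covered by the key */
static uint64_t start_offset(const struct reflink_p *p)
{
	return p->pos - p->size;
}

/* where in the reflink btree to read for logical sector 'offset' */
static uint64_t resolve(const struct reflink_p *p, uint64_t offset)
{
	return p->idx + (offset - start_offset(p));
}

int main(void)
{
	struct reflink_p p = { .pos = 128, .size = 64, .idx = 1000 };

	/* logical sector 96 is 32 sectors into the key: prints 1032 */
	printf("%llu\n", (unsigned long long)resolve(&p, 96));
	return 0;
}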
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index 4818453c015a..bb9da2bb5a92 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -16,11 +16,16 @@ static inline int u8_cmp(u8 l, u8 r)
 	return cmp_int(l, r);
 }
 
-static void verify_replicas_entry_sorted(struct bch_replicas_entry *e)
+static void verify_replicas_entry(struct bch_replicas_entry *e)
 {
-#ifdef CONFIG_BCACHES_DEBUG
+#ifdef CONFIG_BCACHEFS_DEBUG
 	unsigned i;
 
+	BUG_ON(e->data_type >= BCH_DATA_NR);
+	BUG_ON(!e->nr_devs);
+	BUG_ON(e->nr_required > 1 &&
+	       e->nr_required >= e->nr_devs);
+
 	for (i = 0; i + 1 < e->nr_devs; i++)
 		BUG_ON(e->devs[i] >= e->devs[i + 1]);
 #endif
@@ -80,7 +85,7 @@ static void extent_to_replicas(struct bkey_s_c k,
 			continue;
 
 		if (p.ec_nr) {
-			r->nr_devs = 0;
+			r->nr_required = 0;
 			break;
 		}
 
@@ -113,6 +118,7 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e,
 		extent_to_replicas(k, e);
 		break;
 	case KEY_TYPE_extent:
+	case KEY_TYPE_reflink_v:
 		e->data_type = BCH_DATA_USER;
 		extent_to_replicas(k, e);
 		break;
@@ -157,7 +163,7 @@ cpu_replicas_add_entry(struct bch_replicas_cpu *old,
 	};
 
 	BUG_ON(!new_entry->data_type);
-	verify_replicas_entry_sorted(new_entry);
+	verify_replicas_entry(new_entry);
 
 	new.entries = kcalloc(new.nr, new.entry_size, GFP_NOIO);
 	if (!new.entries)
@@ -184,7 +190,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
 	if (unlikely(entry_size > r->entry_size))
 		return -1;
 
-	verify_replicas_entry_sorted(search);
+	verify_replicas_entry(search);
 
 #define entry_cmp(_l, _r, size)	memcmp(_l, _r, entry_size)
 	idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
@@ -215,7 +221,7 @@ static bool bch2_replicas_marked_locked(struct bch_fs *c,
 	if (!search->nr_devs)
 		return true;
 
-	verify_replicas_entry_sorted(search);
+	verify_replicas_entry(search);
 
 	return __replicas_has_entry(&c->replicas, search) &&
 		(!check_gc_replicas ||
@@ -359,6 +365,8 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c,
 	struct bch_replicas_cpu new_r, new_gc;
 	int ret = -ENOMEM;
 
+	verify_replicas_entry(new_entry);
+
 	memset(&new_r, 0, sizeof(new_r));
 	memset(&new_gc, 0, sizeof(new_gc));
 
@@ -874,9 +882,8 @@ static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_fi
 			goto err;
 
 		err = "invalid replicas entry: bad nr_required";
-		if (!e->nr_required ||
-		    (e->nr_required > 1 &&
-		     e->nr_required >= e->nr_devs))
+		if (e->nr_required > 1 &&
+		    e->nr_required >= e->nr_devs)
 			goto err;
 
 		err = "invalid replicas entry: invalid device";
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index 1779f755b21d..091bf7a89577 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -72,7 +72,7 @@ static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx,
 		ctx->crc32c = crc32c(~0, &info->crc_key, sizeof(info->crc_key));
 		break;
 	case BCH_STR_HASH_CRC64:
-		ctx->crc64 = bch2_crc64_update(~0, &info->crc_key, sizeof(info->crc_key));
+		ctx->crc64 = crc64_be(~0, &info->crc_key, sizeof(info->crc_key));
 		break;
 	case BCH_STR_HASH_SIPHASH:
 		SipHash24_Init(&ctx->siphash, &info->siphash_key);
@@ -91,7 +91,7 @@ static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx,
 		ctx->crc32c = crc32c(ctx->crc32c, data, len);
 		break;
 	case BCH_STR_HASH_CRC64:
-		ctx->crc64 = bch2_crc64_update(ctx->crc64, data, len);
+		ctx->crc64 = crc64_be(ctx->crc64, data, len);
 		break;
 	case BCH_STR_HASH_SIPHASH:
 		SipHash24_Update(&ctx->siphash, data, len);
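str_hash.h drops a local CRC64 helper in favor of the kernel's crc64_be(); the surrounding structure, an enum-dispatched incremental hash context, is unchanged. That structure is worth seeing on its own. A standalone sketch of the same shape, with two toy hash functions invented here to stand in for crc32c/crc64_be/siphash:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

enum str_hash_type { HASH_FNV1A, HASH_DJB2 };

struct str_hash_ctx {
	enum str_hash_type type;
	uint64_t state;
};

static void str_hash_init(struct str_hash_ctx *ctx, enum str_hash_type type)
{
	ctx->type  = type;
	ctx->state = type == HASH_FNV1A ? 0xcbf29ce484222325ULL : 5381;
}

/* feed more bytes into the running hash, dispatching on the type */
static void str_hash_update(struct str_hash_ctx *ctx,
			    const void *data, size_t len)
{
	const unsigned char *p = data;

	for (size_t i = 0; i < len; i++)
		switch (ctx->type) {
		case HASH_FNV1A:
			ctx->state = (ctx->state ^ p[i]) * 0x100000001b3ULL;
			break;
		case HASH_DJB2:
			ctx->state = ctx->state * 33 + p[i];
			break;
		}
}

int main(void)
{
	struct str_hash_ctx ctx;

	str_hash_init(&ctx, HASH_FNV1A);
	str_hash_update(&ctx, "dirent", strlen("dirent"));
	printf("%016llx\n", (unsigned long long)ctx.state);
	return 0;
}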
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index 5e1ae7e425ff..3043def884ab 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -470,9 +470,8 @@ reread:
 	bio_reset(sb->bio);
 	bio_set_dev(sb->bio, sb->bdev);
 	sb->bio->bi_iter.bi_sector = offset;
-	sb->bio->bi_iter.bi_size = PAGE_SIZE << sb->page_order;
 	bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
-	bch2_bio_map(sb->bio, sb->sb);
+	bch2_bio_map(sb->bio, sb->sb, PAGE_SIZE << sb->page_order);
 
 	if (submit_bio_wait(sb->bio))
 		return "IO error";
@@ -574,13 +573,12 @@ int bch2_read_super(const char *path, struct bch_opts *opts,
 	bio_reset(sb->bio);
 	bio_set_dev(sb->bio, sb->bdev);
 	sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR;
-	sb->bio->bi_iter.bi_size = sizeof(struct bch_sb_layout);
 	bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
 	/*
 	 * use sb buffer to read layout, since sb buffer is page aligned but
 	 * layout won't be:
 	 */
-	bch2_bio_map(sb->bio, sb->sb);
+	bch2_bio_map(sb->bio, sb->sb, sizeof(struct bch_sb_layout));
 
 	err = "IO error";
 	if (submit_bio_wait(sb->bio))
@@ -650,11 +648,10 @@ static void read_back_super(struct bch_fs *c, struct bch_dev *ca)
 	bio_reset(bio);
 	bio_set_dev(bio, ca->disk_sb.bdev);
 	bio->bi_iter.bi_sector	= le64_to_cpu(sb->layout.sb_offset[0]);
-	bio->bi_iter.bi_size	= PAGE_SIZE;
 	bio->bi_end_io		= write_super_endio;
 	bio->bi_private		= ca;
 	bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC|REQ_META);
-	bch2_bio_map(bio, ca->sb_read_scratch);
+	bch2_bio_map(bio, ca->sb_read_scratch, PAGE_SIZE);
 
 	this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_SB],
 		     bio_sectors(bio));
@@ -677,13 +674,12 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
 	bio_reset(bio);
 	bio_set_dev(bio, ca->disk_sb.bdev);
 	bio->bi_iter.bi_sector	= le64_to_cpu(sb->offset);
-	bio->bi_iter.bi_size	=
-		roundup((size_t) vstruct_bytes(sb),
-			bdev_logical_block_size(ca->disk_sb.bdev));
 	bio->bi_end_io		= write_super_endio;
 	bio->bi_private		= ca;
 	bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META);
-	bch2_bio_map(bio, sb);
+	bch2_bio_map(bio, sb,
+		     roundup((size_t) vstruct_bytes(sb),
+			     bdev_logical_block_size(ca->disk_sb.bdev)));
 
 	this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_SB],
 		     bio_sectors(bio));
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 7e1b1bf43c31..4145832f4856 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -309,6 +309,7 @@ void bch2_fs_read_only(struct bch_fs *c)
 	 */
 	percpu_ref_kill(&c->writes);
 
+	cancel_work_sync(&c->ec_stripe_delete_work);
 	cancel_delayed_work(&c->pd_controllers_update);
 
 	/*
@@ -398,6 +399,8 @@ static int bch2_fs_read_write_late(struct bch_fs *c)
 
 	schedule_delayed_work(&c->pd_controllers_update, 5 * HZ);
 
+	schedule_work(&c->ec_stripe_delete_work);
+
 	return 0;
 }
 
@@ -491,6 +494,7 @@ static void bch2_fs_free(struct bch_fs *c)
 	bch2_fs_ec_exit(c);
 	bch2_fs_encryption_exit(c);
 	bch2_fs_io_exit(c);
+	bch2_fs_btree_iter_exit(c);
 	bch2_fs_btree_cache_exit(c);
 	bch2_fs_journal_exit(&c->journal);
 	bch2_io_clock_exit(&c->io_clock[WRITE]);
@@ -502,7 +506,6 @@ static void bch2_fs_free(struct bch_fs *c)
 	free_percpu(c->usage[0]);
 	kfree(c->usage_base);
 	free_percpu(c->pcpu);
-	mempool_exit(&c->btree_iters_pool);
 	mempool_exit(&c->btree_bounce_pool);
 	bioset_exit(&c->btree_bio);
 	mempool_exit(&c->btree_interior_update_pool);
@@ -755,15 +758,12 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 	    !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
 	    mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
 					btree_bytes(c)) ||
-	    mempool_init_kmalloc_pool(&c->btree_iters_pool, 1,
-			sizeof(struct btree_iter) * BTREE_ITER_MAX +
-			sizeof(struct btree_insert_entry) *
-			(BTREE_ITER_MAX + 4)) ||
 	    bch2_io_clock_init(&c->io_clock[READ]) ||
 	    bch2_io_clock_init(&c->io_clock[WRITE]) ||
 	    bch2_fs_journal_init(&c->journal) ||
 	    bch2_fs_replicas_init(c) ||
 	    bch2_fs_btree_cache_init(c) ||
+	    bch2_fs_btree_iter_init(c) ||
 	    bch2_fs_io_init(c) ||
 	    bch2_fs_encryption_init(c) ||
 	    bch2_fs_compress_init(c) ||
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index 2aa3097aeedb..2cc433ec0e3a 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -504,48 +504,32 @@ size_t bch2_pd_controller_print_debug(struct bch_pd_controller *pd, char *buf)
 
 /* misc: */
 
-void bch2_bio_map(struct bio *bio, void *base)
+void bch2_bio_map(struct bio *bio, void *base, size_t size)
 {
-	size_t size = bio->bi_iter.bi_size;
-	struct bio_vec *bv = bio->bi_io_vec;
-
-	BUG_ON(!bio->bi_iter.bi_size);
-	BUG_ON(bio->bi_vcnt);
-	BUG_ON(!bio->bi_max_vecs);
-
-	bv->bv_offset = base ? offset_in_page(base) : 0;
-	goto start;
-
-	for (; size; bio->bi_vcnt++, bv++) {
-		BUG_ON(bio->bi_vcnt >= bio->bi_max_vecs);
-
-		bv->bv_offset	= 0;
-start:		bv->bv_len	= min_t(size_t, PAGE_SIZE - bv->bv_offset,
-					size);
-		if (base) {
-			bv->bv_page = is_vmalloc_addr(base)
+	while (size) {
+		struct page *page = is_vmalloc_addr(base)
 			? vmalloc_to_page(base)
 			: virt_to_page(base);
+		unsigned offset = offset_in_page(base);
+		unsigned len = min_t(size_t, PAGE_SIZE - offset, size);
 
-			base += bv->bv_len;
-		}
-
-		size -= bv->bv_len;
+		BUG_ON(!bio_add_page(bio, page, len, offset));
+		size -= len;
+		base += len;
 	}
 }
 
-int bch2_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
+int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask)
 {
-	struct bio_vec *bv;
-	int i;
+	while (size) {
+		struct page *page = alloc_page(gfp_mask);
+		unsigned len = min(PAGE_SIZE, size);
 
-	bio_for_each_segment_all(bv, bio, i) {
-		bv->bv_page = alloc_page(gfp_mask);
-		if (!bv->bv_page) {
-			while (--bv >= bio->bi_io_vec)
-				__free_page(bv->bv_page);
+		if (!page)
 			return -ENOMEM;
-		}
+
+		BUG_ON(!bio_add_page(bio, page, len, 0));
+		size -= len;
 	}
 
 	return 0;
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index 310e958c6cdf..fa3a991453e9 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -511,8 +511,8 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
 	return x;
 }
 
-void bch2_bio_map(struct bio *bio, void *base);
-int bch2_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask);
+void bch2_bio_map(struct bio *bio, void *base, size_t);
+int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t);
 
 static inline sector_t bdev_sectors(struct block_device *bdev)
 {
@@ -628,35 +628,6 @@ static inline void memmove_u64s(void *dst, const void *src,
 	__memmove_u64s_up(dst, src, u64s);
 }
 
-static inline struct bio_vec next_contig_bvec(struct bio *bio,
-					      struct bvec_iter *iter)
-{
-	struct bio_vec bv = bio_iter_iovec(bio, *iter);
-
-	bio_advance_iter(bio, iter, bv.bv_len);
-#ifndef CONFIG_HIGHMEM
-	while (iter->bi_size) {
-		struct bio_vec next = bio_iter_iovec(bio, *iter);
-
-		if (page_address(bv.bv_page) + bv.bv_offset + bv.bv_len !=
-		    page_address(next.bv_page) + next.bv_offset)
-			break;
-
-		bv.bv_len += next.bv_len;
-		bio_advance_iter(bio, iter, next.bv_len);
-	}
-#endif
-	return bv;
-}
-
-#define __bio_for_each_contig_segment(bv, bio, iter, start)		\
-	for (iter = (start);						\
-	     (iter).bi_size &&						\
-	     ((bv = next_contig_bvec((bio), &(iter))), 1);)
-
-#define bio_for_each_contig_segment(bv, bio, iter)			\
-	__bio_for_each_contig_segment(bv, bio, iter, (bio)->bi_iter)
-
 void sort_cmp_size(void *base, size_t num, size_t size,
 		   int (*cmp_func)(const void *, const void *, size_t),
 		   void (*swap_func)(void *, void *, size_t));
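The rewritten bch2_bio_map() stops hand-rolling bvec arrays (the old goto-into-a-loop version above) and instead walks the buffer page by page, letting the block layer's bio_add_page() do the bookkeeping; the caller now passes the size explicitly instead of pre-seeding bi_iter.bi_size. The chunking logic, modelled standalone (PAGE_SIZE fixed at 4096, printf standing in for bio_add_page; not the kernel API):

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

#define PAGE_SIZE 4096UL

static size_t offset_in_page(uintptr_t addr)
{
	return addr & (PAGE_SIZE - 1);
}

static void bio_map(uintptr_t base, size_t size)
{
	while (size) {
		size_t offset = offset_in_page(base);
		size_t len = PAGE_SIZE - offset < size
			? PAGE_SIZE - offset : size;

		/* kernel version: BUG_ON(!bio_add_page(bio, page, len, offset)) */
		printf("page %#lx offset %zu len %zu\n",
		       (unsigned long)(base - offset), offset, len);
		size -= len;
		base += len;
	}
}

int main(void)
{
	bio_map(0x1000f00, 9000);	/* unaligned start, spans four pages */
	return 0;
}

One consequence of the design: because bio_add_page() tracks bi_vcnt and bi_iter.bi_size itself, the old next_contig_bvec()/bio_for_each_contig_segment() helpers deleted at the end of util.h are no longer needed.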