author    Kent Overstreet <kent.overstreet@gmail.com>  2019-09-20 17:43:08 -0400
committer Kent Overstreet <kent.overstreet@gmail.com>  2020-05-06 17:14:17 -0400
commit    1654816f36166a77df46af48a974f01942be4a38 (patch)
tree      bf97c29944582c240b596751b32aec4c098bc068
parent    8b33578265e6fbb0712f688677bef2620861e019 (diff)
Merge with dd444a83ea bcachefs: Drop unused arg to bch2_open_buckets_stop_dev()
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
-rw-r--r--  fs/bcachefs/Makefile                |    1
-rw-r--r--  fs/bcachefs/alloc_background.c      |  212
-rw-r--r--  fs/bcachefs/alloc_background.h      |   25
-rw-r--r--  fs/bcachefs/alloc_foreground.c      |    5
-rw-r--r--  fs/bcachefs/alloc_foreground.h      |    2
-rw-r--r--  fs/bcachefs/bcachefs.h              |   12
-rw-r--r--  fs/bcachefs/bcachefs_format.h       |   28
-rw-r--r--  fs/bcachefs/bkey.h                  |    4
-rw-r--r--  fs/bcachefs/bkey_methods.c          |   10
-rw-r--r--  fs/bcachefs/bkey_methods.h          |    5
-rw-r--r--  fs/bcachefs/bkey_sort.c             |   19
-rw-r--r--  fs/bcachefs/bset.c                  |   78
-rw-r--r--  fs/bcachefs/bset.h                  |    8
-rw-r--r--  fs/bcachefs/btree_gc.c              |   31
-rw-r--r--  fs/bcachefs/btree_io.c              |    8
-rw-r--r--  fs/bcachefs/btree_iter.c            |  532
-rw-r--r--  fs/bcachefs/btree_iter.h            |   18
-rw-r--r--  fs/bcachefs/btree_types.h           |   13
-rw-r--r--  fs/bcachefs/btree_update.h          |   29
-rw-r--r--  fs/bcachefs/btree_update_interior.c |   21
-rw-r--r--  fs/bcachefs/btree_update_leaf.c     |  198
-rw-r--r--  fs/bcachefs/buckets.c               |  448
-rw-r--r--  fs/bcachefs/buckets.h               |   25
-rw-r--r--  fs/bcachefs/buckets_types.h         |    1
-rw-r--r--  fs/bcachefs/checksum.c              |   33
-rw-r--r--  fs/bcachefs/checksum.h              |   11
-rw-r--r--  fs/bcachefs/compress.c              |    8
-rw-r--r--  fs/bcachefs/debug.c                 |    3
-rw-r--r--  fs/bcachefs/ec.c                    |  134
-rw-r--r--  fs/bcachefs/error.c                 |    9
-rw-r--r--  fs/bcachefs/extents.c               |  415
-rw-r--r--  fs/bcachefs/extents.h               |  151
-rw-r--r--  fs/bcachefs/fs-io.c                 | 1348
-rw-r--r--  fs/bcachefs/fs-io.h                 |   19
-rw-r--r--  fs/bcachefs/fs.c                    |  118
-rw-r--r--  fs/bcachefs/fsck.c                  |    2
-rw-r--r--  fs/bcachefs/io.c                    |  404
-rw-r--r--  fs/bcachefs/io.h                    |   25
-rw-r--r--  fs/bcachefs/io_types.h              |    2
-rw-r--r--  fs/bcachefs/journal_io.c            |    6
-rw-r--r--  fs/bcachefs/migrate.c               |   16
-rw-r--r--  fs/bcachefs/move.c                  |  135
-rw-r--r--  fs/bcachefs/move.h                  |    3
-rw-r--r--  fs/bcachefs/movinggc.c              |   27
-rw-r--r--  fs/bcachefs/opts.h                  |    5
-rw-r--r--  fs/bcachefs/rebalance.c             |   51
-rw-r--r--  fs/bcachefs/recovery.c              |   85
-rw-r--r--  fs/bcachefs/recovery.h              |   11
-rw-r--r--  fs/bcachefs/reflink.c               |  300
-rw-r--r--  fs/bcachefs/reflink.h               |   32
-rw-r--r--  fs/bcachefs/replicas.c              |   25
-rw-r--r--  fs/bcachefs/str_hash.h              |    4
-rw-r--r--  fs/bcachefs/super-io.c              |   16
-rw-r--r--  fs/bcachefs/super.c                 |   10
-rw-r--r--  fs/bcachefs/util.c                  |   48
-rw-r--r--  fs/bcachefs/util.h                  |   33
56 files changed, 3306 insertions(+), 1916 deletions(-)
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index da42c4fd764d..414ea2a74a5a 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -44,6 +44,7 @@ bcachefs-y := \
quota.o \
rebalance.o \
recovery.o \
+ reflink.o \
replicas.o \
siphash.o \
super.o \
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 43dc2f270dc6..9814179a6406 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -205,20 +205,6 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
get_alloc_field(a.v, &d, i));
}
-static inline struct bkey_alloc_unpacked
-alloc_mem_to_key(struct bucket *g, struct bucket_mark m)
-{
- return (struct bkey_alloc_unpacked) {
- .gen = m.gen,
- .oldest_gen = g->oldest_gen,
- .data_type = m.data_type,
- .dirty_sectors = m.dirty_sectors,
- .cached_sectors = m.cached_sectors,
- .read_time = g->io_time[READ],
- .write_time = g->io_time[WRITE],
- };
-}
-
int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
{
struct btree_trans trans;
@@ -232,7 +218,7 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
bch2_trans_init(&trans, c, 0, 0);
for_each_btree_key(&trans, iter, BTREE_ID_ALLOC, POS_MIN, 0, k, ret)
- bch2_mark_key(c, k, 0, NULL, 0,
+ bch2_mark_key(c, k, 0, 0, NULL, 0,
BCH_BUCKET_MARK_ALLOC_READ|
BCH_BUCKET_MARK_NOATOMIC);
@@ -244,7 +230,8 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
for_each_journal_key(*journal_keys, j)
if (j->btree_id == BTREE_ID_ALLOC)
- bch2_mark_key(c, bkey_i_to_s_c(j->k), 0, NULL, 0,
+ bch2_mark_key(c, bkey_i_to_s_c(j->k),
+ 0, 0, NULL, 0,
BCH_BUCKET_MARK_ALLOC_READ|
BCH_BUCKET_MARK_NOATOMIC);
@@ -271,46 +258,68 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
return 0;
}
-int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k)
+enum alloc_write_ret {
+ ALLOC_WROTE,
+ ALLOC_NOWROTE,
+ ALLOC_END,
+};
+
+static int bch2_alloc_write_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ unsigned flags)
{
- struct btree_trans trans;
- struct btree_iter *iter;
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c k;
struct bch_dev *ca;
+ struct bucket_array *ba;
+ struct bucket *g;
+ struct bucket_mark m;
+ struct bkey_alloc_unpacked old_u, new_u;
+ __BKEY_PADDED(k, 8) alloc_key; /* hack: */
+ struct bkey_i_alloc *a;
int ret;
+retry:
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
- if (k->k.p.inode >= c->sb.nr_devices ||
- !c->devs[k->k.p.inode])
- return 0;
-
- ca = bch_dev_bkey_exists(c, k->k.p.inode);
+ old_u = bch2_alloc_unpack(k);
- if (k->k.p.offset >= ca->mi.nbuckets)
- return 0;
+ if (iter->pos.inode >= c->sb.nr_devices ||
+ !c->devs[iter->pos.inode])
+ return ALLOC_END;
- bch2_trans_init(&trans, c, 0, 0);
+ percpu_down_read(&c->mark_lock);
+ ca = bch_dev_bkey_exists(c, iter->pos.inode);
+ ba = bucket_array(ca);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, k->k.p,
- BTREE_ITER_INTENT);
+ if (iter->pos.offset >= ba->nbuckets) {
+ percpu_up_read(&c->mark_lock);
+ return ALLOC_END;
+ }
- ret = bch2_btree_iter_traverse(iter);
- if (ret)
- goto err;
+ g = &ba->b[iter->pos.offset];
+ m = READ_ONCE(g->mark);
+ new_u = alloc_mem_to_key(g, m);
+ percpu_up_read(&c->mark_lock);
- /* check buckets_written with btree node locked: */
- if (test_bit(k->k.p.offset, ca->buckets_written)) {
- ret = 0;
- goto err;
- }
+ if (!bkey_alloc_unpacked_cmp(old_u, new_u))
+ return ALLOC_NOWROTE;
- bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, k));
+ a = bkey_alloc_init(&alloc_key.k);
+ a->k.p = iter->pos;
+ bch2_alloc_pack(a, new_u);
- ret = bch2_trans_commit(&trans, NULL, NULL,
+ bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &a->k_i));
+ ret = bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_JOURNAL_REPLAY|
- BTREE_INSERT_NOMARK);
+ BTREE_INSERT_NOMARK|
+ flags);
err:
- bch2_trans_exit(&trans);
+ if (ret == -EINTR)
+ goto retry;
return ret;
}
@@ -318,16 +327,8 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote)
{
struct btree_trans trans;
struct btree_iter *iter;
- struct bucket_array *buckets;
struct bch_dev *ca;
- struct bucket *g;
- struct bucket_mark m, new;
- struct bkey_alloc_unpacked old_u, new_u;
- __BKEY_PADDED(k, 8) alloc_key; /* hack: */
- struct bkey_i_alloc *a;
- struct bkey_s_c k;
unsigned i;
- size_t b;
int ret = 0;
BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
@@ -338,81 +339,24 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote)
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
for_each_rw_member(ca, c, i) {
- down_read(&ca->bucket_lock);
-restart:
- buckets = bucket_array(ca);
+ unsigned first_bucket;
- for (b = buckets->first_bucket;
- b < buckets->nbuckets;
- b++) {
- if (!buckets->b[b].mark.dirty)
- continue;
-
- bch2_btree_iter_set_pos(iter, POS(i, b));
- k = bch2_btree_iter_peek_slot(iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- old_u = bch2_alloc_unpack(k);
-
- percpu_down_read(&c->mark_lock);
- g = bucket(ca, b);
- m = READ_ONCE(g->mark);
- new_u = alloc_mem_to_key(g, m);
- percpu_up_read(&c->mark_lock);
-
- if (!m.dirty)
- continue;
-
- if ((flags & BTREE_INSERT_LAZY_RW) &&
- percpu_ref_is_zero(&c->writes)) {
- up_read(&ca->bucket_lock);
- bch2_trans_unlock(&trans);
-
- ret = bch2_fs_read_write_early(c);
- down_read(&ca->bucket_lock);
-
- if (ret)
- goto err;
- goto restart;
- }
+ percpu_down_read(&c->mark_lock);
+ first_bucket = bucket_array(ca)->first_bucket;
+ percpu_up_read(&c->mark_lock);
- a = bkey_alloc_init(&alloc_key.k);
- a->k.p = iter->pos;
- bch2_alloc_pack(a, new_u);
+ bch2_btree_iter_set_pos(iter, POS(i, first_bucket));
- bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &a->k_i));
- ret = bch2_trans_commit(&trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_NOMARK|
- flags);
-err:
- if (ret && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) {
- bch_err(c, "error %i writing alloc info", ret);
- printk(KERN_CONT "dev %llu bucket %llu\n",
- iter->pos.inode, iter->pos.offset);
- printk(KERN_CONT "gen %u -> %u\n", old_u.gen, new_u.gen);
-#define x(_name, _bits) printk(KERN_CONT #_name " %u -> %u\n", old_u._name, new_u._name);
- BCH_ALLOC_FIELDS()
-#undef x
- }
- if (ret)
+ while (1) {
+ ret = bch2_alloc_write_key(&trans, iter, flags);
+ if (ret < 0 || ret == ALLOC_END)
break;
-
- new = m;
- new.dirty = false;
- atomic64_cmpxchg(&g->_mark.v, m.v.counter, new.v.counter);
-
- if (ca->buckets_written)
- set_bit(b, ca->buckets_written);
-
- bch2_trans_cond_resched(&trans);
- *wrote = true;
+ if (ret == ALLOC_WROTE)
+ *wrote = true;
+ bch2_btree_iter_next_slot(iter);
}
- up_read(&ca->bucket_lock);
- if (ret) {
+ if (ret < 0) {
percpu_ref_put(&ca->io_ref);
break;
}
@@ -420,7 +364,27 @@ err:
bch2_trans_exit(&trans);
- return ret;
+ return ret < 0 ? ret : 0;
+}
+
+int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k)
+{
+ struct btree_trans trans;
+ struct btree_iter *iter;
+ int ret;
+
+ bch2_trans_init(&trans, c, 0, 0);
+
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, k->k.p,
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+
+ ret = bch2_alloc_write_key(&trans, iter,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_JOURNAL_REPLAY|
+ BTREE_INSERT_NOMARK);
+ bch2_trans_exit(&trans);
+ return ret < 0 ? ret : 0;
}
/* Bucket IO clocks: */
@@ -967,10 +931,6 @@ retry:
if (!top->nr)
heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
- /* with btree still locked: */
- if (ca->buckets_written)
- set_bit(b, ca->buckets_written);
-
/*
* Make sure we flush the last journal entry that updated this
* bucket (i.e. deleting the last reference) before writing to
@@ -1204,7 +1164,7 @@ static int bch2_allocator_thread(void *arg)
*/
if (!nr ||
(nr < ALLOC_SCAN_BATCH(ca) &&
- !fifo_full(&ca->free[RESERVE_MOVINGGC]))) {
+ !fifo_empty(&ca->free[RESERVE_NONE]))) {
ret = wait_buckets_available(c, ca);
if (ret) {
up_read(&c->gc_lock);
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index 0c1a0f0dd2ab..501c444353fb 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -13,10 +13,35 @@ struct bkey_alloc_unpacked {
#undef x
};
+/* returns true if not equal */
+static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l,
+ struct bkey_alloc_unpacked r)
+{
+ return l.gen != r.gen
+#define x(_name, _bits) || l._name != r._name
+ BCH_ALLOC_FIELDS()
+#undef x
+ ;
+}
+
struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c);
void bch2_alloc_pack(struct bkey_i_alloc *,
const struct bkey_alloc_unpacked);
+static inline struct bkey_alloc_unpacked
+alloc_mem_to_key(struct bucket *g, struct bucket_mark m)
+{
+ return (struct bkey_alloc_unpacked) {
+ .gen = m.gen,
+ .oldest_gen = g->oldest_gen,
+ .data_type = m.data_type,
+ .dirty_sectors = m.dirty_sectors,
+ .cached_sectors = m.cached_sectors,
+ .read_time = g->io_time[READ],
+ .write_time = g->io_time[WRITE],
+ };
+}
+
#define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9)
const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c);
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index e64f8449462f..697d576802b6 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -693,8 +693,7 @@ retry_blocking:
}
void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca,
- struct open_buckets *obs,
- enum bch_data_type data_type)
+ struct open_buckets *obs)
{
struct open_buckets ptrs = { .nr = 0 };
struct open_bucket *ob, *ob2;
@@ -725,7 +724,7 @@ void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
struct write_point *wp)
{
mutex_lock(&wp->lock);
- bch2_open_buckets_stop_dev(c, ca, &wp->ptrs, wp->type);
+ bch2_open_buckets_stop_dev(c, ca, &wp->ptrs);
mutex_unlock(&wp->lock);
}
diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h
index 6d8ffb0cd06d..687f973e4b3a 100644
--- a/fs/bcachefs/alloc_foreground.h
+++ b/fs/bcachefs/alloc_foreground.h
@@ -106,7 +106,7 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *);
void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *,
- struct open_buckets *, enum bch_data_type);
+ struct open_buckets *);
void bch2_writepoint_stop(struct bch_fs *, struct bch_dev *,
struct write_point *);
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 98c2fe734626..a186aa521049 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -283,9 +283,7 @@ do { \
"Force reads to use the reconstruct path, when reading" \
"from erasure coded extents") \
BCH_DEBUG_PARAM(test_restart_gc, \
- "Test restarting mark and sweep gc when bucket gens change")\
- BCH_DEBUG_PARAM(test_reconstruct_alloc, \
- "Test reconstructing the alloc btree")
+ "Test restarting mark and sweep gc when bucket gens change")
#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG()
@@ -359,6 +357,7 @@ enum gc_phase {
GC_PHASE_BTREE_XATTRS,
GC_PHASE_BTREE_ALLOC,
GC_PHASE_BTREE_QUOTAS,
+ GC_PHASE_BTREE_REFLINK,
GC_PHASE_PENDING_DELETE,
GC_PHASE_ALLOC,
@@ -409,7 +408,6 @@ struct bch_dev {
*/
struct bucket_array __rcu *buckets[2];
unsigned long *buckets_nouse;
- unsigned long *buckets_written;
struct rw_semaphore bucket_lock;
struct bch_dev_usage __percpu *usage[2];
@@ -722,7 +720,7 @@ struct bch_fs {
ZSTD_parameters zstd_params;
struct crypto_shash *sha256;
- struct crypto_skcipher *chacha20;
+ struct crypto_sync_skcipher *chacha20;
struct crypto_shash *poly1305;
atomic64_t key_version;
@@ -740,12 +738,16 @@ struct bch_fs {
/* ERASURE CODING */
struct list_head ec_new_stripe_list;
struct mutex ec_new_stripe_lock;
+ u64 ec_stripe_hint;
struct bio_set ec_bioset;
struct work_struct ec_stripe_delete_work;
struct llist_head ec_stripe_delete_list;
+ /* REFLINK */
+ u64 reflink_hint;
+
/* VFS IO PATH - fs-io.c */
struct bio_set writepage_bioset;
struct bio_set dio_write_bioset;
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 13285936dd2d..4577d77a9f38 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -336,7 +336,9 @@ static inline void bkey_init(struct bkey *k)
x(xattr, 11) \
x(alloc, 12) \
x(quota, 13) \
- x(stripe, 14)
+ x(stripe, 14) \
+ x(reflink_p, 15) \
+ x(reflink_v, 16)
enum bch_bkey_type {
#define x(name, nr) KEY_TYPE_##name = nr,
@@ -655,7 +657,7 @@ struct bch_reservation {
/* Maximum possible size of an entire extent value: */
#define BKEY_EXTENT_VAL_U64s_MAX \
- (BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
+ (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX)
@@ -891,6 +893,24 @@ struct bch_stripe {
struct bch_extent_ptr ptrs[0];
} __attribute__((packed, aligned(8)));
+/* Reflink: */
+
+struct bch_reflink_p {
+ struct bch_val v;
+ __le64 idx;
+
+ __le32 reservation_generation;
+ __u8 nr_replicas;
+ __u8 pad[3];
+};
+
+struct bch_reflink_v {
+ struct bch_val v;
+ __le64 refcount;
+ union bch_extent_entry start[0];
+ __u64 _data[0];
+};
+
/* Optional/variable size superblock sections: */
struct bch_sb_field {
@@ -1293,6 +1313,7 @@ enum bch_sb_features {
BCH_FEATURE_ATOMIC_NLINK = 3, /* should have gone under compat */
BCH_FEATURE_EC = 4,
BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3 = 5,
+ BCH_FEATURE_REFLINK = 6,
BCH_FEATURE_NR,
};
@@ -1480,7 +1501,8 @@ LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5);
x(XATTRS, 3, "xattrs") \
x(ALLOC, 4, "alloc") \
x(QUOTAS, 5, "quotas") \
- x(EC, 6, "erasure_coding")
+ x(EC, 6, "erasure_coding") \
+ x(REFLINK, 7, "reflink")
enum btree_id {
#define x(kwd, val, name) BTREE_ID_##kwd = val,
diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h
index 1acff9d0fd7e..5ef66aed338d 100644
--- a/fs/bcachefs/bkey.h
+++ b/fs/bcachefs/bkey.h
@@ -50,7 +50,7 @@ static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
k->u64s = BKEY_U64s + DIV_ROUND_UP(bytes, sizeof(u64));
}
-#define bkey_val_end(_k) vstruct_idx((_k).v, bkey_val_u64s((_k).k))
+#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k)))
#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted)
@@ -552,6 +552,8 @@ BKEY_VAL_ACCESSORS(xattr);
BKEY_VAL_ACCESSORS(alloc);
BKEY_VAL_ACCESSORS(quota);
BKEY_VAL_ACCESSORS(stripe);
+BKEY_VAL_ACCESSORS(reflink_p);
+BKEY_VAL_ACCESSORS(reflink_v);
/* byte order helpers */
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
index 27f196ef0b18..f01405dd502b 100644
--- a/fs/bcachefs/bkey_methods.c
+++ b/fs/bcachefs/bkey_methods.c
@@ -10,9 +10,10 @@
#include "extents.h"
#include "inode.h"
#include "quota.h"
+#include "reflink.h"
#include "xattr.h"
-const char * const bch_bkey_types[] = {
+const char * const bch2_bkey_types[] = {
#define x(name, nr) #name,
BCH_BKEY_TYPES()
#undef x
@@ -144,7 +145,7 @@ void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
}
if (ops->key_debugcheck)
- ops->key_debugcheck(c, b, k);
+ ops->key_debugcheck(c, k);
}
void bch2_bpos_to_text(struct printbuf *out, struct bpos pos)
@@ -159,7 +160,8 @@ void bch2_bpos_to_text(struct printbuf *out, struct bpos pos)
void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k)
{
- pr_buf(out, "u64s %u type %u ", k->u64s, k->type);
+ pr_buf(out, "u64s %u type %s ", k->u64s,
+ bch2_bkey_types[k->type]);
bch2_bpos_to_text(out, k->p);
@@ -174,8 +176,6 @@ void bch2_val_to_text(struct printbuf *out, struct bch_fs *c,
if (likely(ops->val_to_text))
ops->val_to_text(out, c, k);
- else
- pr_buf(out, " %s", bch_bkey_types[k.k->type]);
}
void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c,
diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h
index 08b976633360..8568b65c1ed2 100644
--- a/fs/bcachefs/bkey_methods.h
+++ b/fs/bcachefs/bkey_methods.h
@@ -9,7 +9,7 @@ struct btree;
struct bkey;
enum btree_node_type;
-extern const char * const bch_bkey_types[];
+extern const char * const bch2_bkey_types[];
enum merge_result {
BCH_MERGE_NOMERGE,
@@ -26,8 +26,7 @@ struct bkey_ops {
/* Returns reason for being invalid if invalid, else NULL: */
const char * (*key_invalid)(const struct bch_fs *,
struct bkey_s_c);
- void (*key_debugcheck)(struct bch_fs *, struct btree *,
- struct bkey_s_c);
+ void (*key_debugcheck)(struct bch_fs *, struct bkey_s_c);
void (*val_to_text)(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
void (*swab)(const struct bkey_format *, struct bkey_packed *);
diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c
index 9f5d9b4bf1c9..e32fad5a91ac 100644
--- a/fs/bcachefs/bkey_sort.c
+++ b/fs/bcachefs/bkey_sort.c
@@ -415,25 +415,22 @@ bch2_sort_repack_merge(struct bch_fs *c,
struct bkey_format *out_f,
bool filter_whiteouts)
{
- struct bkey_packed *prev = NULL, *k_packed, *next;
- struct bkey k_unpacked;
+ struct bkey_packed *prev = NULL, *k_packed;
struct bkey_s k;
struct btree_nr_keys nr;
+ BKEY_PADDED(k) tmp;
memset(&nr, 0, sizeof(nr));
- next = bch2_btree_node_iter_next_all(iter, src);
- while ((k_packed = next)) {
- /*
- * The filter might modify the size of @k's value, so advance
- * the iterator first:
- */
- next = bch2_btree_node_iter_next_all(iter, src);
-
+ while ((k_packed = bch2_btree_node_iter_next_all(iter, src))) {
if (filter_whiteouts && bkey_whiteout(k_packed))
continue;
- k = __bkey_disassemble(src, k_packed, &k_unpacked);
+ EBUG_ON(bkeyp_val_u64s(&src->format, k_packed) >
+ BKEY_EXTENT_VAL_U64s_MAX);
+
+ bch2_bkey_unpack(src, &tmp.k, k_packed);
+ k = bkey_i_to_s(&tmp.k);
if (filter_whiteouts &&
bch2_bkey_normalize(c, k))
diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c
index ef10e77ec1e5..32436ed5cc80 100644
--- a/fs/bcachefs/bset.c
+++ b/fs/bcachefs/bset.c
@@ -24,6 +24,16 @@
static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *,
struct btree *);
+static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter)
+{
+ unsigned n = ARRAY_SIZE(iter->data);
+
+ while (n && __btree_node_iter_set_end(iter, n - 1))
+ --n;
+
+ return n;
+}
+
struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k)
{
unsigned offset = __btree_node_key_to_offset(b, k);
@@ -110,7 +120,8 @@ void bch2_dump_btree_node_iter(struct btree *b,
{
struct btree_node_iter_set *set;
- printk(KERN_ERR "btree node iter with %u sets:\n", b->nsets);
+ printk(KERN_ERR "btree node iter with %u/%u sets:\n",
+ __btree_node_iter_used(iter), b->nsets);
btree_node_iter_for_each(iter, set) {
struct bkey_packed *k = __btree_node_offset_to_key(b, set->k);
@@ -119,8 +130,8 @@ void bch2_dump_btree_node_iter(struct btree *b,
char buf[100];
bch2_bkey_to_text(&PBUF(buf), &uk);
- printk(KERN_ERR "set %zu key %zi/%u: %s\n", t - b->set,
- k->_data - bset(b, t)->_data, bset(b, t)->u64s, buf);
+ printk(KERN_ERR "set %zu key %u: %s\n",
+ t - b->set, set->k, buf);
}
}
@@ -182,8 +193,12 @@ void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
struct btree *b)
{
struct btree_node_iter_set *set, *s2;
+ struct bkey_packed *k, *p;
struct bset_tree *t;
+ if (bch2_btree_node_iter_end(iter))
+ return;
+
/* Verify no duplicates: */
btree_node_iter_for_each(iter, set)
btree_node_iter_for_each(iter, s2)
@@ -204,6 +219,18 @@ found:
btree_node_iter_for_each(iter, set)
BUG_ON(set != iter->data &&
btree_node_iter_cmp(b, set[-1], set[0]) > 0);
+
+ k = bch2_btree_node_iter_peek_all(iter, b);
+
+ for_each_bset(b, t) {
+ if (iter->data[0].end == t->end_offset)
+ continue;
+
+ p = bch2_bkey_prev_all(b, t,
+ bch2_btree_node_iter_bset_pos(iter, b, t));
+
+ BUG_ON(p && bkey_iter_cmp(b, k, p) < 0);
+ }
}
void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
@@ -1669,25 +1696,13 @@ void bch2_btree_node_iter_advance(struct btree_node_iter *iter,
__bch2_btree_node_iter_advance(iter, b);
}
-static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter)
-{
- unsigned n = ARRAY_SIZE(iter->data);
-
- while (n && __btree_node_iter_set_end(iter, n - 1))
- --n;
-
- return n;
-}
-
/*
* Expensive:
*/
-struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *iter,
- struct btree *b,
- unsigned min_key_type)
+struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter,
+ struct btree *b)
{
struct bkey_packed *k, *prev = NULL;
- struct bkey_packed *orig_pos = bch2_btree_node_iter_peek_all(iter, b);
struct btree_node_iter_set *set;
struct bset_tree *t;
unsigned end = 0;
@@ -1695,9 +1710,8 @@ struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *ite
bch2_btree_node_iter_verify(iter, b);
for_each_bset(b, t) {
- k = bch2_bkey_prev_filter(b, t,
- bch2_btree_node_iter_bset_pos(iter, b, t),
- min_key_type);
+ k = bch2_bkey_prev_all(b, t,
+ bch2_btree_node_iter_bset_pos(iter, b, t));
if (k &&
(!prev || bkey_iter_cmp(b, k, prev) > 0)) {
prev = k;
@@ -1706,7 +1720,7 @@ struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *ite
}
if (!prev)
- goto out;
+ return NULL;
/*
* We're manually memmoving instead of just calling sort() to ensure the
@@ -1727,18 +1741,20 @@ found:
iter->data[0].k = __btree_node_key_to_offset(b, prev);
iter->data[0].end = end;
-out:
- if (btree_keys_expensive_checks(b)) {
- struct btree_node_iter iter2 = *iter;
- if (prev)
- __bch2_btree_node_iter_advance(&iter2, b);
+ bch2_btree_node_iter_verify(iter, b);
+ return prev;
+}
- while ((k = bch2_btree_node_iter_peek_all(&iter2, b)) != orig_pos) {
- BUG_ON(k->type >= min_key_type);
- __bch2_btree_node_iter_advance(&iter2, b);
- }
- }
+struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *iter,
+ struct btree *b,
+ unsigned min_key_type)
+{
+ struct bkey_packed *prev;
+
+ do {
+ prev = bch2_btree_node_iter_prev_all(iter, b);
+ } while (prev && prev->type < min_key_type);
return prev;
}
diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h
index 17c239947300..643bd9e8bc4d 100644
--- a/fs/bcachefs/bset.h
+++ b/fs/bcachefs/bset.h
@@ -528,16 +528,12 @@ bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b)
return ret;
}
+struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *,
+ struct btree *);
struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *,
struct btree *, unsigned);
static inline struct bkey_packed *
-bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, struct btree *b)
-{
- return bch2_btree_node_iter_prev_filter(iter, b, 0);
-}
-
-static inline struct bkey_packed *
bch2_btree_node_iter_prev(struct btree_node_iter *iter, struct btree *b)
{
return bch2_btree_node_iter_prev_filter(iter, b, KEY_TYPE_discard + 1);
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index a458cfe0e92d..f4adb07a3de2 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -142,20 +142,24 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
struct bucket *g2 = PTR_BUCKET(ca, ptr, false);
if (mustfix_fsck_err_on(!g->gen_valid, c,
- "found ptr with missing gen in alloc btree,\n"
- "type %u gen %u",
- k.k->type, ptr->gen)) {
+ "bucket %u:%zu data type %s ptr gen %u missing in alloc btree",
+ ptr->dev, PTR_BUCKET_NR(ca, ptr),
+ bch2_data_types[ptr_data_type(k.k, ptr)],
+ ptr->gen)) {
g2->_mark.gen = g->_mark.gen = ptr->gen;
- g2->_mark.dirty = g->_mark.dirty = true;
g2->gen_valid = g->gen_valid = true;
}
if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c,
- "%u ptr gen in the future: %u > %u",
- k.k->type, ptr->gen, g->mark.gen)) {
+ "bucket %u:%zu data type %s ptr gen in the future: %u > %u",
+ ptr->dev, PTR_BUCKET_NR(ca, ptr),
+ bch2_data_types[ptr_data_type(k.k, ptr)],
+ ptr->gen, g->mark.gen)) {
g2->_mark.gen = g->_mark.gen = ptr->gen;
- g2->_mark.dirty = g->_mark.dirty = true;
g2->gen_valid = g->gen_valid = true;
+ g2->_mark.data_type = 0;
+ g2->_mark.dirty_sectors = 0;
+ g2->_mark.cached_sectors = 0;
set_bit(BCH_FS_FIXED_GENS, &c->flags);
}
}
@@ -171,7 +175,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
*max_stale = max(*max_stale, ptr_stale(ca, ptr));
}
- bch2_mark_key(c, k, k.k->size, NULL, 0, flags);
+ bch2_mark_key(c, k, 0, k.k->size, NULL, 0, flags);
fsck_err:
return ret;
}
@@ -418,7 +422,8 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
for_each_pending_btree_node_free(c, as, d)
if (d->index_update_done)
- bch2_mark_key(c, bkey_i_to_s_c(&d->key), 0, NULL, 0,
+ bch2_mark_key(c, bkey_i_to_s_c(&d->key),
+ 0, 0, NULL, 0,
BCH_BUCKET_MARK_GC);
mutex_unlock(&c->btree_interior_update_lock);
@@ -525,7 +530,6 @@ static int bch2_gc_done(struct bch_fs *c,
": got %u, should be %u", i, b, \
dst->b[b].mark._f, src->b[b].mark._f); \
dst->b[b]._mark._f = src->b[b].mark._f; \
- dst->b[b]._mark.dirty = true; \
}
#define copy_dev_field(_f, _msg, ...) \
copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__)
@@ -577,10 +581,7 @@ static int bch2_gc_done(struct bch_fs *c,
copy_bucket_field(dirty_sectors);
copy_bucket_field(cached_sectors);
- if (dst->b[b].oldest_gen != src->b[b].oldest_gen) {
- dst->b[b].oldest_gen = src->b[b].oldest_gen;
- dst->b[b]._mark.dirty = true;
- }
+ dst->b[b].oldest_gen = src->b[b].oldest_gen;
}
};
@@ -761,6 +762,8 @@ out:
percpu_down_write(&c->mark_lock);
bch2_gc_free(c);
percpu_up_write(&c->mark_lock);
+ /* flush fsck errors, reset counters */
+ bch2_flush_fsck_errs(c);
goto again;
}
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 5652f354b910..b6e286c36b86 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -510,7 +510,7 @@ static void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset)
bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags,
bytes);
- nonce = nonce_add(nonce, round_up(bytes, CHACHA20_BLOCK_SIZE));
+ nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE));
}
bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
@@ -1038,10 +1038,9 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
INIT_WORK(&rb->work, btree_node_read_work);
bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META;
bio->bi_iter.bi_sector = pick.ptr.offset;
- bio->bi_iter.bi_size = btree_bytes(c);
bio->bi_end_io = btree_node_read_endio;
bio->bi_private = b;
- bch2_bio_map(bio, b->data);
+ bch2_bio_map(bio, b->data, btree_bytes(c));
set_btree_node_read_in_flight(b);
@@ -1502,11 +1501,10 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
wbio->wbio.order = order;
wbio->wbio.used_mempool = used_mempool;
wbio->wbio.bio.bi_opf = REQ_OP_WRITE|REQ_META|REQ_FUA;
- wbio->wbio.bio.bi_iter.bi_size = sectors_to_write << 9;
wbio->wbio.bio.bi_end_io = btree_node_write_endio;
wbio->wbio.bio.bi_private = b;
- bch2_bio_map(&wbio->wbio.bio, data);
+ bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9);
/*
* If we're appending to a leaf node, we don't technically need FUA -
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 8955555d6603..40cd87d73a4f 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -86,7 +86,7 @@ void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
struct btree_iter *linked;
unsigned readers = 0;
- EBUG_ON(btree_node_read_locked(iter, b->level));
+ EBUG_ON(!btree_node_intent_locked(iter, b->level));
trans_for_each_iter(iter->trans, linked)
if (linked->l[b->level].b == b &&
@@ -496,6 +496,23 @@ static inline void __bch2_btree_iter_verify(struct btree_iter *iter,
#endif
+static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
+ struct btree *b,
+ struct bset_tree *t,
+ struct bkey_packed *k)
+{
+ struct btree_node_iter_set *set;
+
+ btree_node_iter_for_each(iter, set)
+ if (set->end == t->end_offset) {
+ set->k = __btree_node_key_to_offset(b, k);
+ bch2_btree_node_iter_sort(iter, b);
+ return;
+ }
+
+ bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t));
+}
+
static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
struct btree *b,
struct btree_node_iter *node_iter,
@@ -509,6 +526,10 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
unsigned offset = __btree_node_key_to_offset(b, where);
int shift = new_u64s - clobber_u64s;
unsigned old_end = t->end_offset - shift;
+ unsigned orig_iter_pos = node_iter->data[0].k;
+ bool iter_current_key_modified =
+ orig_iter_pos >= offset &&
+ orig_iter_pos <= offset + clobber_u64s;
btree_node_iter_for_each(node_iter, set)
if (set->end == old_end)
@@ -517,17 +538,12 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
/* didn't find the bset in the iterator - might have to readd it: */
if (new_u64s &&
btree_iter_pos_cmp(iter, b, where) > 0) {
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
-
bch2_btree_node_iter_push(node_iter, b, where, end);
-
- if (!b->level &&
- node_iter == &iter->l[0].iter)
- bkey_disassemble(b,
- bch2_btree_node_iter_peek_all(node_iter, b),
- &iter->k);
+ goto fixup_done;
+ } else {
+ /* Iterator is after key that changed */
+ return;
}
- return;
found:
set->end = t->end_offset;
@@ -543,85 +559,66 @@ found:
if (set->k == set->end)
bch2_btree_node_iter_set_drop(node_iter, set);
} else {
+ /* Iterator is after key that changed */
set->k = (int) set->k + shift;
- goto iter_current_key_not_modified;
+ return;
}
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
-
bch2_btree_node_iter_sort(node_iter, b);
- if (!b->level && node_iter == &iter->l[0].iter) {
- /*
- * not legal to call bkey_debugcheck() here, because we're
- * called midway through the update path after update has been
- * marked but before deletes have actually happened:
- */
-#if 0
- __btree_iter_peek_all(iter, &iter->l[0], &iter->k);
-#endif
- struct btree_iter_level *l = &iter->l[0];
- struct bkey_packed *k =
- bch2_btree_node_iter_peek_all(&l->iter, l->b);
-
- if (unlikely(!k))
- iter->k.type = KEY_TYPE_deleted;
- else
- bkey_disassemble(l->b, k, &iter->k);
- }
-iter_current_key_not_modified:
+fixup_done:
+ if (node_iter->data[0].k != orig_iter_pos)
+ iter_current_key_modified = true;
/*
- * Interior nodes are special because iterators for interior nodes don't
- * obey the usual invariants regarding the iterator position:
- *
- * We may have whiteouts that compare greater than the iterator
- * position, and logically should be in the iterator, but that we
- * skipped past to find the first live key greater than the iterator
- * position. This becomes an issue when we insert a new key that is
- * greater than the current iterator position, but smaller than the
- * whiteouts we've already skipped past - this happens in the course of
- * a btree split.
- *
- * We have to rewind the iterator past to before those whiteouts here,
- * else bkey_node_iter_prev() is not going to work and who knows what
- * else would happen. And we have to do it manually, because here we've
- * already done the insert and the iterator is currently inconsistent:
- *
- * We've got multiple competing invariants, here - we have to be careful
- * about rewinding iterators for interior nodes, because they should
- * always point to the key for the child node the btree iterator points
- * to.
+ * When a new key is added, and the node iterator now points to that
+ * key, the iterator might have skipped past deleted keys that should
+ * come after the key the iterator now points to. We have to rewind to
+ * before those deleted keys - otherwise
+ * bch2_btree_node_iter_prev_all() breaks:
*/
- if (b->level && new_u64s &&
- btree_iter_pos_cmp(iter, b, where) > 0) {
+ if (!bch2_btree_node_iter_end(node_iter) &&
+ iter_current_key_modified &&
+ (b->level ||
+ (iter->flags & BTREE_ITER_IS_EXTENTS))) {
struct bset_tree *t;
- struct bkey_packed *k;
+ struct bkey_packed *k, *k2, *p;
+
+ k = bch2_btree_node_iter_peek_all(node_iter, b);
for_each_bset(b, t) {
- if (bch2_bkey_to_bset(b, where) == t)
+ bool set_pos = false;
+
+ if (node_iter->data[0].end == t->end_offset)
continue;
- k = bch2_bkey_prev_all(b, t,
- bch2_btree_node_iter_bset_pos(node_iter, b, t));
- if (k &&
- bkey_iter_cmp(b, k, where) > 0) {
- struct btree_node_iter_set *set;
- unsigned offset =
- __btree_node_key_to_offset(b, bkey_next(k));
-
- btree_node_iter_for_each(node_iter, set)
- if (set->k == offset) {
- set->k = __btree_node_key_to_offset(b, k);
- bch2_btree_node_iter_sort(node_iter, b);
- goto next_bset;
- }
-
- bch2_btree_node_iter_push(node_iter, b, k,
- btree_bkey_last(b, t));
+ k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t);
+
+ while ((p = bch2_bkey_prev_all(b, t, k2)) &&
+ bkey_iter_cmp(b, k, p) < 0) {
+ k2 = p;
+ set_pos = true;
}
-next_bset:
- t = t;
+
+ if (set_pos)
+ btree_node_iter_set_set_pos(node_iter,
+ b, t, k2);
+ }
+ }
+
+ if (!b->level &&
+ node_iter == &iter->l[0].iter &&
+ iter_current_key_modified) {
+ struct bkey_packed *k =
+ bch2_btree_node_iter_peek_all(node_iter, b);
+
+ if (likely(k)) {
+ bkey_disassemble(b, k, &iter->k);
+ } else {
+ /* XXX: for extents, calculate size of hole? */
+ iter->k.type = KEY_TYPE_deleted;
}
+
+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
}
}
@@ -635,14 +632,18 @@ void bch2_btree_node_iter_fix(struct btree_iter *iter,
struct bset_tree *t = bch2_bkey_to_bset(b, where);
struct btree_iter *linked;
- if (node_iter != &iter->l[b->level].iter)
+ if (node_iter != &iter->l[b->level].iter) {
__bch2_btree_node_iter_fix(iter, b, node_iter, t,
- where, clobber_u64s, new_u64s);
+ where, clobber_u64s, new_u64s);
+ bch2_btree_node_iter_verify(node_iter, b);
+ }
- trans_for_each_iter_with_node(iter->trans, b, linked)
+ trans_for_each_iter_with_node(iter->trans, b, linked) {
__bch2_btree_node_iter_fix(linked, b,
- &linked->l[b->level].iter, t,
- where, clobber_u64s, new_u64s);
+ &linked->l[b->level].iter, t,
+ where, clobber_u64s, new_u64s);
+ __bch2_btree_iter_verify(linked, b);
+ }
}
static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter,
@@ -685,6 +686,13 @@ static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter,
bch2_btree_node_iter_peek(&l->iter, l->b));
}
+static inline struct bkey_s_c __btree_iter_prev(struct btree_iter *iter,
+ struct btree_iter_level *l)
+{
+ return __btree_iter_unpack(iter, l, &iter->k,
+ bch2_btree_node_iter_prev(&l->iter, l->b));
+}
+
static inline bool btree_iter_advance_to_pos(struct btree_iter *iter,
struct btree_iter_level *l,
int max_advance)
@@ -743,18 +751,29 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
btree_node_unlock(iter, b->level + 1);
}
+static inline bool btree_iter_pos_before_node(struct btree_iter *iter,
+ struct btree *b)
+{
+ return bkey_cmp(iter->pos, b->data->min_key) < 0;
+}
+
static inline bool btree_iter_pos_after_node(struct btree_iter *iter,
struct btree *b)
{
- return __btree_iter_pos_cmp(iter, NULL,
- bkey_to_packed(&b->key), true) < 0;
+ int cmp = bkey_cmp(b->key.k.p, iter->pos);
+
+ if (!cmp &&
+ (iter->flags & BTREE_ITER_IS_EXTENTS) &&
+ bkey_cmp(b->key.k.p, POS_MAX))
+ cmp = -1;
+ return cmp < 0;
}
static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
struct btree *b)
{
return iter->btree_id == b->btree_id &&
- bkey_cmp(iter->pos, b->data->min_key) >= 0 &&
+ !btree_iter_pos_before_node(iter, b) &&
!btree_iter_pos_after_node(iter, b);
}
@@ -956,10 +975,10 @@ static void btree_iter_up(struct btree_iter *iter)
btree_node_unlock(iter, iter->level++);
}
-int __must_check __bch2_btree_iter_traverse(struct btree_iter *);
+static int btree_iter_traverse_one(struct btree_iter *);
static int __btree_iter_traverse_all(struct btree_trans *trans,
- struct btree_iter *orig_iter, int ret)
+ struct btree_iter *orig_iter, int ret)
{
struct bch_fs *c = trans->c;
struct btree_iter *iter;
@@ -1003,7 +1022,7 @@ retry_all:
iter = &trans->iters[sorted[i]];
do {
- ret = __bch2_btree_iter_traverse(iter);
+ ret = btree_iter_traverse_one(iter);
} while (ret == -EINTR);
if (ret)
@@ -1021,16 +1040,27 @@ int bch2_btree_iter_traverse_all(struct btree_trans *trans)
return __btree_iter_traverse_all(trans, NULL, 0);
}
-static unsigned btree_iter_up_until_locked(struct btree_iter *iter,
- bool check_pos)
+static inline bool btree_iter_good_node(struct btree_iter *iter,
+ unsigned l, int check_pos)
+{
+ if (!is_btree_node(iter, l) ||
+ !bch2_btree_node_relock(iter, l))
+ return false;
+
+ if (check_pos <= 0 && btree_iter_pos_before_node(iter, iter->l[l].b))
+ return false;
+ if (check_pos >= 0 && btree_iter_pos_after_node(iter, iter->l[l].b))
+ return false;
+ return true;
+}
+
+static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter,
+ int check_pos)
{
unsigned l = iter->level;
while (btree_iter_node(iter, l) &&
- (!is_btree_node(iter, l) ||
- !bch2_btree_node_relock(iter, l) ||
- (check_pos &&
- !btree_iter_pos_in_node(iter, iter->l[l].b)))) {
+ !btree_iter_good_node(iter, l, check_pos)) {
btree_node_unlock(iter, l);
iter->l[l].b = BTREE_ITER_NO_NODE_UP;
l++;
@@ -1048,7 +1078,7 @@ static unsigned btree_iter_up_until_locked(struct btree_iter *iter,
* On error, caller (peek_node()/peek_key()) must return NULL; the error is
* stashed in the iterator and returned from bch2_trans_exit().
*/
-int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
+static int btree_iter_traverse_one(struct btree_iter *iter)
{
unsigned depth_want = iter->level;
@@ -1062,7 +1092,7 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
* XXX: correctly using BTREE_ITER_UPTODATE should make using check_pos
* here unnecessary
*/
- iter->level = btree_iter_up_until_locked(iter, true);
+ iter->level = btree_iter_up_until_good_node(iter, 0);
/*
* If we've got a btree node locked (i.e. we aren't about to relock the
@@ -1070,8 +1100,11 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
*
* XXX correctly using BTREE_ITER_UPTODATE should make this unnecessary
*/
- if (btree_iter_node(iter, iter->level))
+ if (btree_iter_node(iter, iter->level)) {
+ BUG_ON(!btree_iter_pos_in_node(iter, iter->l[iter->level].b));
+
btree_iter_advance_to_pos(iter, &iter->l[iter->level], -1);
+ }
/*
* Note: iter->nodes[iter->level] may be temporarily NULL here - that
@@ -1100,12 +1133,12 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
return 0;
}
-int __must_check bch2_btree_iter_traverse(struct btree_iter *iter)
+int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
{
int ret;
ret = bch2_trans_cond_resched(iter->trans) ?:
- __bch2_btree_iter_traverse(iter);
+ btree_iter_traverse_one(iter);
if (unlikely(ret))
ret = __btree_iter_traverse_all(iter->trans, iter, ret);
@@ -1234,19 +1267,11 @@ void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_
btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
}
-void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
+static unsigned btree_iter_pos_changed(struct btree_iter *iter, int cmp)
{
- int cmp = bkey_cmp(new_pos, iter->pos);
- unsigned level;
-
- if (!cmp)
- return;
+ unsigned l = btree_iter_up_until_good_node(iter, cmp);
- iter->pos = new_pos;
-
- level = btree_iter_up_until_locked(iter, true);
-
- if (btree_iter_node(iter, level)) {
+ if (btree_iter_node(iter, l)) {
/*
* We might have to skip over many keys, or just a few: try
* advancing the node iterator, and if we have to skip over too
@@ -1254,37 +1279,98 @@ void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
* is expensive).
*/
if (cmp < 0 ||
- !btree_iter_advance_to_pos(iter, &iter->l[level], 8))
- __btree_iter_init(iter, level);
+ !btree_iter_advance_to_pos(iter, &iter->l[l], 8))
+ __btree_iter_init(iter, l);
/* Don't leave it locked if we're not supposed to: */
- if (btree_lock_want(iter, level) == BTREE_NODE_UNLOCKED)
- btree_node_unlock(iter, level);
+ if (btree_lock_want(iter, l) == BTREE_NODE_UNLOCKED)
+ btree_node_unlock(iter, l);
}
- if (level != iter->level)
+ return l;
+}
+
+void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
+{
+ int cmp = bkey_cmp(new_pos, iter->pos);
+ unsigned l;
+
+ if (!cmp)
+ return;
+
+ iter->pos = new_pos;
+
+ l = btree_iter_pos_changed(iter, cmp);
+
+ if (l != iter->level)
btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
else
btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
}
+static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter)
+{
+ struct btree_iter_level *l = &iter->l[0];
+
+ iter->pos = l->b->key.k.p;
+ iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
+
+ if (!bkey_cmp(iter->pos, POS_MAX)) {
+ bkey_init(&iter->k);
+ iter->k.p = POS_MAX;
+ return false;
+ }
+
+ iter->pos = btree_type_successor(iter->btree_id, iter->pos);
+ btree_iter_pos_changed(iter, 1);
+ return true;
+}
+
+static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter)
+{
+ struct btree_iter_level *l = &iter->l[0];
+
+ iter->pos = l->b->data->min_key;
+ iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
+
+ if (!bkey_cmp(iter->pos, POS_MIN)) {
+ bkey_init(&iter->k);
+ iter->k.p = POS_MIN;
+ return false;
+ }
+
+ iter->pos = btree_type_predecessor(iter->btree_id, iter->pos);
+ btree_iter_pos_changed(iter, -1);
+ return true;
+}
+
static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter)
{
struct btree_iter_level *l = &iter->l[0];
struct bkey_s_c ret = { .k = &iter->k };
if (!bkey_deleted(&iter->k)) {
- EBUG_ON(bch2_btree_node_iter_end(&l->iter));
- ret.v = bkeyp_val(&l->b->format,
- __bch2_btree_node_iter_peek_all(&l->iter, l->b));
+ struct bkey_packed *_k =
+ __bch2_btree_node_iter_peek_all(&l->iter, l->b);
+
+ ret.v = bkeyp_val(&l->b->format, _k);
+
+ if (debug_check_iterators(iter->trans->c)) {
+ struct bkey k = bkey_unpack_key(l->b, _k);
+ BUG_ON(memcmp(&k, &iter->k, sizeof(k)));
+ }
+
+ if (debug_check_bkeys(iter->trans->c))
+ bch2_bkey_debugcheck(iter->trans->c, l->b, ret);
}
- if (debug_check_bkeys(iter->trans->c) &&
- !bkey_deleted(ret.k))
- bch2_bkey_debugcheck(iter->trans->c, l->b, ret);
return ret;
}
+/**
+ * bch2_btree_iter_peek: returns first key greater than or equal to iterator's
+ * current position
+ */
struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
{
struct btree_iter_level *l = &iter->l[0];
@@ -1297,24 +1383,16 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
return btree_iter_peek_uptodate(iter);
while (1) {
- if (iter->uptodate >= BTREE_ITER_NEED_RELOCK) {
- ret = bch2_btree_iter_traverse(iter);
- if (unlikely(ret))
- return bkey_s_c_err(ret);
- }
+ ret = bch2_btree_iter_traverse(iter);
+ if (unlikely(ret))
+ return bkey_s_c_err(ret);
k = __btree_iter_peek(iter, l);
if (likely(k.k))
break;
- /* got to the end of the leaf, iterator needs to be traversed: */
- iter->pos = l->b->key.k.p;
- iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
-
- if (!bkey_cmp(iter->pos, POS_MAX))
+ if (!btree_iter_set_pos_to_next_leaf(iter))
return bkey_s_c_null;
-
- iter->pos = btree_type_successor(iter->btree_id, iter->pos);
}
/*
@@ -1329,22 +1407,10 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
return k;
}
-static noinline
-struct bkey_s_c bch2_btree_iter_peek_next_leaf(struct btree_iter *iter)
-{
- struct btree_iter_level *l = &iter->l[0];
-
- iter->pos = l->b->key.k.p;
- iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
-
- if (!bkey_cmp(iter->pos, POS_MAX))
- return bkey_s_c_null;
-
- iter->pos = btree_type_successor(iter->btree_id, iter->pos);
-
- return bch2_btree_iter_peek(iter);
-}
-
+/**
+ * bch2_btree_iter_next: returns first key greater than iterator's current
+ * position
+ */
struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
{
struct btree_iter_level *l = &iter->l[0];
@@ -1353,15 +1419,19 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
- iter->pos = btree_type_successor(iter->btree_id, iter->k.p);
-
if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
+ if (unlikely(!bkey_cmp(iter->k.p, POS_MAX)))
+ return bkey_s_c_null;
+
/*
* XXX: when we just need to relock we should be able to avoid
* calling traverse, but we need to kill BTREE_ITER_NEED_PEEK
* for that to work
*/
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
+ iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
+
+ bch2_btree_iter_set_pos(iter,
+ btree_type_successor(iter->btree_id, iter->k.p));
return bch2_btree_iter_peek(iter);
}
@@ -1369,9 +1439,12 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
do {
bch2_btree_node_iter_advance(&l->iter, l->b);
p = bch2_btree_node_iter_peek_all(&l->iter, l->b);
- if (unlikely(!p))
- return bch2_btree_iter_peek_next_leaf(iter);
- } while (bkey_whiteout(p));
+ } while (likely(p) && bkey_whiteout(p));
+
+ if (unlikely(!p))
+ return btree_iter_set_pos_to_next_leaf(iter)
+ ? bch2_btree_iter_peek(iter)
+ : bkey_s_c_null;
k = __btree_iter_unpack(iter, l, &iter->k, p);
@@ -1380,51 +1453,79 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
return k;
}
-struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
+/**
+ * bch2_btree_iter_peek_prev: returns first key less than or equal to
+ * iterator's current position
+ */
+struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
{
struct btree_iter_level *l = &iter->l[0];
- struct bkey_packed *p;
struct bkey_s_c k;
int ret;
bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
- if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
- k = bch2_btree_iter_peek(iter);
- if (IS_ERR(k.k))
- return k;
- }
+ if (iter->uptodate == BTREE_ITER_UPTODATE)
+ return btree_iter_peek_uptodate(iter);
while (1) {
- p = bch2_btree_node_iter_prev(&l->iter, l->b);
- if (likely(p))
- break;
-
- iter->pos = l->b->data->min_key;
- if (!bkey_cmp(iter->pos, POS_MIN))
- return bkey_s_c_null;
-
- bch2_btree_iter_set_pos(iter,
- btree_type_predecessor(iter->btree_id, iter->pos));
-
ret = bch2_btree_iter_traverse(iter);
if (unlikely(ret))
return bkey_s_c_err(ret);
- p = bch2_btree_node_iter_peek(&l->iter, l->b);
- if (p)
+ k = __btree_iter_peek(iter, l);
+ if (!k.k ||
+ bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
+ k = __btree_iter_prev(iter, l);
+
+ if (likely(k.k))
break;
- }
- k = __btree_iter_unpack(iter, l, &iter->k, p);
+ if (!btree_iter_set_pos_to_prev_leaf(iter))
+ return bkey_s_c_null;
+ }
EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0);
-
iter->pos = bkey_start_pos(k.k);
iter->uptodate = BTREE_ITER_UPTODATE;
return k;
}
+/**
+ * bch2_btree_iter_prev: returns first key less than iterator's current
+ * position
+ */
+struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
+{
+ struct btree_iter_level *l = &iter->l[0];
+ struct bkey_s_c k;
+
+ bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
+
+ if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
+ /*
+ * XXX: when we just need to relock we should be able to avoid
+ * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK
+ * for that to work
+ */
+ iter->pos = btree_type_predecessor(iter->btree_id,
+ iter->pos);
+ iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
+
+ return bch2_btree_iter_peek_prev(iter);
+ }
+
+ k = __btree_iter_prev(iter, l);
+ if (unlikely(!k.k))
+ return btree_iter_set_pos_to_prev_leaf(iter)
+ ? bch2_btree_iter_peek(iter)
+ : bkey_s_c_null;
+
+ EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) >= 0);
+ iter->pos = bkey_start_pos(k.k);
+ return k;
+}
+
static inline struct bkey_s_c
__bch2_btree_iter_peek_slot_extents(struct btree_iter *iter)
{
@@ -1436,8 +1537,7 @@ __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter)
recheck:
while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k &&
- bkey_deleted(k.k) &&
- bkey_cmp(bkey_start_pos(k.k), iter->pos) == 0)
+ bkey_cmp(k.k->p, iter->pos) <= 0)
bch2_btree_node_iter_advance(&l->iter, l->b);
/*
@@ -1477,6 +1577,8 @@ recheck:
EBUG_ON(bkey_cmp(k.k->p, iter->pos) < 0);
EBUG_ON(bkey_deleted(k.k));
iter->uptodate = BTREE_ITER_UPTODATE;
+
+ __bch2_btree_iter_verify(iter, l->b);
return k;
}
@@ -1507,6 +1609,8 @@ recheck:
iter->k = n;
iter->uptodate = BTREE_ITER_UPTODATE;
+
+ __bch2_btree_iter_verify(iter, l->b);
return (struct bkey_s_c) { &iter->k, NULL };
}
@@ -1539,19 +1643,18 @@ recheck:
goto recheck;
}
- if (k.k &&
- !bkey_deleted(k.k) &&
- !bkey_cmp(iter->pos, k.k->p)) {
- iter->uptodate = BTREE_ITER_UPTODATE;
- return k;
- } else {
+ if (!k.k ||
+ bkey_deleted(k.k) ||
+ bkey_cmp(iter->pos, k.k->p)) {
/* hole */
bkey_init(&iter->k);
iter->k.p = iter->pos;
-
- iter->uptodate = BTREE_ITER_UPTODATE;
- return (struct bkey_s_c) { &iter->k, NULL };
+ k = (struct bkey_s_c) { &iter->k, NULL };
}
+
+ iter->uptodate = BTREE_ITER_UPTODATE;
+ __bch2_btree_iter_verify(iter, l->b);
+ return k;
}
struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
@@ -1563,11 +1666,9 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
if (iter->uptodate == BTREE_ITER_UPTODATE)
return btree_iter_peek_uptodate(iter);
- if (iter->uptodate >= BTREE_ITER_NEED_RELOCK) {
- ret = bch2_btree_iter_traverse(iter);
- if (unlikely(ret))
- return bkey_s_c_err(ret);
- }
+ ret = bch2_btree_iter_traverse(iter);
+ if (unlikely(ret))
+ return bkey_s_c_err(ret);
return __bch2_btree_iter_peek_slot(iter);
}
@@ -1669,7 +1770,10 @@ int bch2_trans_iter_free_on_commit(struct btree_trans *trans,
static int bch2_trans_realloc_iters(struct btree_trans *trans,
unsigned new_size)
{
- void *new_iters, *new_updates;
+ void *new_iters, *new_updates, *new_sorted;
+ size_t iters_bytes;
+ size_t updates_bytes;
+ size_t sorted_bytes;
new_size = roundup_pow_of_two(new_size);
@@ -1682,9 +1786,13 @@ static int bch2_trans_realloc_iters(struct btree_trans *trans,
bch2_trans_unlock(trans);
- new_iters = kmalloc(sizeof(struct btree_iter) * new_size +
- sizeof(struct btree_insert_entry) * (new_size + 4),
- GFP_NOFS);
+ iters_bytes = sizeof(struct btree_iter) * new_size;
+ updates_bytes = sizeof(struct btree_insert_entry) * (new_size + 4);
+ sorted_bytes = sizeof(u8) * (new_size + 4);
+
+ new_iters = kmalloc(iters_bytes +
+ updates_bytes +
+ sorted_bytes, GFP_NOFS);
if (new_iters)
goto success;
@@ -1693,7 +1801,8 @@ static int bch2_trans_realloc_iters(struct btree_trans *trans,
trans->used_mempool = true;
success:
- new_updates = new_iters + sizeof(struct btree_iter) * new_size;
+ new_updates = new_iters + iters_bytes;
+ new_sorted = new_updates + updates_bytes;
memcpy(new_iters, trans->iters,
sizeof(struct btree_iter) * trans->nr_iters);
@@ -1708,9 +1817,10 @@ success:
if (trans->iters != trans->iters_onstack)
kfree(trans->iters);
- trans->iters = new_iters;
- trans->updates = new_updates;
- trans->size = new_size;
+ trans->iters = new_iters;
+ trans->updates = new_updates;
+ trans->updates_sorted = new_sorted;
+ trans->size = new_size;
if (trans->iters_live) {
trace_trans_restart_iters_realloced(trans->ip, trans->size);
@@ -1779,6 +1889,12 @@ found:
iter->flags &= ~(BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
iter->flags |= flags & (BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
+
+ if ((iter->flags & BTREE_ITER_INTENT) &&
+ !bch2_btree_iter_upgrade(iter, 1)) {
+ trace_trans_restart_upgrade(trans->ip);
+ return ERR_PTR(-EINTR);
+ }
}
BUG_ON(iter->btree_id != btree_id);
@@ -1949,6 +2065,7 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
trans->size = ARRAY_SIZE(trans->iters_onstack);
trans->iters = trans->iters_onstack;
trans->updates = trans->updates_onstack;
+ trans->updates_sorted = trans->updates_sorted_onstack;
trans->fs_usage_deltas = NULL;
if (expected_nr_iters > trans->size)
@@ -1973,3 +2090,18 @@ int bch2_trans_exit(struct btree_trans *trans)
return trans->error ? -EIO : 0;
}
+
+void bch2_fs_btree_iter_exit(struct bch_fs *c)
+{
+ mempool_exit(&c->btree_iters_pool);
+}
+
+int bch2_fs_btree_iter_init(struct bch_fs *c)
+{
+ unsigned nr = BTREE_ITER_MAX;
+
+ return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1,
+ sizeof(struct btree_iter) * nr +
+ sizeof(struct btree_insert_entry) * (nr + 4) +
+ sizeof(u8) * (nr + 4));
+}
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index 9483ec8913e3..e4967215e1d9 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -134,7 +134,16 @@ void bch2_btree_iter_node_drop(struct btree_iter *, struct btree *);
void bch2_btree_iter_reinit_node(struct btree_iter *, struct btree *);
-int __must_check bch2_btree_iter_traverse(struct btree_iter *);
+int __must_check __bch2_btree_iter_traverse(struct btree_iter *);
+
+static inline int __must_check
+bch2_btree_iter_traverse(struct btree_iter *iter)
+{
+ return iter->uptodate >= BTREE_ITER_NEED_RELOCK
+ ? __bch2_btree_iter_traverse(iter)
+ : 0;
+}
+
int bch2_btree_iter_traverse_all(struct btree_trans *);
struct btree *bch2_btree_iter_peek_node(struct btree_iter *);
@@ -142,6 +151,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *, unsigned);
struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_next(struct btree_iter *);
+
+struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *);
@@ -242,7 +253,7 @@ static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter,
(_start), (_flags))) ?: \
PTR_ERR_OR_ZERO(((_k) = \
__bch2_btree_iter_peek(_iter, _flags)).k); \
- !ret && (_k).k; \
+ !_ret && (_k).k; \
(_ret) = PTR_ERR_OR_ZERO(((_k) = \
__bch2_btree_iter_next(_iter, _flags)).k))
@@ -303,4 +314,7 @@ void *bch2_trans_kmalloc(struct btree_trans *, size_t);
void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t);
int bch2_trans_exit(struct btree_trans *);
+void bch2_fs_btree_iter_exit(struct bch_fs *);
+int bch2_fs_btree_iter_init(struct bch_fs *);
+
#endif /* _BCACHEFS_BTREE_ITER_H */
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 91aa30a6ed2f..b0da09630911 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -261,8 +261,6 @@ struct btree_insert_entry {
};
bool deferred;
- bool triggered;
- bool marked;
};
#define BTREE_ITER_MAX 64
@@ -291,6 +289,7 @@ struct btree_trans {
struct btree_iter *iters;
struct btree_insert_entry *updates;
+ u8 *updates_sorted;
/* update path: */
struct journal_res journal_res;
@@ -302,6 +301,7 @@ struct btree_trans {
struct btree_iter iters_onstack[2];
struct btree_insert_entry updates_onstack[6];
+ u8 updates_sorted_onstack[6];
struct replicas_delta_list *fs_usage_deltas;
};
@@ -461,7 +461,13 @@ static inline enum btree_node_type btree_node_type(struct btree *b)
static inline bool btree_node_type_is_extents(enum btree_node_type type)
{
- return type == BKEY_TYPE_EXTENTS;
+ switch (type) {
+ case BKEY_TYPE_EXTENTS:
+ case BKEY_TYPE_REFLINK:
+ return true;
+ default:
+ return false;
+ }
}
static inline bool btree_node_is_extents(struct btree *b)
@@ -477,6 +483,7 @@ static inline bool btree_node_type_needs_gc(enum btree_node_type type)
case BKEY_TYPE_EXTENTS:
case BKEY_TYPE_INODES:
case BKEY_TYPE_EC:
+ case BKEY_TYPE_REFLINK:
return true;
default:
return false;
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index 616c103c05ec..36e34b3d9213 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -43,7 +43,6 @@ enum {
__BTREE_INSERT_USE_ALLOC_RESERVE,
__BTREE_INSERT_JOURNAL_REPLAY,
__BTREE_INSERT_JOURNAL_RESERVED,
- __BTREE_INSERT_NOMARK_INSERT,
__BTREE_INSERT_NOMARK_OVERWRITES,
__BTREE_INSERT_NOMARK,
__BTREE_INSERT_MARK_INMEM,
@@ -81,9 +80,6 @@ enum {
#define BTREE_INSERT_JOURNAL_RESERVED (1 << __BTREE_INSERT_JOURNAL_RESERVED)
-/* Don't mark new key, just overwrites: */
-#define BTREE_INSERT_NOMARK_INSERT (1 << __BTREE_INSERT_NOMARK_INSERT)
-
/* Don't mark overwrites, just new key: */
#define BTREE_INSERT_NOMARK_OVERWRITES (1 << __BTREE_INSERT_NOMARK_OVERWRITES)
@@ -123,8 +119,13 @@ int bch2_trans_commit(struct btree_trans *,
struct disk_reservation *,
u64 *, unsigned);
-struct btree_insert_entry *bch2_trans_update(struct btree_trans *,
- struct btree_insert_entry);
+static inline void bch2_trans_update(struct btree_trans *trans,
+ struct btree_insert_entry entry)
+{
+ EBUG_ON(trans->nr_updates >= trans->nr_iters + 4);
+
+ trans->updates[trans->nr_updates++] = entry;
+}
#define bch2_trans_do(_c, _journal_seq, _flags, _do) \
({ \
@@ -144,18 +145,6 @@ struct btree_insert_entry *bch2_trans_update(struct btree_trans *,
_ret; \
})
-/*
- * We sort transaction entries so that if multiple iterators point to the same
- * leaf node they'll be adjacent:
- */
-static inline bool same_leaf_as_prev(struct btree_trans *trans,
- struct btree_insert_entry *i)
-{
- return i != trans->updates &&
- !i->deferred &&
- i[0].iter->l[0].b == i[-1].iter->l[0].b;
-}
-
#define __trans_next_update(_trans, _i, _filter) \
({ \
while ((_i) < (_trans)->updates + (_trans->nr_updates) && !(_filter))\
@@ -175,8 +164,4 @@ static inline bool same_leaf_as_prev(struct btree_trans *trans,
#define trans_for_each_update_iter(trans, i) \
__trans_for_each_update(trans, i, !(i)->deferred)
-#define trans_for_each_update_leaf(trans, i) \
- __trans_for_each_update(trans, i, !(i)->deferred && \
- !same_leaf_as_prev(trans, i))
-
#endif /* _BCACHEFS_BTREE_UPDATE_H */
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 9294137719df..6813eddd26f5 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -194,7 +194,7 @@ found:
: gc_pos_btree_root(as->btree_id)) >= 0 &&
gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0)
bch2_mark_key_locked(c, bkey_i_to_s_c(&d->key),
- 0, NULL, 0,
+ 0, 0, NULL, 0,
BCH_BUCKET_MARK_OVERWRITE|
BCH_BUCKET_MARK_GC);
}
@@ -266,11 +266,12 @@ static void bch2_btree_node_free_ondisk(struct bch_fs *c,
{
BUG_ON(!pending->index_update_done);
- bch2_mark_key(c, bkey_i_to_s_c(&pending->key), 0, NULL, 0,
- BCH_BUCKET_MARK_OVERWRITE);
+ bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
+ 0, 0, NULL, 0, BCH_BUCKET_MARK_OVERWRITE);
if (gc_visited(c, gc_phase(GC_PHASE_PENDING_DELETE)))
- bch2_mark_key(c, bkey_i_to_s_c(&pending->key), 0, NULL, 0,
+ bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
+ 0, 0, NULL, 0,
BCH_BUCKET_MARK_OVERWRITE|
BCH_BUCKET_MARK_GC);
}
@@ -1077,11 +1078,11 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
fs_usage = bch2_fs_usage_scratch_get(c);
bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key),
- 0, fs_usage, 0,
+ 0, 0, fs_usage, 0,
BCH_BUCKET_MARK_INSERT);
if (gc_visited(c, gc_pos_btree_root(b->btree_id)))
bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key),
- 0, NULL, 0,
+ 0, 0, NULL, 0,
BCH_BUCKET_MARK_INSERT|
BCH_BUCKET_MARK_GC);
@@ -1175,12 +1176,12 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
fs_usage = bch2_fs_usage_scratch_get(c);
bch2_mark_key_locked(c, bkey_i_to_s_c(insert),
- 0, fs_usage, 0,
+ 0, 0, fs_usage, 0,
BCH_BUCKET_MARK_INSERT);
if (gc_visited(c, gc_pos_btree_node(b)))
bch2_mark_key_locked(c, bkey_i_to_s_c(insert),
- 0, NULL, 0,
+ 0, 0, NULL, 0,
BCH_BUCKET_MARK_INSERT|
BCH_BUCKET_MARK_GC);
@@ -2003,11 +2004,11 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
fs_usage = bch2_fs_usage_scratch_get(c);
bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i),
- 0, fs_usage, 0,
+ 0, 0, fs_usage, 0,
BCH_BUCKET_MARK_INSERT);
if (gc_visited(c, gc_pos_btree_root(b->btree_id)))
bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i),
- 0, NULL, 0,
+ 0, 0, NULL, 0,
BCH_BUCKET_MARK_INSERT|
BCH_BUCKET_MARK_GC);
diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c
index 4f12108bd6fe..0d32fb8726c7 100644
--- a/fs/bcachefs/btree_update_leaf.c
+++ b/fs/bcachefs/btree_update_leaf.c
@@ -19,6 +19,26 @@
#include <linux/sort.h>
#include <trace/events/bcachefs.h>
+static inline bool same_leaf_as_prev(struct btree_trans *trans,
+ unsigned sorted_idx)
+{
+ struct btree_insert_entry *i = trans->updates +
+ trans->updates_sorted[sorted_idx];
+ struct btree_insert_entry *prev = sorted_idx
+ ? trans->updates + trans->updates_sorted[sorted_idx - 1]
+ : NULL;
+
+ return !i->deferred &&
+ prev &&
+ i->iter->l[0].b == prev->iter->l[0].b;
+}
+
+#define trans_for_each_update_sorted(_trans, _i, _iter) \
+ for (_iter = 0; \
+ _iter < _trans->nr_updates && \
+ (_i = _trans->updates + _trans->updates_sorted[_iter], 1); \
+ _iter++)
+
inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b,
struct btree_iter *iter)
{
@@ -36,20 +56,21 @@ inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b,
bch2_btree_init_next(c, b, iter);
}
-static void btree_trans_lock_write(struct bch_fs *c, struct btree_trans *trans)
+static void btree_trans_lock_write(struct btree_trans *trans, bool lock)
{
+ struct bch_fs *c = trans->c;
struct btree_insert_entry *i;
+ unsigned iter;
- trans_for_each_update_leaf(trans, i)
- bch2_btree_node_lock_for_insert(c, i->iter->l[0].b, i->iter);
-}
-
-static void btree_trans_unlock_write(struct btree_trans *trans)
-{
- struct btree_insert_entry *i;
+ trans_for_each_update_sorted(trans, i, iter) {
+ if (same_leaf_as_prev(trans, iter))
+ continue;
- trans_for_each_update_leaf(trans, i)
- bch2_btree_node_unlock_write(i->iter->l[0].b, i->iter);
+ if (lock)
+ bch2_btree_node_lock_for_insert(c, i->iter->l[0].b, i->iter);
+ else
+ bch2_btree_node_unlock_write(i->iter->l[0].b, i->iter);
+ }
}
static inline int btree_trans_cmp(struct btree_insert_entry l,
@@ -59,6 +80,30 @@ static inline int btree_trans_cmp(struct btree_insert_entry l,
btree_iter_cmp(l.iter, r.iter);
}
+static inline void btree_trans_sort_updates(struct btree_trans *trans)
+{
+ struct btree_insert_entry *l, *r;
+ unsigned nr = 0, pos;
+
+ trans_for_each_update(trans, l) {
+ for (pos = 0; pos < nr; pos++) {
+ r = trans->updates + trans->updates_sorted[pos];
+
+ if (btree_trans_cmp(*l, *r) <= 0)
+ break;
+ }
+
+ memmove(&trans->updates_sorted[pos + 1],
+ &trans->updates_sorted[pos],
+ (nr - pos) * sizeof(trans->updates_sorted[0]));
+
+ trans->updates_sorted[pos] = l - trans->updates;
+ nr++;
+ }
+
+ BUG_ON(nr != trans->nr_updates);
+}
+
/* Inserting into a given leaf node (last stage of insert): */
/* Handle overwrites and do insert, for non extents: */
@@ -106,7 +151,6 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
bch2_bset_delete(b, k, clobber_u64s);
bch2_btree_node_iter_fix(iter, b, node_iter,
k, clobber_u64s, 0);
- bch2_btree_iter_verify(iter, b);
return true;
}
@@ -116,7 +160,6 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
k->type = KEY_TYPE_deleted;
bch2_btree_node_iter_fix(iter, b, node_iter, k,
k->u64s, k->u64s);
- bch2_btree_iter_verify(iter, b);
if (bkey_whiteout(&insert->k)) {
reserve_whiteout(b, k);
@@ -138,10 +181,8 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
clobber_u64s = 0;
overwrite:
bch2_bset_insert(b, node_iter, k, insert, clobber_u64s);
- if (k->u64s != clobber_u64s || bkey_whiteout(&insert->k))
- bch2_btree_node_iter_fix(iter, b, node_iter, k,
- clobber_u64s, k->u64s);
- bch2_btree_iter_verify(iter, b);
+ bch2_btree_node_iter_fix(iter, b, node_iter, k,
+ clobber_u64s, k->u64s);
return true;
}
@@ -400,8 +441,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
BUG_ON(i->iter->level);
BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos));
EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) &&
- !bch2_extent_is_atomic(i->k, i->iter));
-
+ bkey_cmp(i->k->k.p, i->iter->l[0].b->key.k.p) > 0);
EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) &&
!(trans->flags & BTREE_INSERT_ATOMIC));
}
@@ -489,12 +529,12 @@ static int btree_trans_check_can_insert(struct btree_trans *trans,
struct btree_insert_entry **stopped_at)
{
struct btree_insert_entry *i;
- unsigned u64s = 0;
+ unsigned iter, u64s = 0;
int ret;
- trans_for_each_update_iter(trans, i) {
+ trans_for_each_update_sorted(trans, i, iter) {
/* Multiple inserts might go to same leaf: */
- if (!same_leaf_as_prev(trans, i))
+ if (!same_leaf_as_prev(trans, iter))
u64s = 0;
u64s += i->k->k.u64s;
@@ -522,7 +562,8 @@ static inline bool update_triggers_transactional(struct btree_trans *trans,
{
return likely(!(trans->flags & BTREE_INSERT_MARK_INMEM)) &&
(i->iter->btree_id == BTREE_ID_EXTENTS ||
- i->iter->btree_id == BTREE_ID_INODES);
+ i->iter->btree_id == BTREE_ID_INODES ||
+ i->iter->btree_id == BTREE_ID_REFLINK);
}
static inline bool update_has_triggers(struct btree_trans *trans,
@@ -542,7 +583,6 @@ static inline int do_btree_insert_at(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct bch_fs_usage *fs_usage = NULL;
struct btree_insert_entry *i;
- bool saw_non_marked;
unsigned mark_flags = trans->flags & BTREE_INSERT_BUCKET_INVALIDATE
? BCH_BUCKET_MARK_BUCKET_INVALIDATE
: 0;
@@ -551,31 +591,31 @@ static inline int do_btree_insert_at(struct btree_trans *trans,
trans_for_each_update_iter(trans, i)
BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK);
+ /*
+ * note: running triggers will append more updates to the list of
+ * updates as we're walking it:
+ */
trans_for_each_update_iter(trans, i)
- i->marked = false;
+ if (update_has_triggers(trans, i) &&
+ update_triggers_transactional(trans, i)) {
+ ret = bch2_trans_mark_update(trans, i->iter, i->k);
+ if (ret == -EINTR)
+ trace_trans_restart_mark(trans->ip);
+ if (ret)
+ goto out_clear_replicas;
+ }
- do {
- saw_non_marked = false;
+ trans_for_each_update(trans, i)
+ btree_insert_entry_checks(trans, i);
+ bch2_btree_trans_verify_locks(trans);
- trans_for_each_update_iter(trans, i) {
- if (i->marked)
- continue;
-
- saw_non_marked = true;
- i->marked = true;
-
- if (update_has_triggers(trans, i) &&
- update_triggers_transactional(trans, i)) {
- ret = bch2_trans_mark_update(trans, i->iter, i->k);
- if (ret == -EINTR)
- trace_trans_restart_mark(trans->ip);
- if (ret)
- goto out_clear_replicas;
- }
- }
- } while (saw_non_marked);
+ /*
+ * No more updates can be added - sort updates so we can take write
+ * locks in the correct order:
+ */
+ btree_trans_sort_updates(trans);
- btree_trans_lock_write(c, trans);
+ btree_trans_lock_write(trans, true);
if (race_fault()) {
ret = -EINTR;
@@ -593,8 +633,7 @@ static inline int do_btree_insert_at(struct btree_trans *trans,
goto out;
trans_for_each_update_iter(trans, i) {
- if (i->deferred ||
- !btree_node_type_needs_gc(i->iter->btree_id))
+ if (!btree_node_type_needs_gc(i->iter->btree_id))
continue;
if (!fs_usage) {
@@ -660,7 +699,7 @@ out:
(trans->flags & BTREE_INSERT_JOURNAL_RESERVED) &&
trans->journal_res.ref);
- btree_trans_unlock_write(trans);
+ btree_trans_lock_write(trans, false);
if (fs_usage) {
bch2_fs_usage_scratch_put(c, fs_usage);
@@ -685,19 +724,6 @@ int bch2_trans_commit_error(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
unsigned flags = trans->flags;
- struct btree_insert_entry *src, *dst;
-
- src = dst = trans->updates;
-
- while (src < trans->updates + trans->nr_updates) {
- if (!src->triggered) {
- *dst = *src;
- dst++;
- }
- src++;
- }
-
- trans->nr_updates = dst - trans->updates;
/*
* BTREE_INSERT_NOUNLOCK means don't unlock _after_ successful btree
@@ -812,6 +838,7 @@ static int __bch2_trans_commit(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct btree_insert_entry *i;
+ unsigned iter;
int ret;
trans_for_each_update_iter(trans, i) {
@@ -833,8 +860,10 @@ static int __bch2_trans_commit(struct btree_trans *trans,
if (trans->flags & BTREE_INSERT_NOUNLOCK)
trans->nounlock = true;
- trans_for_each_update_leaf(trans, i)
- bch2_foreground_maybe_merge(c, i->iter, 0, trans->flags);
+ trans_for_each_update_sorted(trans, i, iter)
+ if (!same_leaf_as_prev(trans, iter))
+ bch2_foreground_maybe_merge(c, i->iter,
+ 0, trans->flags);
trans->nounlock = false;
@@ -853,8 +882,9 @@ int bch2_trans_commit(struct btree_trans *trans,
unsigned flags)
{
struct bch_fs *c = trans->c;
- struct btree_insert_entry *i;
- unsigned orig_mem_top = trans->mem_top;
+ struct btree_insert_entry *i = NULL;
+ unsigned orig_nr_updates = trans->nr_updates;
+ unsigned orig_mem_top = trans->mem_top;
int ret = 0;
if (!trans->nr_updates)
@@ -875,10 +905,6 @@ int bch2_trans_commit(struct btree_trans *trans,
trans->journal_seq = journal_seq;
trans->flags = flags;
- trans_for_each_update(trans, i)
- btree_insert_entry_checks(trans, i);
- bch2_btree_trans_verify_locks(trans);
-
if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW) &&
!percpu_ref_tryget(&c->writes))) {
if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW)))
@@ -923,8 +949,6 @@ out_noupdates:
bch2_trans_unlink_iters(trans, ~trans->iters_touched|
trans->iters_unlink_on_commit);
trans->iters_touched = 0;
- } else {
- bch2_trans_unlink_iters(trans, trans->iters_unlink_on_commit);
}
trans->nr_updates = 0;
trans->mem_top = 0;
@@ -933,39 +957,20 @@ out_noupdates:
err:
ret = bch2_trans_commit_error(trans, i, ret);
+ /* free updates and memory used by triggers, they'll be reexecuted: */
+ trans->nr_updates = orig_nr_updates;
+ trans->mem_top = orig_mem_top;
+
/* can't loop if it was passed in and we changed it: */
if (unlikely(trans->flags & BTREE_INSERT_NO_CLEAR_REPLICAS) && !ret)
ret = -EINTR;
- if (!ret) {
- /* free memory used by triggers, they'll be reexecuted: */
- trans->mem_top = orig_mem_top;
+ if (!ret)
goto retry;
- }
goto out;
}
-struct btree_insert_entry *bch2_trans_update(struct btree_trans *trans,
- struct btree_insert_entry entry)
-{
- struct btree_insert_entry *i;
-
- BUG_ON(trans->nr_updates >= trans->nr_iters + 4);
-
- for (i = trans->updates;
- i < trans->updates + trans->nr_updates;
- i++)
- if (btree_trans_cmp(entry, *i) < 0)
- break;
-
- memmove(&i[1], &i[0],
- (void *) &trans->updates[trans->nr_updates] - (void *) i);
- trans->nr_updates++;
- *i = entry;
- return i;
-}
-
/**
* bch2_btree_insert - insert keys into the extent btree
* @c: pointer to struct bch_fs
@@ -1033,7 +1038,10 @@ retry:
/* create the biggest key we can */
bch2_key_resize(&delete.k, max_sectors);
bch2_cut_back(end, &delete.k);
- bch2_extent_trim_atomic(&delete, iter);
+
+ ret = bch2_extent_trim_atomic(&delete, iter);
+ if (ret)
+ break;
}
bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &delete));
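
[Editor's illustration, not part of the patch] btree_trans_sort_updates() above keeps trans->updates[] in append order and builds a separate permutation in trans->updates_sorted[], so write locks and the leaf-merge pass can walk the entries in btree-position order without reordering the updates themselves. A standalone sketch of that index-sorting pattern, assuming plain integer keys stand in for insert entries:

#include <stdio.h>
#include <string.h>

struct entry { int key; };	/* stand-in for struct btree_insert_entry */

static void sort_indices(const struct entry *e, unsigned char *sorted,
			 unsigned nr_entries)
{
	unsigned nr = 0, pos, i;

	for (i = 0; i < nr_entries; i++) {
		/* find the slot for entry i in the sorted index array */
		for (pos = 0; pos < nr; pos++)
			if (e[i].key <= e[sorted[pos]].key)
				break;

		/* shift the tail of the index array, not the entries */
		memmove(&sorted[pos + 1], &sorted[pos],
			(nr - pos) * sizeof(sorted[0]));
		sorted[pos] = i;
		nr++;
	}
}

int main(void)
{
	struct entry e[] = { { 30 }, { 10 }, { 20 } };
	unsigned char sorted[3];
	unsigned i;

	sort_indices(e, sorted, 3);
	for (i = 0; i < 3; i++)
		printf("%u ", sorted[i]);	/* prints "1 2 0" */
	printf("\n");
	return 0;
}
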
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index b6b3ac5111ca..6a4773a92029 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -405,7 +405,8 @@ int bch2_fs_usage_apply(struct bch_fs *c,
*/
should_not_have_added = added - (s64) (disk_res ? disk_res->sectors : 0);
if (WARN_ONCE(should_not_have_added > 0,
- "disk usage increased without a reservation")) {
+ "disk usage increased by %lli without a reservation",
+ should_not_have_added)) {
atomic64_sub(should_not_have_added, &c->sectors_available);
added -= should_not_have_added;
ret = -1;
@@ -444,12 +445,6 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
percpu_rwsem_assert_held(&c->mark_lock);
- bch2_fs_inconsistent_on(old.data_type && new.data_type &&
- old.data_type != new.data_type, c,
- "different types of data in same bucket: %s, %s",
- bch2_data_types[old.data_type],
- bch2_data_types[new.data_type]);
-
preempt_disable();
dev_usage = this_cpu_ptr(ca->usage[gc]);
@@ -504,14 +499,6 @@ void bch2_dev_usage_from_buckets(struct bch_fs *c)
}
}
-#define bucket_data_cmpxchg(c, ca, fs_usage, g, new, expr) \
-({ \
- struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \
- \
- bch2_dev_usage_update(c, ca, fs_usage, _old, new, gc); \
- _old; \
-})
-
static inline void update_replicas(struct bch_fs *c,
struct bch_fs_usage *fs_usage,
struct bch_replicas_entry *r,
@@ -520,7 +507,6 @@ static inline void update_replicas(struct bch_fs *c,
int idx = bch2_replicas_entry_idx(c, r);
BUG_ON(idx < 0);
- BUG_ON(!sectors);
switch (r->data_type) {
case BCH_DATA_BTREE:
@@ -569,8 +555,12 @@ static inline void update_replicas_list(struct btree_trans *trans,
{
struct replicas_delta_list *d;
struct replicas_delta *n;
- unsigned b = replicas_entry_bytes(r) + 8;
+ unsigned b;
+
+ if (!sectors)
+ return;
+ b = replicas_entry_bytes(r) + 8;
d = replicas_deltas_realloc(trans, b);
n = (void *) d->d + d->used;
@@ -629,17 +619,18 @@ static int __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
struct bucket *g = __bucket(ca, b, gc);
struct bucket_mark old, new;
- old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
+ old = bucket_cmpxchg(g, new, ({
BUG_ON(!is_available_bucket(new));
new.owned_by_allocator = true;
- new.dirty = true;
new.data_type = 0;
new.cached_sectors = 0;
new.dirty_sectors = 0;
new.gen++;
}));
+ bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
+
if (old.cached_sectors)
update_cached_sectors(c, fs_usage, ca->dev_idx,
-((s64) old.cached_sectors));
@@ -668,10 +659,12 @@ static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
struct bucket *g = __bucket(ca, b, gc);
struct bucket_mark old, new;
- old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
+ old = bucket_cmpxchg(g, new, ({
new.owned_by_allocator = owned_by_allocator;
}));
+ bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
+
BUG_ON(!gc &&
!owned_by_allocator && !old.owned_by_allocator);
@@ -773,11 +766,16 @@ static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
type != BCH_DATA_JOURNAL);
old = bucket_cmpxchg(g, new, ({
- new.dirty = true;
new.data_type = type;
overflow = checked_add(new.dirty_sectors, sectors);
}));
+ bch2_fs_inconsistent_on(old.data_type &&
+ old.data_type != type, c,
+ "different types of data in same bucket: %s, %s",
+ bch2_data_types[old.data_type],
+ bch2_data_types[type]);
+
bch2_fs_inconsistent_on(overflow, c,
"bucket sector count overflow: %u + %u > U16_MAX",
old.dirty_sectors, sectors);
@@ -810,23 +808,24 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
}
static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p,
- s64 delta)
+ unsigned offset, s64 delta,
+ unsigned flags)
{
- if (delta > 0) {
- /*
- * marking a new extent, which _will have size_ @delta
- *
- * in the bch2_mark_update -> BCH_EXTENT_OVERLAP_MIDDLE
- * case, we haven't actually created the key we'll be inserting
- * yet (for the split) - so we don't want to be using
- * k->size/crc.live_size here:
- */
- return __ptr_disk_sectors(p, delta);
+ if (flags & BCH_BUCKET_MARK_OVERWRITE_SPLIT) {
+ BUG_ON(offset + -delta > p.crc.live_size);
+
+ return -((s64) ptr_disk_sectors(p)) +
+ __ptr_disk_sectors(p, offset) +
+ __ptr_disk_sectors(p, p.crc.live_size -
+ offset + delta);
+ } else if (flags & BCH_BUCKET_MARK_OVERWRITE) {
+ BUG_ON(offset + -delta > p.crc.live_size);
+
+ return -((s64) ptr_disk_sectors(p)) +
+ __ptr_disk_sectors(p, p.crc.live_size +
+ delta);
} else {
- BUG_ON(-delta > p.crc.live_size);
-
- return (s64) __ptr_disk_sectors(p, p.crc.live_size + delta) -
- (s64) ptr_disk_sectors(p);
+ return ptr_disk_sectors(p);
}
}
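
[Editor's note, illustrative numbers only] To make the new delta math concrete: for an uncompressed pointer with crc.live_size = 100, an overwrite that punches out sectors 20..29 of the extent arrives here as offset = 20, delta = -10. The OVERWRITE_SPLIT branch returns -100 + 20 + (100 - 20 - 10) = -10, exactly the overwritten sectors; the plain OVERWRITE branch is the same calculation with a single surviving piece, -100 + (100 - 10) = -10. The three terms are kept separate because, for compressed extents, each surviving piece rounds its on-disk size up independently.
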
@@ -846,16 +845,35 @@ static void bucket_set_stripe(struct bch_fs *c,
struct bucket *g = PTR_BUCKET(ca, ptr, gc);
struct bucket_mark new, old;
- BUG_ON(ptr_stale(ca, ptr));
-
- old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
- new.dirty = true;
+ old = bucket_cmpxchg(g, new, ({
new.stripe = enabled;
if (journal_seq) {
new.journal_seq_valid = 1;
new.journal_seq = journal_seq;
}
}));
+
+ bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
+
+ /*
+ * XXX write repair code for these, flag stripe as possibly bad
+ */
+ if (old.gen != ptr->gen)
+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ "stripe with stale pointer");
+#if 0
+ /*
+ * We'd like to check for these, but these checks don't work
+ * yet:
+ */
+ if (old.stripe && enabled)
+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ "multiple stripes using same bucket");
+
+ if (!old.stripe && !enabled)
+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ "deleting stripe but bucket not marked as stripe bucket");
+#endif
}
}
@@ -876,17 +894,23 @@ static bool bch2_mark_pointer(struct bch_fs *c,
do {
new.v.counter = old.v.counter = v;
- new.dirty = true;
-
/*
* Check this after reading bucket mark to guard against
* the allocator invalidating a bucket after we've already
* checked the gen
*/
- if (gen_after(new.gen, p.ptr.gen)) {
- BUG_ON(!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags));
- EBUG_ON(!p.ptr.cached &&
- test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags));
+ if (gen_after(p.ptr.gen, new.gen)) {
+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ "pointer gen in the future");
+ return true;
+ }
+
+ if (new.gen != p.ptr.gen) {
+ /* XXX write repair code for this */
+ if (!p.ptr.cached &&
+ test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ "stale dirty pointer");
return true;
}
@@ -915,6 +939,14 @@ static bool bch2_mark_pointer(struct bch_fs *c,
old.v.counter,
new.v.counter)) != old.v.counter);
+ if (old.data_type && old.data_type != data_type)
+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ "bucket %u:%zu gen %u different types of data in same bucket: %s, %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ new.gen,
+ bch2_data_types[old.data_type],
+ bch2_data_types[data_type]);
+
bch2_fs_inconsistent_on(overflow, c,
"bucket sector count overflow: %u + %lli > U16_MAX",
!p.ptr.cached
@@ -950,7 +982,7 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c,
spin_unlock(&c->ec_stripes_heap_lock);
bch_err_ratelimited(c, "pointer to nonexistent stripe %llu",
(u64) p.idx);
- return -1;
+ return -EIO;
}
BUG_ON(m->r.e.data_type != data_type);
@@ -985,7 +1017,8 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c,
}
static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
- s64 sectors, enum bch_data_type data_type,
+ unsigned offset, s64 sectors,
+ enum bch_data_type data_type,
struct bch_fs_usage *fs_usage,
unsigned journal_seq, unsigned flags)
{
@@ -1006,12 +1039,12 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
s64 disk_sectors = data_type == BCH_DATA_BTREE
? sectors
- : ptr_disk_sectors_delta(p, sectors);
+ : ptr_disk_sectors_delta(p, offset, sectors, flags);
bool stale = bch2_mark_pointer(c, p, disk_sectors, data_type,
fs_usage, journal_seq, flags);
if (p.ptr.cached) {
- if (disk_sectors && !stale)
+ if (!stale)
update_cached_sectors(c, fs_usage, p.ptr.dev,
disk_sectors);
} else if (!p.ec_nr) {
@@ -1030,8 +1063,7 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
}
}
- if (dirty_sectors)
- update_replicas(c, fs_usage, &r.e, dirty_sectors);
+ update_replicas(c, fs_usage, &r.e, dirty_sectors);
return 0;
}
@@ -1095,7 +1127,8 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
}
int bch2_mark_key_locked(struct bch_fs *c,
- struct bkey_s_c k, s64 sectors,
+ struct bkey_s_c k,
+ unsigned offset, s64 sectors,
struct bch_fs_usage *fs_usage,
u64 journal_seq, unsigned flags)
{
@@ -1116,11 +1149,12 @@ int bch2_mark_key_locked(struct bch_fs *c,
? c->opts.btree_node_size
: -c->opts.btree_node_size;
- ret = bch2_mark_extent(c, k, sectors, BCH_DATA_BTREE,
+ ret = bch2_mark_extent(c, k, offset, sectors, BCH_DATA_BTREE,
fs_usage, journal_seq, flags);
break;
case KEY_TYPE_extent:
- ret = bch2_mark_extent(c, k, sectors, BCH_DATA_USER,
+ case KEY_TYPE_reflink_v:
+ ret = bch2_mark_extent(c, k, offset, sectors, BCH_DATA_USER,
fs_usage, journal_seq, flags);
break;
case KEY_TYPE_stripe:
@@ -1151,14 +1185,14 @@ int bch2_mark_key_locked(struct bch_fs *c,
}
int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
- s64 sectors,
+ unsigned offset, s64 sectors,
struct bch_fs_usage *fs_usage,
u64 journal_seq, unsigned flags)
{
int ret;
percpu_down_read(&c->mark_lock);
- ret = bch2_mark_key_locked(c, k, sectors,
+ ret = bch2_mark_key_locked(c, k, offset, sectors,
fs_usage, journal_seq, flags);
percpu_up_read(&c->mark_lock);
@@ -1174,8 +1208,11 @@ inline int bch2_mark_overwrite(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct btree *b = iter->l[0].b;
+ unsigned offset = 0;
s64 sectors = 0;
+ flags |= BCH_BUCKET_MARK_OVERWRITE;
+
if (btree_node_is_extents(b)
? bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0
: bkey_cmp(new->k.p, old.k->p))
@@ -1184,35 +1221,33 @@ inline int bch2_mark_overwrite(struct btree_trans *trans,
if (btree_node_is_extents(b)) {
switch (bch2_extent_overlap(&new->k, old.k)) {
case BCH_EXTENT_OVERLAP_ALL:
+ offset = 0;
sectors = -((s64) old.k->size);
break;
case BCH_EXTENT_OVERLAP_BACK:
+ offset = bkey_start_offset(&new->k) -
+ bkey_start_offset(old.k);
sectors = bkey_start_offset(&new->k) -
old.k->p.offset;
break;
case BCH_EXTENT_OVERLAP_FRONT:
+ offset = 0;
sectors = bkey_start_offset(old.k) -
new->k.p.offset;
break;
case BCH_EXTENT_OVERLAP_MIDDLE:
- sectors = old.k->p.offset - new->k.p.offset;
- BUG_ON(sectors <= 0);
-
- bch2_mark_key_locked(c, old, sectors,
- fs_usage, trans->journal_res.seq,
- BCH_BUCKET_MARK_INSERT|flags);
-
- sectors = bkey_start_offset(&new->k) -
- old.k->p.offset;
+ offset = bkey_start_offset(&new->k) -
+ bkey_start_offset(old.k);
+ sectors = -((s64) new->k.size);
+ flags |= BCH_BUCKET_MARK_OVERWRITE_SPLIT;
break;
}
BUG_ON(sectors >= 0);
}
- return bch2_mark_key_locked(c, old, sectors, fs_usage,
- trans->journal_res.seq,
- BCH_BUCKET_MARK_OVERWRITE|flags) ?: 1;
+ return bch2_mark_key_locked(c, old, offset, sectors, fs_usage,
+ trans->journal_res.seq, flags) ?: 1;
}
int bch2_mark_update(struct btree_trans *trans,
@@ -1230,12 +1265,10 @@ int bch2_mark_update(struct btree_trans *trans,
if (!btree_node_type_needs_gc(iter->btree_id))
return 0;
- if (!(trans->flags & BTREE_INSERT_NOMARK_INSERT))
- bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k),
- bpos_min(insert->k->k.p, b->key.k.p).offset -
- bkey_start_offset(&insert->k->k),
- fs_usage, trans->journal_res.seq,
- BCH_BUCKET_MARK_INSERT|flags);
+ bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k),
+ 0, insert->k->k.size,
+ fs_usage, trans->journal_res.seq,
+ BCH_BUCKET_MARK_INSERT|flags);
if (unlikely(trans->flags & BTREE_INSERT_NOMARK_OVERWRITES))
return 0;
@@ -1280,7 +1313,8 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans,
xchg(&warned_disk_usage, 1))
return;
- pr_err("disk usage increased more than %llu sectors reserved", disk_res_sectors);
+ bch_err(c, "disk usage increased more than %llu sectors reserved",
+ disk_res_sectors);
trans_for_each_update_iter(trans, i) {
struct btree_iter *iter = i->iter;
@@ -1295,7 +1329,7 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans,
node_iter = iter->l[0].iter;
while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
- KEY_TYPE_discard))) {
+ KEY_TYPE_discard))) {
struct bkey unpacked;
struct bkey_s_c k;
@@ -1321,16 +1355,18 @@ static int trans_get_key(struct btree_trans *trans,
struct btree_iter **iter,
struct bkey_s_c *k)
{
- unsigned i;
+ struct btree_insert_entry *i;
int ret;
- for (i = 0; i < trans->nr_updates; i++)
- if (!trans->updates[i].deferred &&
- trans->updates[i].iter->btree_id == btree_id &&
- !bkey_cmp(pos, trans->updates[i].iter->pos)) {
- *iter = trans->updates[i].iter;
- *k = bkey_i_to_s_c(trans->updates[i].k);
- return 0;
+ trans_for_each_update_iter(trans, i)
+ if (i->iter->btree_id == btree_id &&
+ (btree_node_type_is_extents(btree_id)
+ ? bkey_cmp(pos, bkey_start_pos(&i->k->k)) >= 0 &&
+ bkey_cmp(pos, i->k->k.p) < 0
+ : !bkey_cmp(pos, i->iter->pos))) {
+ *iter = i->iter;
+ *k = bkey_i_to_s_c(i->k);
+ return 1;
}
*iter = __bch2_trans_get_iter(trans, btree_id, pos,
@@ -1338,6 +1374,8 @@ static int trans_get_key(struct btree_trans *trans,
if (IS_ERR(*iter))
return PTR_ERR(*iter);
+ bch2_trans_iter_free_on_commit(trans, *iter);
+
*k = bch2_btree_iter_peek_slot(*iter);
ret = bkey_err(*k);
if (ret)
@@ -1349,8 +1387,8 @@ static void *trans_update_key(struct btree_trans *trans,
struct btree_iter *iter,
unsigned u64s)
{
+ struct btree_insert_entry *i;
struct bkey_i *new_k;
- unsigned i;
new_k = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
if (IS_ERR(new_k))
@@ -1359,19 +1397,13 @@ static void *trans_update_key(struct btree_trans *trans,
bkey_init(&new_k->k);
new_k->k.p = iter->pos;
- for (i = 0; i < trans->nr_updates; i++)
- if (!trans->updates[i].deferred &&
- trans->updates[i].iter == iter) {
- trans->updates[i].k = new_k;
+ trans_for_each_update_iter(trans, i)
+ if (i->iter == iter) {
+ i->k = new_k;
return new_k;
}
- bch2_trans_update(trans, ((struct btree_insert_entry) {
- .iter = iter,
- .k = new_k,
- .triggered = true,
- }));
-
+ bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, new_k));
return new_k;
}
@@ -1385,43 +1417,76 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans,
struct bkey_s_c k;
struct bkey_alloc_unpacked u;
struct bkey_i_alloc *a;
+ unsigned old;
bool overflow;
int ret;
ret = trans_get_key(trans, BTREE_ID_ALLOC,
POS(p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr)),
&iter, &k);
- if (ret)
+ if (ret < 0)
return ret;
- if (k.k->type != KEY_TYPE_alloc) {
- bch_err_ratelimited(c, "pointer to nonexistent bucket %u:%zu",
- p.ptr.dev,
- PTR_BUCKET_NR(ca, &p.ptr));
- ret = -1;
- goto out;
- }
+ if (!ret) {
+ /*
+ * During journal replay, and if gc repairs alloc info at
+ * runtime, the alloc info in the btree might not be up to date
+ * yet - so, trust the in memory mark:
+ */
+ struct bucket *g;
+ struct bucket_mark m;
- u = bch2_alloc_unpack(k);
+ percpu_down_read(&c->mark_lock);
+ g = bucket(ca, iter->pos.offset);
+ m = READ_ONCE(g->mark);
+ u = alloc_mem_to_key(g, m);
+ percpu_up_read(&c->mark_lock);
+ } else {
+ /*
+ * Unless we're already updating that key:
+ */
+ if (k.k->type != KEY_TYPE_alloc) {
+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ "pointer to nonexistent bucket %llu:%llu",
+ iter->pos.inode, iter->pos.offset);
+ ret = -1;
+ goto out;
+ }
+
+ u = bch2_alloc_unpack(k);
+ }
if (gen_after(u.gen, p.ptr.gen)) {
ret = 1;
goto out;
}
- if (!p.ptr.cached)
+ if (u.data_type && u.data_type != data_type) {
+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s",
+ iter->pos.inode, iter->pos.offset,
+ u.gen,
+ bch2_data_types[u.data_type],
+ bch2_data_types[data_type]);
+ ret = -1;
+ goto out;
+ }
+
+ if (!p.ptr.cached) {
+ old = u.dirty_sectors;
overflow = checked_add(u.dirty_sectors, sectors);
- else
+ } else {
+ old = u.cached_sectors;
overflow = checked_add(u.cached_sectors, sectors);
+ }
u.data_type = u.dirty_sectors || u.cached_sectors
? data_type : 0;
bch2_fs_inconsistent_on(overflow, c,
"bucket sector count overflow: %u + %lli > U16_MAX",
- !p.ptr.cached
- ? u.dirty_sectors
- : u.cached_sectors, sectors);
+ old, sectors);
+ BUG_ON(overflow);
a = trans_update_key(trans, iter, BKEY_ALLOC_U64s_MAX);
ret = PTR_ERR_OR_ZERO(a);
@@ -1440,6 +1505,7 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
struct bch_extent_stripe_ptr p,
s64 sectors, enum bch_data_type data_type)
{
+ struct bch_fs *c = trans->c;
struct bch_replicas_padded r;
struct btree_iter *iter;
struct bkey_i *new_k;
@@ -1449,17 +1515,15 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
s64 parity_sectors;
int ret = 0;
- BUG_ON(!sectors);
-
ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.idx), &iter, &k);
- if (ret)
+ if (ret < 0)
return ret;
if (k.k->type != KEY_TYPE_stripe) {
- bch_err_ratelimited(trans->c,
- "pointer to nonexistent stripe %llu",
- (u64) p.idx);
- ret = -1;
+ bch2_fs_inconsistent(c,
+ "pointer to nonexistent stripe %llu",
+ (u64) p.idx);
+ ret = -EIO;
goto out;
}
@@ -1491,8 +1555,9 @@ out:
}
static int bch2_trans_mark_extent(struct btree_trans *trans,
- struct bkey_s_c k,
- s64 sectors, enum bch_data_type data_type)
+ struct bkey_s_c k, unsigned offset,
+ s64 sectors, unsigned flags,
+ enum bch_data_type data_type)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
@@ -1512,7 +1577,7 @@ static int bch2_trans_mark_extent(struct btree_trans *trans,
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
s64 disk_sectors = data_type == BCH_DATA_BTREE
? sectors
- : ptr_disk_sectors_delta(p, sectors);
+ : ptr_disk_sectors_delta(p, offset, sectors, flags);
ret = bch2_trans_mark_pointer(trans, p, disk_sectors,
data_type);
@@ -1522,7 +1587,7 @@ static int bch2_trans_mark_extent(struct btree_trans *trans,
stale = ret > 0;
if (p.ptr.cached) {
- if (disk_sectors && !stale)
+ if (!stale)
update_cached_sectors_list(trans, p.ptr.dev,
disk_sectors);
} else if (!p.ec_nr) {
@@ -1540,15 +1605,92 @@ static int bch2_trans_mark_extent(struct btree_trans *trans,
}
}
- if (dirty_sectors)
- update_replicas_list(trans, &r.e, dirty_sectors);
+ update_replicas_list(trans, &r.e, dirty_sectors);
return 0;
}
-int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k,
+static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
+ struct bkey_s_c_reflink_p p,
+ u64 idx, unsigned sectors,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter *iter;
+ struct bkey_i *new_k;
+ struct bkey_s_c k;
+ struct bkey_i_reflink_v *r_v;
+ s64 ret;
+
+ ret = trans_get_key(trans, BTREE_ID_REFLINK,
+ POS(0, idx), &iter, &k);
+ if (ret < 0)
+ return ret;
+
+ if (k.k->type != KEY_TYPE_reflink_v) {
+ bch2_fs_inconsistent(c,
+ "%llu:%llu len %u points to nonexistent indirect extent %llu",
+ p.k->p.inode, p.k->p.offset, p.k->size, idx);
+ ret = -EIO;
+ goto err;
+ }
+
+ if ((flags & BCH_BUCKET_MARK_OVERWRITE) &&
+ (bkey_start_offset(k.k) < idx ||
+ k.k->p.offset > idx + sectors))
+ goto out;
+
+ bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k));
+ BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK);
+
+ new_k = trans_update_key(trans, iter, k.k->u64s);
+ ret = PTR_ERR_OR_ZERO(new_k);
+ if (ret)
+ goto err;
+
+ bkey_reassemble(new_k, k);
+ r_v = bkey_i_to_reflink_v(new_k);
+
+ le64_add_cpu(&r_v->v.refcount,
+ !(flags & BCH_BUCKET_MARK_OVERWRITE) ? 1 : -1);
+
+ if (!r_v->v.refcount) {
+ r_v->k.type = KEY_TYPE_deleted;
+ set_bkey_val_u64s(&r_v->k, 0);
+ }
+out:
+ ret = k.k->p.offset - idx;
+err:
+ bch2_trans_iter_put(trans, iter);
+ return ret;
+}
+
+static int bch2_trans_mark_reflink_p(struct btree_trans *trans,
+ struct bkey_s_c_reflink_p p, unsigned offset,
s64 sectors, unsigned flags)
{
+ u64 idx = le64_to_cpu(p.v->idx) + offset;
+ s64 ret = 0;
+
+ sectors = abs(sectors);
+ BUG_ON(offset + sectors > p.k->size);
+
+ while (sectors) {
+ ret = __bch2_trans_mark_reflink_p(trans, p, idx, sectors, flags);
+ if (ret < 0)
+ break;
+
+ idx += ret;
+ sectors = max_t(s64, 0LL, sectors - ret);
+ ret = 0;
+ }
+
+ return ret;
+}
+
+int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k,
+ unsigned offset, s64 sectors, unsigned flags)
+{
struct replicas_delta_list *d;
struct bch_fs *c = trans->c;
@@ -1558,11 +1700,12 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k,
? c->opts.btree_node_size
: -c->opts.btree_node_size;
- return bch2_trans_mark_extent(trans, k, sectors,
- BCH_DATA_BTREE);
+ return bch2_trans_mark_extent(trans, k, offset, sectors,
+ flags, BCH_DATA_BTREE);
case KEY_TYPE_extent:
- return bch2_trans_mark_extent(trans, k, sectors,
- BCH_DATA_USER);
+ case KEY_TYPE_reflink_v:
+ return bch2_trans_mark_extent(trans, k, offset, sectors,
+ flags, BCH_DATA_USER);
case KEY_TYPE_inode:
d = replicas_deltas_realloc(trans, 0);
@@ -1584,6 +1727,10 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k,
d->fs_usage.persistent_reserved[replicas - 1] += sectors;
return 0;
}
+ case KEY_TYPE_reflink_p:
+ return bch2_trans_mark_reflink_p(trans,
+ bkey_s_c_to_reflink_p(k),
+ offset, sectors, flags);
default:
return 0;
}
@@ -1601,19 +1748,21 @@ int bch2_trans_mark_update(struct btree_trans *trans,
if (!btree_node_type_needs_gc(iter->btree_id))
return 0;
- ret = bch2_trans_mark_key(trans,
- bkey_i_to_s_c(insert),
- bpos_min(insert->k.p, b->key.k.p).offset -
- bkey_start_offset(&insert->k),
- BCH_BUCKET_MARK_INSERT);
+ ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(insert),
+ 0, insert->k.size, BCH_BUCKET_MARK_INSERT);
if (ret)
return ret;
+ if (unlikely(trans->flags & BTREE_INSERT_NOMARK_OVERWRITES))
+ return 0;
+
while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
KEY_TYPE_discard))) {
struct bkey unpacked;
struct bkey_s_c k;
+ unsigned offset = 0;
s64 sectors = 0;
+ unsigned flags = BCH_BUCKET_MARK_OVERWRITE;
k = bkey_disassemble(b, _k, &unpacked);
@@ -1625,35 +1774,32 @@ int bch2_trans_mark_update(struct btree_trans *trans,
if (btree_node_is_extents(b)) {
switch (bch2_extent_overlap(&insert->k, k.k)) {
case BCH_EXTENT_OVERLAP_ALL:
+ offset = 0;
sectors = -((s64) k.k->size);
break;
case BCH_EXTENT_OVERLAP_BACK:
+ offset = bkey_start_offset(&insert->k) -
+ bkey_start_offset(k.k);
sectors = bkey_start_offset(&insert->k) -
k.k->p.offset;
break;
case BCH_EXTENT_OVERLAP_FRONT:
+ offset = 0;
sectors = bkey_start_offset(k.k) -
insert->k.p.offset;
break;
case BCH_EXTENT_OVERLAP_MIDDLE:
- sectors = k.k->p.offset - insert->k.p.offset;
- BUG_ON(sectors <= 0);
-
- ret = bch2_trans_mark_key(trans, k, sectors,
- BCH_BUCKET_MARK_INSERT);
- if (ret)
- return ret;
-
- sectors = bkey_start_offset(&insert->k) -
- k.k->p.offset;
+ offset = bkey_start_offset(&insert->k) -
+ bkey_start_offset(k.k);
+ sectors = -((s64) insert->k.size);
+ flags |= BCH_BUCKET_MARK_OVERWRITE_SPLIT;
break;
}
BUG_ON(sectors >= 0);
}
- ret = bch2_trans_mark_key(trans, k, sectors,
- BCH_BUCKET_MARK_OVERWRITE);
+ ret = bch2_trans_mark_key(trans, k, offset, sectors, flags);
if (ret)
return ret;
@@ -1761,7 +1907,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
{
struct bucket_array *buckets = NULL, *old_buckets = NULL;
unsigned long *buckets_nouse = NULL;
- unsigned long *buckets_written = NULL;
alloc_fifo free[RESERVE_NR];
alloc_fifo free_inc;
alloc_heap alloc_heap;
@@ -1790,9 +1935,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
!(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) *
sizeof(unsigned long),
GFP_KERNEL|__GFP_ZERO)) ||
- !(buckets_written = kvpmalloc(BITS_TO_LONGS(nbuckets) *
- sizeof(unsigned long),
- GFP_KERNEL|__GFP_ZERO)) ||
!init_fifo(&free[RESERVE_BTREE], btree_reserve, GFP_KERNEL) ||
!init_fifo(&free[RESERVE_MOVINGGC],
copygc_reserve, GFP_KERNEL) ||
@@ -1824,16 +1966,12 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
memcpy(buckets_nouse,
ca->buckets_nouse,
BITS_TO_LONGS(n) * sizeof(unsigned long));
- memcpy(buckets_written,
- ca->buckets_written,
- BITS_TO_LONGS(n) * sizeof(unsigned long));
}
rcu_assign_pointer(ca->buckets[0], buckets);
buckets = old_buckets;
swap(ca->buckets_nouse, buckets_nouse);
- swap(ca->buckets_written, buckets_written);
if (resize)
percpu_up_write(&c->mark_lock);
@@ -1873,8 +2011,6 @@ err:
free_fifo(&free[i]);
kvpfree(buckets_nouse,
BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
- kvpfree(buckets_written,
- BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
if (buckets)
call_rcu(&old_buckets->rcu, buckets_free_rcu);
@@ -1890,8 +2026,6 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
free_fifo(&ca->free_inc);
for (i = 0; i < RESERVE_NR; i++)
free_fifo(&ca->free[i]);
- kvpfree(ca->buckets_written,
- BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
kvpfree(ca->buckets_nouse,
BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
kvpfree(rcu_dereference_protected(ca->buckets[0], 1),
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 5ab6f3d34137..a4bab66d8d17 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -94,6 +94,15 @@ static inline struct bucket *PTR_BUCKET(struct bch_dev *ca,
return __bucket(ca, PTR_BUCKET_NR(ca, ptr), gc);
}
+static inline enum bch_data_type ptr_data_type(const struct bkey *k,
+ const struct bch_extent_ptr *ptr)
+{
+ if (k->type == KEY_TYPE_btree_ptr)
+ return BCH_DATA_BTREE;
+
+ return ptr->cached ? BCH_DATA_CACHED : BCH_DATA_USER;
+}
+
static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca,
const struct bch_extent_ptr *ptr)
{
@@ -251,14 +260,15 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
#define BCH_BUCKET_MARK_INSERT (1 << 0)
#define BCH_BUCKET_MARK_OVERWRITE (1 << 1)
-#define BCH_BUCKET_MARK_BUCKET_INVALIDATE (1 << 2)
-#define BCH_BUCKET_MARK_GC (1 << 3)
-#define BCH_BUCKET_MARK_ALLOC_READ (1 << 4)
-#define BCH_BUCKET_MARK_NOATOMIC (1 << 5)
+#define BCH_BUCKET_MARK_OVERWRITE_SPLIT (1 << 2)
+#define BCH_BUCKET_MARK_BUCKET_INVALIDATE (1 << 3)
+#define BCH_BUCKET_MARK_GC (1 << 4)
+#define BCH_BUCKET_MARK_ALLOC_READ (1 << 5)
+#define BCH_BUCKET_MARK_NOATOMIC (1 << 6)
-int bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c, s64,
+int bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c, unsigned, s64,
struct bch_fs_usage *, u64, unsigned);
-int bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64,
+int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, s64,
struct bch_fs_usage *, u64, unsigned);
int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
struct disk_reservation *, unsigned);
@@ -272,7 +282,8 @@ int bch2_mark_update(struct btree_trans *, struct btree_insert_entry *,
void bch2_replicas_delta_list_apply(struct bch_fs *,
struct bch_fs_usage *,
struct replicas_delta_list *);
-int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, s64, unsigned);
+int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c,
+ unsigned, s64, unsigned);
int bch2_trans_mark_update(struct btree_trans *,
struct btree_iter *iter,
struct bkey_i *insert);
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
index e51d297976be..94bd9da34847 100644
--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
@@ -15,7 +15,6 @@ struct bucket_mark {
u8 gen;
u8 data_type:3,
owned_by_allocator:1,
- dirty:1,
journal_seq_valid:1,
stripe:1;
u16 dirty_sectors;
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
index 65b9714a1e58..e55aa98cf9ee 100644
--- a/fs/bcachefs/checksum.c
+++ b/fs/bcachefs/checksum.c
@@ -10,7 +10,7 @@
#include <linux/random.h>
#include <linux/scatterlist.h>
#include <crypto/algapi.h>
-#include <crypto/chacha20.h>
+#include <crypto/chacha.h>
#include <crypto/hash.h>
#include <crypto/poly1305.h>
#include <keys/user-type.h>
@@ -61,27 +61,27 @@ static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t
return crc32c(crc, data, len);
case BCH_CSUM_CRC64_NONZERO:
case BCH_CSUM_CRC64:
- return bch2_crc64_update(crc, data, len);
+ return crc64_be(crc, data, len);
default:
BUG();
}
}
-static inline void do_encrypt_sg(struct crypto_skcipher *tfm,
+static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm,
struct nonce nonce,
struct scatterlist *sg, size_t len)
{
- SKCIPHER_REQUEST_ON_STACK(req, tfm);
+ SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
int ret;
- skcipher_request_set_tfm(req, tfm);
+ skcipher_request_set_sync_tfm(req, tfm);
skcipher_request_set_crypt(req, sg, sg, len, nonce.d);
ret = crypto_skcipher_encrypt(req);
BUG_ON(ret);
}
-static inline void do_encrypt(struct crypto_skcipher *tfm,
+static inline void do_encrypt(struct crypto_sync_skcipher *tfm,
struct nonce nonce,
void *buf, size_t len)
{
@@ -94,8 +94,8 @@ static inline void do_encrypt(struct crypto_skcipher *tfm,
int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
void *buf, size_t len)
{
- struct crypto_skcipher *chacha20 =
- crypto_alloc_skcipher("chacha20", 0, 0);
+ struct crypto_sync_skcipher *chacha20 =
+ crypto_alloc_sync_skcipher("chacha20", 0, 0);
int ret;
if (!chacha20) {
@@ -103,7 +103,8 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
return PTR_ERR(chacha20);
}
- ret = crypto_skcipher_setkey(chacha20, (void *) key, sizeof(*key));
+ ret = crypto_skcipher_setkey(&chacha20->base,
+ (void *) key, sizeof(*key));
if (ret) {
pr_err("crypto_skcipher_setkey() error: %i", ret);
goto err;
@@ -111,7 +112,7 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
do_encrypt(chacha20, nonce, buf, len);
err:
- crypto_free_skcipher(chacha20);
+ crypto_free_sync_skcipher(chacha20);
return ret;
}
@@ -199,7 +200,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
kunmap_atomic(p);
}
#else
- __bio_for_each_contig_segment(bv, bio, *iter, *iter)
+ __bio_for_each_bvec(bv, bio, *iter, *iter)
crc = bch2_checksum_update(type, crc,
page_address(bv.bv_page) + bv.bv_offset,
bv.bv_len);
@@ -224,7 +225,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
kunmap_atomic(p);
}
#else
- __bio_for_each_contig_segment(bv, bio, *iter, *iter)
+ __bio_for_each_bvec(bv, bio, *iter, *iter)
crypto_shash_update(desc,
page_address(bv.bv_page) + bv.bv_offset,
bv.bv_len);
@@ -462,7 +463,7 @@ err:
static int bch2_alloc_ciphers(struct bch_fs *c)
{
if (!c->chacha20)
- c->chacha20 = crypto_alloc_skcipher("chacha20", 0, 0);
+ c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0);
if (IS_ERR(c->chacha20)) {
bch_err(c, "error requesting chacha20 module: %li",
PTR_ERR(c->chacha20));
@@ -545,7 +546,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
goto err;
}
- ret = crypto_skcipher_setkey(c->chacha20,
+ ret = crypto_skcipher_setkey(&c->chacha20->base,
(void *) &key.key, sizeof(key.key));
if (ret)
goto err;
@@ -573,7 +574,7 @@ void bch2_fs_encryption_exit(struct bch_fs *c)
if (!IS_ERR_OR_NULL(c->poly1305))
crypto_free_shash(c->poly1305);
if (!IS_ERR_OR_NULL(c->chacha20))
- crypto_free_skcipher(c->chacha20);
+ crypto_free_sync_skcipher(c->chacha20);
if (!IS_ERR_OR_NULL(c->sha256))
crypto_free_shash(c->sha256);
}
@@ -605,7 +606,7 @@ int bch2_fs_encryption_init(struct bch_fs *c)
if (ret)
goto out;
- ret = crypto_skcipher_setkey(c->chacha20,
+ ret = crypto_skcipher_setkey(&c->chacha20->base,
(void *) &key.key, sizeof(key.key));
if (ret)
goto out;
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
index 657679f43b02..b84e81bac8ff 100644
--- a/fs/bcachefs/checksum.h
+++ b/fs/bcachefs/checksum.h
@@ -7,7 +7,7 @@
#include "super-io.h"
#include <linux/crc64.h>
-#include <crypto/chacha20.h>
+#include <crypto/chacha.h>
static inline bool bch2_checksum_mergeable(unsigned type)
{
@@ -25,11 +25,6 @@ static inline bool bch2_checksum_mergeable(unsigned type)
struct bch_csum bch2_checksum_merge(unsigned, struct bch_csum,
struct bch_csum, size_t);
-static inline u64 bch2_crc64_update(u64 crc, const void *p, size_t len)
-{
- return crc64_be(crc, p, len);
-}
-
#define BCH_NONCE_EXTENT cpu_to_le32(1 << 28)
#define BCH_NONCE_BTREE cpu_to_le32(2 << 28)
#define BCH_NONCE_JOURNAL cpu_to_le32(3 << 28)
@@ -143,9 +138,9 @@ static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r)
/* for skipping ahead and encrypting/decrypting at an offset: */
static inline struct nonce nonce_add(struct nonce nonce, unsigned offset)
{
- EBUG_ON(offset & (CHACHA20_BLOCK_SIZE - 1));
+ EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1));
- le32_add_cpu(&nonce.d[0], offset / CHACHA20_BLOCK_SIZE);
+ le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE);
return nonce;
}
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
index a7264d802ed7..3787390da47f 100644
--- a/fs/bcachefs/compress.c
+++ b/fs/bcachefs/compress.c
@@ -66,7 +66,7 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio,
BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max);
#ifndef CONFIG_HIGHMEM
- __bio_for_each_contig_segment(bv, bio, iter, start) {
+ __bio_for_each_bvec(bv, bio, iter, start) {
if (bv.bv_len == start.bi_size)
return (struct bbuf) {
.b = page_address(bv.bv_page) + bv.bv_offset,
@@ -241,10 +241,10 @@ int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio,
}
/*
- * might have to free existing pages and retry allocation from mempool -
- * do this _after_ decompressing:
+ * XXX: don't have a good way to assert that the bio was allocated with
+ * enough space, we depend on bch2_move_extent doing the right thing
*/
- bch2_bio_alloc_more_pages_pool(c, bio, crc->live_size << 9);
+ bio->bi_iter.bi_size = crc->live_size << 9;
memcpy_to_bio(bio, bio->bi_iter, data.b + (crc->offset << 9));
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index c758982bc1af..69b123bad83b 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -70,8 +70,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
bio_set_dev(bio, ca->disk_sb.bdev);
bio->bi_opf = REQ_OP_READ|REQ_META;
bio->bi_iter.bi_sector = pick.ptr.offset;
- bio->bi_iter.bi_size = btree_bytes(c);
- bch2_bio_map(bio, n_sorted);
+ bch2_bio_map(bio, n_sorted, btree_bytes(c));
submit_bio_wait(bio);
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index dba861111a8d..be2eca0fcdf7 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -162,19 +162,20 @@ static int extent_matches_stripe(struct bch_fs *c,
struct bch_stripe *v,
struct bkey_s_c k)
{
- struct bkey_s_c_extent e;
- const struct bch_extent_ptr *ptr;
- int idx;
- if (!bkey_extent_is_data(k.k))
- return -1;
-
- e = bkey_s_c_to_extent(k);
+ switch (k.k->type) {
+ case KEY_TYPE_extent: {
+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+ const struct bch_extent_ptr *ptr;
+ int idx;
- extent_for_each_ptr(e, ptr) {
- idx = ptr_matches_stripe(c, v, ptr);
- if (idx >= 0)
- return idx;
+ extent_for_each_ptr(e, ptr) {
+ idx = ptr_matches_stripe(c, v, ptr);
+ if (idx >= 0)
+ return idx;
+ }
+ break;
+ }
}
return -1;
@@ -182,19 +183,20 @@ static int extent_matches_stripe(struct bch_fs *c,
static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx)
{
- struct bkey_s_c_extent e;
- const union bch_extent_entry *entry;
-
- if (!bkey_extent_is_data(k.k))
- return false;
+ switch (k.k->type) {
+ case KEY_TYPE_extent: {
+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+ const union bch_extent_entry *entry;
- e = bkey_s_c_to_extent(k);
+ extent_for_each_entry(e, entry)
+ if (extent_entry_type(entry) ==
+ BCH_EXTENT_ENTRY_stripe_ptr &&
+ entry->stripe_ptr.idx == idx)
+ return true;
- extent_for_each_entry(e, entry)
- if (extent_entry_type(entry) ==
- BCH_EXTENT_ENTRY_stripe_ptr &&
- entry->stripe_ptr.idx == idx)
- return true;
+ break;
+ }
+ }
return false;
}
@@ -399,11 +401,10 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
bio_set_op_attrs(&ec_bio->bio, rw, 0);
ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9);
- ec_bio->bio.bi_iter.bi_size = b;
ec_bio->bio.bi_end_io = ec_block_endio;
ec_bio->bio.bi_private = cl;
- bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset);
+ bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b);
closure_get(cl);
percpu_ref_get(&ca->io_ref);
@@ -576,7 +577,8 @@ static ssize_t stripe_idx_to_delete(struct bch_fs *c)
{
ec_stripes_heap *h = &c->ec_stripes_heap;
- return h->data[0].blocks_nonempty == 0 ? h->data[0].idx : -1;
+ return h->used && h->data[0].blocks_nonempty == 0
+ ? h->data[0].idx : -1;
}
static inline int ec_stripes_heap_cmp(ec_stripes_heap *h,
@@ -627,7 +629,8 @@ void bch2_stripes_heap_update(struct bch_fs *c,
bch2_stripes_heap_insert(c, m, idx);
}
- if (stripe_idx_to_delete(c) >= 0)
+ if (stripe_idx_to_delete(c) >= 0 &&
+ !percpu_ref_is_dying(&c->writes))
schedule_work(&c->ec_stripe_delete_work);
}
@@ -685,7 +688,8 @@ static void ec_stripe_delete_work(struct work_struct *work)
if (idx < 0)
break;
- ec_stripe_delete(c, idx);
+ if (ec_stripe_delete(c, idx))
+ break;
}
mutex_unlock(&c->ec_stripe_create_lock);
@@ -700,26 +704,34 @@ static int ec_stripe_bkey_insert(struct bch_fs *c,
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
+ struct bpos start_pos = POS(0, c->ec_stripe_hint);
int ret;
bch2_trans_init(&trans, c, 0, 0);
retry:
bch2_trans_begin(&trans);
- /* XXX: start pos hint */
- for_each_btree_key(&trans, iter, BTREE_ID_EC, POS_MIN,
+ for_each_btree_key(&trans, iter, BTREE_ID_EC, start_pos,
BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
- if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0)
+ if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) {
+ if (start_pos.offset) {
+ start_pos = POS_MIN;
+ bch2_btree_iter_set_pos(iter, start_pos);
+ continue;
+ }
+
+ ret = -ENOSPC;
break;
+ }
if (bkey_deleted(k.k))
goto found_slot;
}
- if (!ret)
- ret = -ENOSPC;
goto err;
found_slot:
+ start_pos = iter->pos;
+
ret = ec_stripe_mem_alloc(c, iter);
if (ret)
goto err;
@@ -734,6 +746,8 @@ found_slot:
err:
if (ret == -EINTR)
goto retry;
+
+ c->ec_stripe_hint = ret ? start_pos.offset : start_pos.offset + 1;
bch2_trans_exit(&trans);
return ret;
@@ -1159,12 +1173,8 @@ void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
struct ec_stripe_new *s = NULL;
mutex_lock(&h->lock);
- bch2_open_buckets_stop_dev(c, ca,
- &h->blocks,
- BCH_DATA_USER);
- bch2_open_buckets_stop_dev(c, ca,
- &h->parity,
- BCH_DATA_USER);
+ bch2_open_buckets_stop_dev(c, ca, &h->blocks);
+ bch2_open_buckets_stop_dev(c, ca, &h->parity);
if (!h->s)
goto unlock;
@@ -1265,10 +1275,10 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote)
int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys)
{
- struct journal_key *i;
struct btree_trans trans;
- struct btree_iter *iter;
- struct bkey_s_c k;
+ struct btree_iter *btree_iter;
+ struct journal_iter journal_iter;
+ struct bkey_s_c btree_k, journal_k, k;
int ret;
ret = bch2_fs_ec_start(c);
@@ -1277,10 +1287,41 @@ int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys)
bch2_trans_init(&trans, c, 0, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_EC, POS_MIN, 0, k, ret)
- bch2_mark_key(c, k, 0, NULL, 0,
+ btree_iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN, 0);
+ journal_iter = bch2_journal_iter_init(journal_keys, BTREE_ID_EC);
+
+ btree_k = bch2_btree_iter_peek(btree_iter);
+ journal_k = bch2_journal_iter_peek(&journal_iter);
+
+ while (1) {
+ if (btree_k.k && journal_k.k) {
+ int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p);
+
+ if (cmp < 0) {
+ k = btree_k;
+ btree_k = bch2_btree_iter_next(btree_iter);
+ } else if (cmp == 0) {
+ btree_k = bch2_btree_iter_next(btree_iter);
+ k = journal_k;
+ journal_k = bch2_journal_iter_next(&journal_iter);
+ } else {
+ k = journal_k;
+ journal_k = bch2_journal_iter_next(&journal_iter);
+ }
+ } else if (btree_k.k) {
+ k = btree_k;
+ btree_k = bch2_btree_iter_next(btree_iter);
+ } else if (journal_k.k) {
+ k = journal_k;
+ journal_k = bch2_journal_iter_next(&journal_iter);
+ } else {
+ break;
+ }
+
+ bch2_mark_key(c, k, 0, 0, NULL, 0,
BCH_BUCKET_MARK_ALLOC_READ|
BCH_BUCKET_MARK_NOATOMIC);
+ }
ret = bch2_trans_exit(&trans) ?: ret;
if (ret) {
@@ -1288,13 +1329,6 @@ int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys)
return ret;
}
- for_each_journal_key(*journal_keys, i)
- if (i->btree_id == BTREE_ID_EC)
- bch2_mark_key(c, bkey_i_to_s_c(i->k),
- 0, NULL, 0,
- BCH_BUCKET_MARK_ALLOC_READ|
- BCH_BUCKET_MARK_NOATOMIC);
-
return 0;
}
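
[Editor's illustration, not part of the patch] The rewritten bch2_stripes_read() above is a two-way merge of two position-sorted key streams, with the journal copy winning whenever both streams have a key at the same position. Reduced to sorted integers (a sketch under that assumption, not the kernel code):

#include <stdio.h>

static void merge(const int *a, unsigned na, const int *b, unsigned nb)
{
	unsigned i = 0, j = 0;

	while (i < na || j < nb) {
		int v;

		if (i < na && j < nb) {
			if (a[i] < b[j]) {
				v = a[i++];
			} else if (a[i] == b[j]) {
				i++;		/* drop the btree copy */
				v = b[j++];	/* journal version wins */
			} else {
				v = b[j++];
			}
		} else if (i < na) {
			v = a[i++];
		} else {
			v = b[j++];
		}

		printf("%d ", v);	/* this is where bch2_mark_key() runs */
	}
	printf("\n");
}

int main(void)
{
	int btree[]   = { 1, 3, 5 };
	int journal[] = { 3, 4 };

	merge(btree, 3, journal, 2);	/* prints "1 3 4 5" */
	return 0;
}
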
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index 1aaff44e18cf..304ff92500be 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -4,6 +4,8 @@
#include "io.h"
#include "super.h"
+#define FSCK_ERR_RATELIMIT_NR 10
+
bool bch2_inconsistent_error(struct bch_fs *c)
{
set_bit(BCH_FS_ERROR, &c->flags);
@@ -97,8 +99,8 @@ enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags,
found:
list_move(&s->list, &c->fsck_errors);
s->nr++;
- suppressing = s->nr == 10;
- print = s->nr <= 10;
+ suppressing = s->nr == FSCK_ERR_RATELIMIT_NR;
+ print = s->nr <= FSCK_ERR_RATELIMIT_NR;
buf = s->buf;
print:
va_start(args, fmt);
@@ -152,10 +154,9 @@ void bch2_flush_fsck_errs(struct bch_fs *c)
struct fsck_err_state *s, *n;
mutex_lock(&c->fsck_error_lock);
- set_bit(BCH_FS_FSCK_DONE, &c->flags);
list_for_each_entry_safe(s, n, &c->fsck_errors, list) {
- if (s->nr > 10)
+ if (s->nr > FSCK_ERR_RATELIMIT_NR)
bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->buf);
list_del(&s->list);
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index e286048b5bf8..4b1c652cdbce 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -46,7 +46,8 @@ unsigned bch2_bkey_nr_dirty_ptrs(struct bkey_s_c k)
switch (k.k->type) {
case KEY_TYPE_btree_ptr:
- case KEY_TYPE_extent: {
+ case KEY_TYPE_extent:
+ case KEY_TYPE_reflink_v: {
struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
const struct bch_extent_ptr *ptr;
@@ -250,6 +251,33 @@ void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev);
}
+const struct bch_extent_ptr *
+bch2_bkey_has_device(struct bkey_s_c k, unsigned dev)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const struct bch_extent_ptr *ptr;
+
+ bkey_for_each_ptr(ptrs, ptr)
+ if (ptr->dev == dev)
+ return ptr;
+
+ return NULL;
+}
+
+bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const struct bch_extent_ptr *ptr;
+
+ bkey_for_each_ptr(ptrs, ptr)
+ if (bch2_dev_in_target(c, ptr->dev, target) &&
+ (!ptr->cached ||
+ !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)))
+ return true;
+
+ return false;
+}
+
/* extent specific utility code */
const struct bch_extent_ptr *
@@ -280,50 +308,32 @@ bch2_extent_has_group(struct bch_fs *c, struct bkey_s_c_extent e, unsigned group
return NULL;
}
-const struct bch_extent_ptr *
-bch2_extent_has_target(struct bch_fs *c, struct bkey_s_c_extent e, unsigned target)
-{
- const struct bch_extent_ptr *ptr;
-
- extent_for_each_ptr(e, ptr)
- if (bch2_dev_in_target(c, ptr->dev, target) &&
- (!ptr->cached ||
- !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)))
- return ptr;
-
- return NULL;
-}
-
unsigned bch2_extent_is_compressed(struct bkey_s_c k)
{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
unsigned ret = 0;
- switch (k.k->type) {
- case KEY_TYPE_extent: {
- struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
-
- extent_for_each_ptr_decode(e, p, entry)
- if (!p.ptr.cached &&
- p.crc.compression_type != BCH_COMPRESSION_NONE)
- ret += p.crc.compressed_size;
- }
- }
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ if (!p.ptr.cached &&
+ p.crc.compression_type != BCH_COMPRESSION_NONE)
+ ret += p.crc.compressed_size;
return ret;
}
-bool bch2_extent_matches_ptr(struct bch_fs *c, struct bkey_s_c_extent e,
- struct bch_extent_ptr m, u64 offset)
+bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k,
+ struct bch_extent_ptr m, u64 offset)
{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
- extent_for_each_ptr_decode(e, p, entry)
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
if (p.ptr.dev == m.dev &&
p.ptr.gen == m.gen &&
- (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(e.k) ==
+ (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) ==
(s64) m.offset - offset)
return true;
@@ -390,16 +400,17 @@ static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u,
bch2_csum_type_is_encryption(n.csum_type);
}
-bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent e,
+bool bch2_can_narrow_extent_crcs(struct bkey_s_c k,
struct bch_extent_crc_unpacked n)
{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
struct bch_extent_crc_unpacked crc;
const union bch_extent_entry *i;
if (!n.csum_type)
return false;
- extent_for_each_crc(e, crc, i)
+ bkey_for_each_crc(k.k, ptrs, crc, i)
if (can_narrow_crc(crc, n))
return true;
@@ -415,9 +426,9 @@ bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent e,
* currently live (so that readers won't have to bounce) while we've got the
* checksum we need:
*/
-bool bch2_extent_narrow_crcs(struct bkey_i_extent *e,
- struct bch_extent_crc_unpacked n)
+bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n)
{
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
struct bch_extent_crc_unpacked u;
struct extent_ptr_decoded p;
union bch_extent_entry *i;
@@ -425,7 +436,7 @@ bool bch2_extent_narrow_crcs(struct bkey_i_extent *e,
/* Find a checksum entry that covers only live data: */
if (!n.csum_type) {
- extent_for_each_crc(extent_i_to_s(e), u, i)
+ bkey_for_each_crc(&k->k, ptrs, u, i)
if (!u.compression_type &&
u.csum_type &&
u.live_size == u.uncompressed_size) {
@@ -437,15 +448,17 @@ bool bch2_extent_narrow_crcs(struct bkey_i_extent *e,
found:
BUG_ON(n.compression_type);
BUG_ON(n.offset);
- BUG_ON(n.live_size != e->k.size);
+ BUG_ON(n.live_size != k->k.size);
restart_narrow_pointers:
- extent_for_each_ptr_decode(extent_i_to_s(e), p, i)
+ ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
+
+ bkey_for_each_ptr_decode(&k->k, ptrs, p, i)
if (can_narrow_crc(p.crc, n)) {
- bch2_bkey_drop_ptr(extent_i_to_s(e).s, &i->ptr);
+ bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr);
p.ptr.offset += p.crc.offset;
p.crc = n;
- bch2_extent_ptr_decoded_append(e, &p);
+ bch2_extent_ptr_decoded_append(k, &p);
ret = true;
goto restart_narrow_pointers;
}
@@ -659,8 +672,7 @@ const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k)
return bch2_bkey_ptrs_invalid(c, k);
}
-void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
- struct bkey_s_c k)
+void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const struct bch_extent_ptr *ptr;
@@ -708,44 +720,48 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
/* Extents */
-bool __bch2_cut_front(struct bpos where, struct bkey_s k)
+void __bch2_cut_front(struct bpos where, struct bkey_s k)
{
- u64 len = 0;
+ u64 sub;
if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0)
- return false;
+ return;
EBUG_ON(bkey_cmp(where, k.k->p) > 0);
- len = k.k->p.offset - where.offset;
+ sub = where.offset - bkey_start_offset(k.k);
- BUG_ON(len > k.k->size);
+ k.k->size -= sub;
- /*
- * Don't readjust offset if the key size is now 0, because that could
- * cause offset to point to the next bucket:
- */
- if (!len)
+ if (!k.k->size)
k.k->type = KEY_TYPE_deleted;
- else if (bkey_extent_is_data(k.k)) {
- struct bkey_s_extent e = bkey_s_to_extent(k);
+
+ switch (k.k->type) {
+ case KEY_TYPE_deleted:
+ case KEY_TYPE_discard:
+ case KEY_TYPE_error:
+ case KEY_TYPE_cookie:
+ break;
+ case KEY_TYPE_extent:
+ case KEY_TYPE_reflink_v: {
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
union bch_extent_entry *entry;
bool seen_crc = false;
- extent_for_each_entry(e, entry) {
+ bkey_extent_entry_for_each(ptrs, entry) {
switch (extent_entry_type(entry)) {
case BCH_EXTENT_ENTRY_ptr:
if (!seen_crc)
- entry->ptr.offset += e.k->size - len;
+ entry->ptr.offset += sub;
break;
case BCH_EXTENT_ENTRY_crc32:
- entry->crc32.offset += e.k->size - len;
+ entry->crc32.offset += sub;
break;
case BCH_EXTENT_ENTRY_crc64:
- entry->crc64.offset += e.k->size - len;
+ entry->crc64.offset += sub;
break;
case BCH_EXTENT_ENTRY_crc128:
- entry->crc128.offset += e.k->size - len;
+ entry->crc128.offset += sub;
break;
case BCH_EXTENT_ENTRY_stripe_ptr:
break;
@@ -754,11 +770,20 @@ bool __bch2_cut_front(struct bpos where, struct bkey_s k)
if (extent_entry_is_crc(entry))
seen_crc = true;
}
- }
- k.k->size = len;
+ break;
+ }
+ case KEY_TYPE_reflink_p: {
+ struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k);
- return true;
+ le64_add_cpu(&p.v->idx, sub);
+ break;
+ }
+ case KEY_TYPE_reservation:
+ break;
+ default:
+ BUG();
+ }
}
bool bch2_cut_back(struct bpos where, struct bkey *k)
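
__bch2_cut_front() now derives the amount trimmed from the front ("sub") directly from the new start position and advances every stored offset by it: bare pointer offsets (until a crc entry takes over), crc offsets, and, for reflink_p keys, the index into the indirect extent. A toy sketch of the same arithmetic on a simplified extent (illustrative only; a single data_offset stands in for the per-pointer and per-crc offsets):

#include <stdint.h>
#include <assert.h>
#include <stdio.h>

struct toy_extent {
	uint64_t start;		/* logical start, in sectors */
	uint64_t size;		/* length, in sectors */
	uint64_t data_offset;	/* where the data starts, in sectors */
};

/* Trim the front of the extent: shrink it and advance its data offset. */
static void toy_cut_front(uint64_t where, struct toy_extent *e)
{
	uint64_t sub;

	if (where <= e->start)
		return;
	assert(where <= e->start + e->size);

	sub		= where - e->start;
	e->start	+= sub;
	e->size		-= sub;
	e->data_offset	+= sub;
}

int main(void)
{
	struct toy_extent e = { .start = 100, .size = 8, .data_offset = 5000 };

	toy_cut_front(103, &e);
	printf("start %llu size %llu data_offset %llu\n",
	       (unsigned long long) e.start,
	       (unsigned long long) e.size,
	       (unsigned long long) e.data_offset);
	return 0;
}

Note that in the hunk above the key type is flipped to deleted before the switch when the size reaches zero, so the deleted case skips the offset adjustment, which is what the old "don't readjust offset" comment was guarding against.
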
@@ -772,8 +797,6 @@ bool bch2_cut_back(struct bpos where, struct bkey *k)
len = where.offset - bkey_start_offset(k);
- BUG_ON(len > k->size);
-
k->p = where;
k->size = len;
@@ -783,19 +806,6 @@ bool bch2_cut_back(struct bpos where, struct bkey *k)
return true;
}
-/**
- * bch_key_resize - adjust size of @k
- *
- * bkey_start_offset(k) will be preserved, modifies where the extent ends
- */
-void bch2_key_resize(struct bkey *k,
- unsigned new_size)
-{
- k->p.offset -= k->size;
- k->p.offset += new_size;
- k->size = new_size;
-}
-
static bool extent_i_save(struct btree *b, struct bkey_packed *dst,
struct bkey_i *src)
{
@@ -866,13 +876,6 @@ static void verify_extent_nonoverlapping(struct bch_fs *c,
#endif
}
-static void verify_modified_extent(struct btree_iter *iter,
- struct bkey_packed *k)
-{
- bch2_btree_iter_verify(iter, iter->l[0].b);
- bch2_verify_insert_pos(iter->l[0].b, k, k, k->u64s);
-}
-
static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter,
struct bkey_i *insert)
{
@@ -885,6 +888,9 @@ static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter,
EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
verify_extent_nonoverlapping(c, l->b, &l->iter, insert);
+ if (debug_check_bkeys(c))
+ bch2_bkey_debugcheck(c, l->b, bkey_i_to_s_c(insert));
+
node_iter = l->iter;
k = bch2_btree_node_iter_prev_filter(&node_iter, l->b, KEY_TYPE_discard);
if (k && !bkey_written(l->b, k) &&
@@ -897,11 +903,20 @@ static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter,
bch2_extent_merge_inline(c, iter, bkey_to_packed(insert), k, false))
return;
+	/*
+	 * We may have skipped past some deleted extents greater than the
+	 * insert key before we got to a non-deleted extent and knew we could
+	 * bail out; rewind the iterator a bit if necessary:
+	 */
+ node_iter = l->iter;
+ while ((k = bch2_btree_node_iter_prev_all(&node_iter, l->b)) &&
+ bkey_cmp_left_packed(l->b, k, &insert->k.p) > 0)
+ l->iter = node_iter;
+
k = bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b));
bch2_bset_insert(l->b, &l->iter, k, insert, 0);
bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s);
- bch2_btree_iter_verify(iter, l->b);
}
static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k)
@@ -921,47 +936,132 @@ static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k)
return ret;
}
-static inline struct bpos
-bch2_extent_atomic_end(struct bkey_i *insert, struct btree_iter *iter)
+static int __bch2_extent_atomic_end(struct btree_trans *trans,
+ struct bkey_s_c k,
+ unsigned offset,
+ struct bpos *end,
+ unsigned *nr_iters,
+ unsigned max_iters)
+{
+ int ret = 0;
+
+ switch (k.k->type) {
+ case KEY_TYPE_extent:
+ case KEY_TYPE_reflink_v:
+ *nr_iters += bch2_bkey_nr_alloc_ptrs(k);
+
+ if (*nr_iters >= max_iters) {
+ *end = bpos_min(*end, k.k->p);
+ return 0;
+ }
+
+ break;
+ case KEY_TYPE_reflink_p: {
+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+ u64 idx = le64_to_cpu(p.v->idx);
+ unsigned sectors = end->offset - bkey_start_offset(p.k);
+ struct btree_iter *iter;
+ struct bkey_s_c r_k;
+
+ for_each_btree_key(trans, iter,
+ BTREE_ID_REFLINK, POS(0, idx + offset),
+ BTREE_ITER_SLOTS, r_k, ret) {
+ if (bkey_cmp(bkey_start_pos(r_k.k),
+ POS(0, idx + sectors)) >= 0)
+ break;
+
+ *nr_iters += 1;
+ if (*nr_iters >= max_iters) {
+ struct bpos pos = bkey_start_pos(k.k);
+ pos.offset += r_k.k->p.offset - idx;
+
+ *end = bpos_min(*end, pos);
+ break;
+ }
+ }
+
+ bch2_trans_iter_put(trans, iter);
+ break;
+ }
+ }
+
+ return ret;
+}
+
+int bch2_extent_atomic_end(struct btree_iter *iter,
+ struct bkey_i *insert,
+ struct bpos *end)
{
+ struct btree_trans *trans = iter->trans;
struct btree *b = iter->l[0].b;
struct btree_node_iter node_iter = iter->l[0].iter;
struct bkey_packed *_k;
- unsigned nr_alloc_ptrs =
+ unsigned nr_iters =
bch2_bkey_nr_alloc_ptrs(bkey_i_to_s_c(insert));
+ int ret = 0;
BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK);
BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0);
- while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
+ *end = bpos_min(insert->k.p, b->key.k.p);
+
+ ret = __bch2_extent_atomic_end(trans, bkey_i_to_s_c(insert),
+ 0, end, &nr_iters, 10);
+ if (ret)
+ return ret;
+
+ while (nr_iters < 20 &&
+ (_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
KEY_TYPE_discard))) {
struct bkey unpacked;
struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked);
+ unsigned offset = 0;
- if (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) <= 0)
+ if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0)
break;
- nr_alloc_ptrs += bch2_bkey_nr_alloc_ptrs(k);
+ if (bkey_cmp(bkey_start_pos(&insert->k),
+ bkey_start_pos(k.k)) > 0)
+ offset = bkey_start_offset(&insert->k) -
+ bkey_start_offset(k.k);
- if (nr_alloc_ptrs > 20) {
- BUG_ON(bkey_cmp(k.k->p, bkey_start_pos(&insert->k)) <= 0);
- return bpos_min(insert->k.p, k.k->p);
- }
+ ret = __bch2_extent_atomic_end(trans, k, offset,
+ end, &nr_iters, 20);
+ if (ret)
+ return ret;
+
+ if (nr_iters >= 20)
+ break;
bch2_btree_node_iter_advance(&node_iter, b);
}
- return bpos_min(insert->k.p, b->key.k.p);
+ return 0;
}
-void bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter)
+int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter)
{
- bch2_cut_back(bch2_extent_atomic_end(k, iter), &k->k);
+ struct bpos end;
+ int ret;
+
+ ret = bch2_extent_atomic_end(iter, k, &end);
+ if (ret)
+ return ret;
+
+ bch2_cut_back(end, &k->k);
+ return 0;
}
-bool bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter)
+int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter)
{
- return !bkey_cmp(bch2_extent_atomic_end(k, iter), k->k.p);
+ struct bpos end;
+ int ret;
+
+ ret = bch2_extent_atomic_end(iter, k, &end);
+ if (ret)
+ return ret;
+
+ return !bkey_cmp(end, k->k.p);
}
enum btree_insert_ret
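
bch2_extent_atomic_end() now returns an error and bounds the work a single commit takes on: the insert and each overlapping extent contribute their allocation pointers, a reflink_p overlap additionally walks the indirect extents it points at, and the insert is trimmed once the running count reaches the budget (10 for the insert itself, 20 including overlaps, per the constants above). A standalone sketch of the budget-trimming idea (illustrative only; the costs, limits, and toy_* names are invented):

#include <stdio.h>
#include <stddef.h>

struct toy_item {
	unsigned long long end;	/* item covers up to this offset */
	unsigned cost;		/* e.g. number of allocation pointers */
};

/*
 * Clamp the end of an operation so the accumulated per-item cost stays under
 * a budget; items are assumed sorted by end and to overlap the operation.
 */
static unsigned long long toy_atomic_end(unsigned long long insert_end,
					 const struct toy_item *items,
					 size_t nr, unsigned max_cost)
{
	unsigned long long end = insert_end;
	unsigned cost = 0;
	size_t i;

	for (i = 0; i < nr && items[i].end <= end; i++) {
		cost += items[i].cost;
		if (cost >= max_cost) {
			/* stop here: don't take on more work in this commit */
			end = items[i].end;
			break;
		}
	}

	return end;
}

int main(void)
{
	const struct toy_item overlaps[] = {
		{ .end = 10, .cost = 8 },
		{ .end = 20, .cost = 8 },
		{ .end = 30, .cost = 8 },
	};

	printf("atomic end: %llu\n",
	       toy_atomic_end(40, overlaps, 3, 20));
	return 0;
}
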
@@ -1031,15 +1131,16 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter,
case BCH_EXTENT_OVERLAP_FRONT:
/* insert overlaps with start of k: */
__bch2_cut_front(insert->k.p, k);
- BUG_ON(bkey_deleted(k.k));
+ EBUG_ON(bkey_deleted(k.k));
extent_save(l->b, _k, k.k);
- verify_modified_extent(iter, _k);
+ bch2_btree_node_iter_fix(iter, l->b, &l->iter,
+ _k, _k->u64s, _k->u64s);
break;
case BCH_EXTENT_OVERLAP_BACK:
/* insert overlaps with end of k: */
bch2_cut_back(bkey_start_pos(&insert->k), k.k);
- BUG_ON(bkey_deleted(k.k));
+ EBUG_ON(bkey_deleted(k.k));
extent_save(l->b, _k, k.k);
/*
@@ -1050,7 +1151,6 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter,
bch2_bset_fix_invalidated_key(l->b, _k);
bch2_btree_node_iter_fix(iter, l->b, &l->iter,
_k, _k->u64s, _k->u64s);
- verify_modified_extent(iter, _k);
break;
case BCH_EXTENT_OVERLAP_ALL: {
@@ -1067,12 +1167,10 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter,
bch2_bset_delete(l->b, _k, _k->u64s);
bch2_btree_node_iter_fix(iter, l->b, &l->iter,
_k, u64s, 0);
- bch2_btree_iter_verify(iter, l->b);
} else {
extent_save(l->b, _k, k.k);
bch2_btree_node_iter_fix(iter, l->b, &l->iter,
_k, _k->u64s, _k->u64s);
- verify_modified_extent(iter, _k);
}
break;
@@ -1102,7 +1200,8 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter,
__bch2_cut_front(insert->k.p, k);
BUG_ON(bkey_deleted(k.k));
extent_save(l->b, _k, k.k);
- verify_modified_extent(iter, _k);
+ bch2_btree_node_iter_fix(iter, l->b, &l->iter,
+ _k, _k->u64s, _k->u64s);
extent_bset_insert(c, iter, &split.k);
break;
@@ -1159,6 +1258,8 @@ static void __bch2_insert_fixup_extent(struct bch_fs *c,
btree_account_key_drop(l->b, _k);
_k->type = KEY_TYPE_discard;
reserve_whiteout(l->b, _k);
+ bch2_btree_node_iter_fix(iter, l->b, &l->iter,
+ _k, _k->u64s, _k->u64s);
}
break;
}
@@ -1185,19 +1286,6 @@ next:
overlap == BCH_EXTENT_OVERLAP_MIDDLE)
break;
}
-
- /*
- * may have skipped past some deleted extents greater than the insert
- * key, before we got to a non deleted extent and knew we could bail out
- * rewind the iterator a bit if necessary:
- */
- {
- struct btree_node_iter node_iter = l->iter;
-
- while ((_k = bch2_btree_node_iter_prev_all(&node_iter, l->b)) &&
- bkey_cmp_left_packed(l->b, _k, &insert->k.p) > 0)
- l->iter = node_iter;
- }
}
/**
@@ -1265,12 +1353,7 @@ void bch2_insert_fixup_extent(struct btree_trans *trans,
if (s.deleting)
tmp.k.k.type = KEY_TYPE_discard;
-#if 0
- /* disabled due to lock recursion - mark_lock: */
- if (debug_check_bkeys(c))
- bch2_bkey_debugcheck(c, iter->l[0].b,
- bkey_i_to_s_c(&tmp.k));
-#endif
+
EBUG_ON(bkey_deleted(&tmp.k.k) || !tmp.k.k.size);
extent_bset_insert(c, iter, &tmp.k);
@@ -1295,8 +1378,7 @@ const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k)
return bch2_bkey_ptrs_invalid(c, k);
}
-void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b,
- struct bkey_s_c k)
+void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
const union bch_extent_entry *entry;
@@ -1312,11 +1394,13 @@ void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b,
* going to get overwritten during replay)
*/
- bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) &&
- !bch2_bkey_replicas_marked(c, e.s_c, false), c,
- "extent key bad (replicas not marked in superblock):\n%s",
- (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf));
-
+ if (percpu_down_read_trylock(&c->mark_lock)) {
+ bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) &&
+ !bch2_bkey_replicas_marked_locked(c, e.s_c, false), c,
+ "extent key bad (replicas not marked in superblock):\n%s",
+ (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf));
+ percpu_up_read(&c->mark_lock);
+ }
/*
* If journal replay hasn't finished, we might be seeing keys
* that will be overwritten by the time journal replay is done:
@@ -1394,9 +1478,12 @@ static void bch2_extent_crc_pack(union bch_extent_crc *dst,
#undef set_common_fields
}
-static void bch2_extent_crc_init(union bch_extent_crc *crc,
- struct bch_extent_crc_unpacked new)
+static void bch2_extent_crc_append(struct bkey_i *k,
+ struct bch_extent_crc_unpacked new)
{
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
+ union bch_extent_crc *crc = (void *) ptrs.end;
+
if (bch_crc_bytes[new.csum_type] <= 4 &&
new.uncompressed_size - 1 <= CRC32_SIZE_MAX &&
new.nonce <= CRC32_NONCE_MAX)
@@ -1413,54 +1500,53 @@ static void bch2_extent_crc_init(union bch_extent_crc *crc,
BUG();
bch2_extent_crc_pack(crc, new);
-}
-void bch2_extent_crc_append(struct bkey_i_extent *e,
- struct bch_extent_crc_unpacked new)
-{
- bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), new);
- __extent_entry_push(e);
+ k->k.u64s += extent_entry_u64s(ptrs.end);
+
+ EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX);
}
-static inline void __extent_entry_insert(struct bkey_i_extent *e,
+static inline void __extent_entry_insert(struct bkey_i *k,
union bch_extent_entry *dst,
union bch_extent_entry *new)
{
- union bch_extent_entry *end = extent_entry_last(extent_i_to_s(e));
+ union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
memmove_u64s_up((u64 *) dst + extent_entry_u64s(new),
dst, (u64 *) end - (u64 *) dst);
- e->k.u64s += extent_entry_u64s(new);
+ k->k.u64s += extent_entry_u64s(new);
memcpy(dst, new, extent_entry_bytes(new));
}
-void bch2_extent_ptr_decoded_append(struct bkey_i_extent *e,
+void bch2_extent_ptr_decoded_append(struct bkey_i *k,
struct extent_ptr_decoded *p)
{
- struct bch_extent_crc_unpacked crc = bch2_extent_crc_unpack(&e->k, NULL);
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
+ struct bch_extent_crc_unpacked crc =
+ bch2_extent_crc_unpack(&k->k, NULL);
union bch_extent_entry *pos;
unsigned i;
if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
- pos = e->v.start;
+ pos = ptrs.start;
goto found;
}
- extent_for_each_crc(extent_i_to_s(e), crc, pos)
+ bkey_for_each_crc(&k->k, ptrs, crc, pos)
if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
pos = extent_entry_next(pos);
goto found;
}
- bch2_extent_crc_append(e, p->crc);
- pos = extent_entry_last(extent_i_to_s(e));
+ bch2_extent_crc_append(k, p->crc);
+ pos = bkey_val_end(bkey_i_to_s(k));
found:
p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
- __extent_entry_insert(e, pos, to_entry(&p->ptr));
+ __extent_entry_insert(k, pos, to_entry(&p->ptr));
for (i = 0; i < p->ec_nr; i++) {
p->ec[i].type = 1 << BCH_EXTENT_ENTRY_stripe_ptr;
- __extent_entry_insert(e, pos, to_entry(&p->ec[i]));
+ __extent_entry_insert(k, pos, to_entry(&p->ec[i]));
}
}
@@ -1482,22 +1568,22 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
/* will only happen if all pointers were cached: */
if (!bkey_val_u64s(k.k))
- k.k->type = KEY_TYPE_deleted;
+ k.k->type = KEY_TYPE_discard;
- return false;
+ return bkey_whiteout(k.k);
}
-void bch2_extent_mark_replicas_cached(struct bch_fs *c,
- struct bkey_s_extent e,
- unsigned target,
- unsigned nr_desired_replicas)
+void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k,
+ unsigned target,
+ unsigned nr_desired_replicas)
{
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
union bch_extent_entry *entry;
struct extent_ptr_decoded p;
- int extra = bch2_bkey_durability(c, e.s_c) - nr_desired_replicas;
+ int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas;
if (target && extra > 0)
- extent_for_each_ptr_decode(e, p, entry) {
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
int n = bch2_extent_ptr_durability(c, p);
if (n && n <= extra &&
@@ -1508,7 +1594,7 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c,
}
if (extra > 0)
- extent_for_each_ptr_decode(e, p, entry) {
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
int n = bch2_extent_ptr_durability(c, p);
if (n && n <= extra) {
@@ -1666,6 +1752,12 @@ static bool bch2_extent_merge_inline(struct bch_fs *c,
if (ret == BCH_MERGE_NOMERGE)
return false;
+ if (debug_check_bkeys(c))
+ bch2_bkey_debugcheck(c, b, bkey_i_to_s_c(&li.k));
+ if (debug_check_bkeys(c) &&
+ ret == BCH_MERGE_PARTIAL)
+ bch2_bkey_debugcheck(c, b, bkey_i_to_s_c(&ri.k));
+
/*
* check if we overlap with deleted extents - would break the sort
* order:
@@ -1702,7 +1794,6 @@ static bool bch2_extent_merge_inline(struct bch_fs *c,
bch2_bset_fix_invalidated_key(b, m);
bch2_btree_node_iter_fix(iter, b, node_iter,
m, m->u64s, m->u64s);
- verify_modified_extent(iter, m);
return ret == BCH_MERGE_MERGE;
}
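
bch2_bkey_mark_replicas_cached() generalizes the old extent-only helper: it computes how much durability the key has beyond what is desired and demotes pointers to cached in two passes until the excess is gone. The hunk is cut off before the first pass's filter, so the selection shown below (off-target copies demoted before anything else) is an assumption for illustration, and the toy_* names are likewise invented:

#include <stdbool.h>
#include <stdio.h>

struct toy_ptr {
	unsigned durability;
	bool on_target;
	bool cached;
};

/* Demote pointers to cached until durability drops to the desired count. */
static void toy_mark_extra_cached(struct toy_ptr *ptrs, unsigned nr,
				  unsigned desired)
{
	int extra = 0;
	unsigned i;

	for (i = 0; i < nr; i++)
		if (!ptrs[i].cached)
			extra += ptrs[i].durability;
	extra -= desired;

	/* first pass (assumed): demote only copies off the preferred target */
	for (i = 0; i < nr && extra > 0; i++)
		if (!ptrs[i].cached && !ptrs[i].on_target &&
		    ptrs[i].durability && (int) ptrs[i].durability <= extra) {
			ptrs[i].cached = true;
			extra -= ptrs[i].durability;
		}

	/* second pass: demote anything that still leaves us over */
	for (i = 0; i < nr && extra > 0; i++)
		if (!ptrs[i].cached &&
		    ptrs[i].durability && (int) ptrs[i].durability <= extra) {
			ptrs[i].cached = true;
			extra -= ptrs[i].durability;
		}
}

int main(void)
{
	struct toy_ptr ptrs[] = {
		{ .durability = 1, .on_target = true  },
		{ .durability = 1, .on_target = false },
		{ .durability = 1, .on_target = true  },
	};

	toy_mark_extra_cached(ptrs, 3, 2);

	for (unsigned i = 0; i < 3; i++)
		printf("ptr %u: %s\n", i, ptrs[i].cached ? "cached" : "dirty");
	return 0;
}
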
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index fe92737354bd..613d76af69d9 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -12,7 +12,8 @@ struct btree_insert_entry;
/* extent entries: */
-#define extent_entry_last(_e) bkey_val_end(_e)
+#define extent_entry_last(_e) \
+ ((typeof(&(_e).v->start[0])) bkey_val_end(_e))
#define entry_to_ptr(_entry) \
({ \
@@ -258,6 +259,27 @@ out: \
__bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \
_ptr, _entry)
+#define bkey_crc_next(_k, _start, _end, _crc, _iter) \
+({ \
+ __bkey_extent_entry_for_each_from(_iter, _end, _iter) \
+ if (extent_entry_is_crc(_iter)) { \
+ (_crc) = bch2_extent_crc_unpack(_k, \
+ entry_to_crc(_iter)); \
+ break; \
+ } \
+ \
+ (_iter) < (_end); \
+})
+
+#define __bkey_for_each_crc(_k, _start, _end, _crc, _iter) \
+ for ((_crc) = bch2_extent_crc_unpack(_k, NULL), \
+ (_iter) = (_start); \
+ bkey_crc_next(_k, _start, _end, _crc, _iter); \
+ (_iter) = extent_entry_next(_iter))
+
+#define bkey_for_each_crc(_k, _p, _crc, _iter) \
+ __bkey_for_each_crc(_k, (_p).start, (_p).end, _crc, _iter)
+
/* utility code common to all keys with pointers: */
static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k)
@@ -267,7 +289,7 @@ static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k)
struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k);
return (struct bkey_ptrs_c) {
to_entry(&e.v->start[0]),
- to_entry(bkey_val_end(e))
+ to_entry(extent_entry_last(e))
};
}
case KEY_TYPE_extent: {
@@ -284,6 +306,14 @@ static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k)
to_entry(&s.v->ptrs[s.v->nr_blocks]),
};
}
+ case KEY_TYPE_reflink_v: {
+ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
+
+ return (struct bkey_ptrs_c) {
+ r.v->start,
+ bkey_val_end(r),
+ };
+ }
default:
return (struct bkey_ptrs_c) { NULL, NULL };
}
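
This is the header-side counterpart of the extents.c changes: every key type that carries pointers, now including reflink_v, hands back a (start, end) span of extent entries, and the bkey_for_each_ptr/bkey_for_each_crc macros walk that span regardless of key type. A reduced sketch of the pattern (illustrative only; the toy types collapse the real variable-length entries to plain ints):

#include <stdio.h>
#include <stddef.h>

enum toy_key_type { TOY_EXTENT, TOY_STRIPE, TOY_NO_PTRS };

struct toy_span {
	const int *start;
	const int *end;
};

struct toy_key {
	enum toy_key_type type;
	int entries[4];
	unsigned nr;
};

/* Every pointer-carrying key type exposes its entries as one span. */
static struct toy_span toy_key_ptrs(const struct toy_key *k)
{
	switch (k->type) {
	case TOY_EXTENT:
	case TOY_STRIPE:
		return (struct toy_span) { k->entries, k->entries + k->nr };
	default:
		return (struct toy_span) { NULL, NULL };
	}
}

/* Generic iteration over whatever span the key type provided. */
#define toy_for_each_entry(_span, _e) \
	for ((_e) = (_span).start; (_e) != (_span).end; (_e)++)

int main(void)
{
	struct toy_key k = { .type = TOY_EXTENT, .entries = { 7, 9 }, .nr = 2 };
	struct toy_span span = toy_key_ptrs(&k);
	const int *e;

	toy_for_each_entry(span, e)
		printf("entry %d\n", *e);
	return 0;
}
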
@@ -337,18 +367,6 @@ static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k)
return ret;
}
-static inline bool bch2_bkey_has_device(struct bkey_s_c k, unsigned dev)
-{
- struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
-
- bkey_for_each_ptr(p, ptr)
- if (ptr->dev == dev)
- return ptr;
-
- return NULL;
-}
-
unsigned bch2_bkey_nr_ptrs(struct bkey_s_c);
unsigned bch2_bkey_nr_dirty_ptrs(struct bkey_s_c);
unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
@@ -359,6 +377,11 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
struct bch_io_failures *,
struct extent_ptr_decoded *);
+void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr);
+void bch2_bkey_drop_device(struct bkey_s, unsigned);
+const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned);
+bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned);
+
void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c);
@@ -366,8 +389,7 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c);
/* bch_btree_ptr: */
const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c);
-void bch2_btree_ptr_debugcheck(struct bch_fs *, struct btree *,
- struct bkey_s_c);
+void bch2_btree_ptr_debugcheck(struct bch_fs *, struct bkey_s_c);
void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *);
@@ -382,7 +404,7 @@ void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *);
/* bch_extent: */
const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c);
-void bch2_extent_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c);
+void bch2_extent_debugcheck(struct bch_fs *, struct bkey_s_c);
void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
enum merge_result bch2_extent_merge(struct bch_fs *,
@@ -410,8 +432,10 @@ enum merge_result bch2_reservation_merge(struct bch_fs *,
.key_merge = bch2_reservation_merge, \
}
-void bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *);
-bool bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *);
+int bch2_extent_atomic_end(struct btree_iter *, struct bkey_i *,
+ struct bpos *);
+int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *);
+int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *);
enum btree_insert_ret
bch2_extent_can_insert(struct btree_trans *, struct btree_insert_entry *,
@@ -419,52 +443,51 @@ bch2_extent_can_insert(struct btree_trans *, struct btree_insert_entry *,
void bch2_insert_fixup_extent(struct btree_trans *,
struct btree_insert_entry *);
-void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent,
- unsigned, unsigned);
+void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s,
+ unsigned, unsigned);
const struct bch_extent_ptr *
bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
-const struct bch_extent_ptr *
-bch2_extent_has_group(struct bch_fs *, struct bkey_s_c_extent, unsigned);
-const struct bch_extent_ptr *
-bch2_extent_has_target(struct bch_fs *, struct bkey_s_c_extent, unsigned);
unsigned bch2_extent_is_compressed(struct bkey_s_c);
-bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent,
- struct bch_extent_ptr, u64);
+bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c,
+ struct bch_extent_ptr, u64);
-static inline bool bkey_extent_is_data(const struct bkey *k)
+static inline bool bkey_extent_is_direct_data(const struct bkey *k)
{
switch (k->type) {
case KEY_TYPE_btree_ptr:
case KEY_TYPE_extent:
+ case KEY_TYPE_reflink_v:
return true;
default:
return false;
}
}
+static inline bool bkey_extent_is_data(const struct bkey *k)
+{
+ return bkey_extent_is_direct_data(k) ||
+ k->type == KEY_TYPE_reflink_p;
+}
+
+/*
+ * Should extent be counted under inode->i_sectors?
+ */
static inline bool bkey_extent_is_allocation(const struct bkey *k)
{
switch (k->type) {
case KEY_TYPE_extent:
case KEY_TYPE_reservation:
+ case KEY_TYPE_reflink_p:
+ case KEY_TYPE_reflink_v:
return true;
default:
return false;
}
}
-static inline bool bch2_extent_is_fully_allocated(struct bkey_s_c k)
-{
- return bkey_extent_is_allocation(k.k) &&
- !bch2_extent_is_compressed(k);
-}
-
-void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr);
-void bch2_bkey_drop_device(struct bkey_s, unsigned);
-
/* Extent entry iteration: */
#define extent_for_each_entry_from(_e, _entry, _start) \
@@ -480,45 +503,16 @@ void bch2_bkey_drop_device(struct bkey_s, unsigned);
#define extent_for_each_ptr(_e, _ptr) \
__bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr)
-#define extent_crc_next(_e, _crc, _iter) \
-({ \
- extent_for_each_entry_from(_e, _iter, _iter) \
- if (extent_entry_is_crc(_iter)) { \
- (_crc) = bch2_extent_crc_unpack((_e).k, entry_to_crc(_iter));\
- break; \
- } \
- \
- (_iter) < extent_entry_last(_e); \
-})
-
-#define extent_for_each_crc(_e, _crc, _iter) \
- for ((_crc) = bch2_extent_crc_unpack((_e).k, NULL), \
- (_iter) = (_e).v->start; \
- extent_crc_next(_e, _crc, _iter); \
- (_iter) = extent_entry_next(_iter))
-
#define extent_for_each_ptr_decode(_e, _ptr, _entry) \
__bkey_for_each_ptr_decode((_e).k, (_e).v->start, \
extent_entry_last(_e), _ptr, _entry)
-void bch2_extent_crc_append(struct bkey_i_extent *,
- struct bch_extent_crc_unpacked);
-void bch2_extent_ptr_decoded_append(struct bkey_i_extent *,
+void bch2_extent_ptr_decoded_append(struct bkey_i *,
struct extent_ptr_decoded *);
-static inline void __extent_entry_push(struct bkey_i_extent *e)
-{
- union bch_extent_entry *entry = extent_entry_last(extent_i_to_s(e));
-
- EBUG_ON(bkey_val_u64s(&e->k) + extent_entry_u64s(entry) >
- BKEY_EXTENT_VAL_U64s_MAX);
-
- e->k.u64s += extent_entry_u64s(entry);
-}
-
-bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent,
+bool bch2_can_narrow_extent_crcs(struct bkey_s_c,
struct bch_extent_crc_unpacked);
-bool bch2_extent_narrow_crcs(struct bkey_i_extent *, struct bch_extent_crc_unpacked);
+bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked);
union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s,
struct bch_extent_ptr *);
@@ -540,15 +534,26 @@ do { \
} \
} while (0)
-bool __bch2_cut_front(struct bpos, struct bkey_s);
+void __bch2_cut_front(struct bpos, struct bkey_s);
-static inline bool bch2_cut_front(struct bpos where, struct bkey_i *k)
+static inline void bch2_cut_front(struct bpos where, struct bkey_i *k)
{
- return __bch2_cut_front(where, bkey_i_to_s(k));
+ __bch2_cut_front(where, bkey_i_to_s(k));
}
bool bch2_cut_back(struct bpos, struct bkey *);
-void bch2_key_resize(struct bkey *, unsigned);
+
+/**
+ * bch2_key_resize - adjust size of @k
+ *
+ * bkey_start_offset(k) is preserved; only where the extent ends is changed
+ */
+static inline void bch2_key_resize(struct bkey *k, unsigned new_size)
+{
+ k->p.offset -= k->size;
+ k->p.offset += new_size;
+ k->size = new_size;
+}
/*
* In extent_sort_fix_overlapping(), insert_fixup_extent(),
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index 1c4caa6b3a98..16b79f371853 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -16,6 +16,7 @@
#include "io.h"
#include "keylist.h"
#include "quota.h"
+#include "reflink.h"
#include <linux/aio.h>
#include <linux/backing-dev.h>
@@ -193,9 +194,9 @@ static int inode_set_size(struct bch_inode_info *inode,
return 0;
}
-static int __must_check bch2_write_inode_size(struct bch_fs *c,
- struct bch_inode_info *inode,
- loff_t new_size, unsigned fields)
+int __must_check bch2_write_inode_size(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ loff_t new_size, unsigned fields)
{
struct inode_new_size s = {
.new_size = new_size,
@@ -277,16 +278,16 @@ static int sum_sector_overwrites(struct btree_trans *trans,
return 0;
}
-static int bch2_extent_update(struct btree_trans *trans,
- struct bch_inode_info *inode,
- struct disk_reservation *disk_res,
- struct quota_res *quota_res,
- struct btree_iter *extent_iter,
- struct bkey_i *k,
- u64 new_i_size,
- bool may_allocate,
- bool direct,
- s64 *total_delta)
+int bch2_extent_update(struct btree_trans *trans,
+ struct bch_inode_info *inode,
+ struct disk_reservation *disk_res,
+ struct quota_res *quota_res,
+ struct btree_iter *extent_iter,
+ struct bkey_i *k,
+ u64 new_i_size,
+ bool may_allocate,
+ bool direct,
+ s64 *total_delta)
{
struct bch_fs *c = trans->c;
struct btree_iter *inode_iter = NULL;
@@ -298,13 +299,13 @@ static int bch2_extent_update(struct btree_trans *trans,
s64 i_sectors_delta;
int ret;
- bch2_trans_begin_updates(trans);
-
ret = bch2_btree_iter_traverse(extent_iter);
if (ret)
return ret;
- bch2_extent_trim_atomic(k, extent_iter);
+ ret = bch2_extent_trim_atomic(k, extent_iter);
+ if (ret)
+ return ret;
ret = sum_sector_overwrites(trans, extent_iter,
k, &allocating,
@@ -448,6 +449,8 @@ static int bchfs_write_index_update(struct bch_write_op *wop)
bkey_copy(&tmp.k, bch2_keylist_front(keys));
+ bch2_trans_begin_updates(&trans);
+
ret = bch2_extent_update(&trans, inode,
&wop->res, quota_res,
iter, &tmp.k,
@@ -501,181 +504,272 @@ static inline struct bch_io_opts io_opts(struct bch_fs *c, struct bch_inode_info
/* stored in page->private: */
-/*
- * bch_page_state has to (unfortunately) be manipulated with cmpxchg - we could
- * almost protected it with the page lock, except that bch2_writepage_io_done has
- * to update the sector counts (and from interrupt/bottom half context).
- */
-struct bch_page_state {
-union { struct {
- /* existing data: */
- unsigned sectors:PAGE_SECTOR_SHIFT + 1;
-
+struct bch_page_sector {
/* Uncompressed, fully allocated replicas: */
- unsigned nr_replicas:4;
+ unsigned nr_replicas:3;
/* Owns PAGE_SECTORS * replicas_reserved sized reservation: */
- unsigned replicas_reserved:4;
-
- /* Owns PAGE_SECTORS sized quota reservation: */
- unsigned quota_reserved:1;
-
- /*
- * Number of sectors on disk - for i_blocks
- * Uncompressed size, not compressed size:
- */
- unsigned dirty_sectors:PAGE_SECTOR_SHIFT + 1;
-};
- /* for cmpxchg: */
- unsigned long v;
-};
+ unsigned replicas_reserved:3;
+
+ /* i_sectors: */
+ enum {
+ SECTOR_UNALLOCATED,
+ SECTOR_RESERVED,
+ SECTOR_DIRTY,
+ SECTOR_ALLOCATED,
+ } state:2;
};
-#define page_state_cmpxchg(_ptr, _new, _expr) \
-({ \
- unsigned long _v = READ_ONCE((_ptr)->v); \
- struct bch_page_state _old; \
- \
- do { \
- _old.v = _new.v = _v; \
- _expr; \
- \
- EBUG_ON(_new.sectors + _new.dirty_sectors > PAGE_SECTORS);\
- } while (_old.v != _new.v && \
- (_v = cmpxchg(&(_ptr)->v, _old.v, _new.v)) != _old.v); \
- \
- _old; \
-})
+struct bch_page_state {
+ atomic_t write_count;
+ struct bch_page_sector s[PAGE_SECTORS];
+};
-static inline struct bch_page_state *page_state(struct page *page)
+static inline struct bch_page_state *__bch2_page_state(struct page *page)
{
- struct bch_page_state *s = (void *) &page->private;
-
- BUILD_BUG_ON(sizeof(*s) > sizeof(page->private));
+ return page_has_private(page)
+ ? (struct bch_page_state *) page_private(page)
+ : NULL;
+}
- if (!PagePrivate(page))
- SetPagePrivate(page);
+static inline struct bch_page_state *bch2_page_state(struct page *page)
+{
+ EBUG_ON(!PageLocked(page));
- return s;
+ return __bch2_page_state(page);
}
-static inline unsigned page_res_sectors(struct bch_page_state s)
+/* for newly allocated pages: */
+static void __bch2_page_state_release(struct page *page)
{
+ struct bch_page_state *s = __bch2_page_state(page);
+
+ if (!s)
+ return;
- return s.replicas_reserved * PAGE_SECTORS;
+ ClearPagePrivate(page);
+ set_page_private(page, 0);
+ put_page(page);
+ kfree(s);
}
-static void __bch2_put_page_reservation(struct bch_fs *c, struct bch_inode_info *inode,
- struct bch_page_state s)
+static void bch2_page_state_release(struct page *page)
{
- struct disk_reservation res = { .sectors = page_res_sectors(s) };
- struct quota_res quota_res = { .sectors = s.quota_reserved ? PAGE_SECTORS : 0 };
+ struct bch_page_state *s = bch2_page_state(page);
- bch2_quota_reservation_put(c, inode, &quota_res);
- bch2_disk_reservation_put(c, &res);
+ if (!s)
+ return;
+
+ ClearPagePrivate(page);
+ set_page_private(page, 0);
+ put_page(page);
+ kfree(s);
}
-static void bch2_put_page_reservation(struct bch_fs *c, struct bch_inode_info *inode,
- struct page *page)
+/* for newly allocated pages: */
+static struct bch_page_state *__bch2_page_state_create(struct page *page,
+ gfp_t gfp)
{
- struct bch_page_state s;
-
- EBUG_ON(!PageLocked(page));
+ struct bch_page_state *s;
- s = page_state_cmpxchg(page_state(page), s, {
- s.replicas_reserved = 0;
- s.quota_reserved = 0;
- });
+ s = kzalloc(sizeof(*s), GFP_NOFS|gfp);
+ if (!s)
+ return NULL;
- __bch2_put_page_reservation(c, inode, s);
+ /*
+ * migrate_page_move_mapping() assumes that pages with private data
+ * have their count elevated by 1.
+ */
+ get_page(page);
+ set_page_private(page, (unsigned long) s);
+ SetPagePrivate(page);
+ return s;
}
-static int bch2_get_page_reservation(struct bch_fs *c, struct bch_inode_info *inode,
- struct page *page, bool check_enospc)
+static struct bch_page_state *bch2_page_state_create(struct page *page,
+ gfp_t gfp)
{
- struct bch_page_state *s = page_state(page), new;
+ return bch2_page_state(page) ?: __bch2_page_state_create(page, gfp);
+}
+static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode)
+{
/* XXX: this should not be open coded */
- unsigned nr_replicas = inode->ei_inode.bi_data_replicas
+ return inode->ei_inode.bi_data_replicas
? inode->ei_inode.bi_data_replicas - 1
: c->opts.data_replicas;
- struct disk_reservation disk_res;
- struct quota_res quota_res = { 0 };
+}
+
+static inline unsigned sectors_to_reserve(struct bch_page_sector *s,
+ unsigned nr_replicas)
+{
+ return max(0, (int) nr_replicas -
+ s->nr_replicas -
+ s->replicas_reserved);
+}
+
+static int bch2_get_page_disk_reservation(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct page *page, bool check_enospc)
+{
+ struct bch_page_state *s = bch2_page_state_create(page, 0);
+ unsigned nr_replicas = inode_nr_replicas(c, inode);
+ struct disk_reservation disk_res = { 0 };
+ unsigned i, disk_res_sectors = 0;
int ret;
- EBUG_ON(!PageLocked(page));
+ if (!s)
+ return -ENOMEM;
- if (s->replicas_reserved < nr_replicas) {
- ret = bch2_disk_reservation_get(c, &disk_res, PAGE_SECTORS,
- nr_replicas - s->replicas_reserved,
- !check_enospc ? BCH_DISK_RESERVATION_NOFAIL : 0);
- if (unlikely(ret))
- return ret;
+ for (i = 0; i < ARRAY_SIZE(s->s); i++)
+ disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas);
+
+ if (!disk_res_sectors)
+ return 0;
+
+ ret = bch2_disk_reservation_get(c, &disk_res,
+ disk_res_sectors, 1,
+ !check_enospc
+ ? BCH_DISK_RESERVATION_NOFAIL
+ : 0);
+ if (unlikely(ret))
+ return ret;
+
+ for (i = 0; i < ARRAY_SIZE(s->s); i++)
+ s->s[i].replicas_reserved +=
+ sectors_to_reserve(&s->s[i], nr_replicas);
+
+ return 0;
+}
+
+struct bch2_page_reservation {
+ struct disk_reservation disk;
+ struct quota_res quota;
+};
+
+static void bch2_page_reservation_init(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct bch2_page_reservation *res)
+{
+ memset(res, 0, sizeof(*res));
+
+ res->disk.nr_replicas = inode_nr_replicas(c, inode);
+}
+
+static void bch2_page_reservation_put(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct bch2_page_reservation *res)
+{
+ bch2_disk_reservation_put(c, &res->disk);
+ bch2_quota_reservation_put(c, inode, &res->quota);
+}
+
+static int bch2_page_reservation_get(struct bch_fs *c,
+ struct bch_inode_info *inode, struct page *page,
+ struct bch2_page_reservation *res,
+ unsigned offset, unsigned len, bool check_enospc)
+{
+ struct bch_page_state *s = bch2_page_state_create(page, 0);
+ unsigned i, disk_sectors = 0, quota_sectors = 0;
+ int ret;
+
+ if (!s)
+ return -ENOMEM;
- page_state_cmpxchg(s, new, ({
- BUG_ON(new.replicas_reserved +
- disk_res.nr_replicas != nr_replicas);
- new.replicas_reserved += disk_res.nr_replicas;
- }));
+ for (i = round_down(offset, block_bytes(c)) >> 9;
+ i < round_up(offset + len, block_bytes(c)) >> 9;
+ i++) {
+ disk_sectors += sectors_to_reserve(&s->s[i],
+ res->disk.nr_replicas);
+ quota_sectors += s->s[i].state == SECTOR_UNALLOCATED;
}
- if (!s->quota_reserved &&
- s->sectors + s->dirty_sectors < PAGE_SECTORS) {
- ret = bch2_quota_reservation_add(c, inode, &quota_res,
- PAGE_SECTORS,
- check_enospc);
+ if (disk_sectors) {
+ ret = bch2_disk_reservation_add(c, &res->disk,
+ disk_sectors,
+ !check_enospc
+ ? BCH_DISK_RESERVATION_NOFAIL
+ : 0);
if (unlikely(ret))
return ret;
+ }
+
+ if (quota_sectors) {
+ ret = bch2_quota_reservation_add(c, inode, &res->quota,
+ quota_sectors,
+ check_enospc);
+ if (unlikely(ret)) {
+ struct disk_reservation tmp = {
+ .sectors = disk_sectors
+ };
- page_state_cmpxchg(s, new, ({
- BUG_ON(new.quota_reserved);
- new.quota_reserved = 1;
- }));
+ bch2_disk_reservation_put(c, &tmp);
+ res->disk.sectors -= disk_sectors;
+ return ret;
+ }
}
- return ret;
+ return 0;
}
static void bch2_clear_page_bits(struct page *page)
{
struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_page_state s;
-
- EBUG_ON(!PageLocked(page));
+ struct bch_page_state *s = bch2_page_state(page);
+ struct disk_reservation disk_res = { 0 };
+ int i, dirty_sectors = 0;
- if (!PagePrivate(page))
+ if (!s)
return;
- s.v = xchg(&page_state(page)->v, 0);
- ClearPagePrivate(page);
+ for (i = 0; i < ARRAY_SIZE(s->s); i++) {
+ disk_res.sectors += s->s[i].replicas_reserved;
+ s->s[i].replicas_reserved = 0;
+
+ if (s->s[i].state == SECTOR_DIRTY) {
+ dirty_sectors++;
+ s->s[i].state = SECTOR_UNALLOCATED;
+ }
+ }
+
+ bch2_disk_reservation_put(c, &disk_res);
- if (s.dirty_sectors)
- i_sectors_acct(c, inode, NULL, -s.dirty_sectors);
+ if (dirty_sectors)
+ i_sectors_acct(c, inode, NULL, -dirty_sectors);
- __bch2_put_page_reservation(c, inode, s);
+ bch2_page_state_release(page);
}
-int bch2_set_page_dirty(struct page *page)
+static void bch2_set_page_dirty(struct bch_fs *c,
+ struct bch_inode_info *inode, struct page *page,
+ struct bch2_page_reservation *res,
+ unsigned offset, unsigned len)
{
- struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct quota_res quota_res = { 0 };
- struct bch_page_state old, new;
+ struct bch_page_state *s = bch2_page_state(page);
+ unsigned i, dirty_sectors = 0;
+
+ for (i = round_down(offset, block_bytes(c)) >> 9;
+ i < round_up(offset + len, block_bytes(c)) >> 9;
+ i++) {
+ unsigned sectors = sectors_to_reserve(&s->s[i],
+ res->disk.nr_replicas);
- old = page_state_cmpxchg(page_state(page), new,
- new.dirty_sectors = PAGE_SECTORS - new.sectors;
- new.quota_reserved = 0;
- );
+ BUG_ON(sectors > res->disk.sectors);
+ s->s[i].replicas_reserved += sectors;
+ res->disk.sectors -= sectors;
- quota_res.sectors += old.quota_reserved * PAGE_SECTORS;
+ if (s->s[i].state == SECTOR_UNALLOCATED)
+ dirty_sectors++;
- if (old.dirty_sectors != new.dirty_sectors)
- i_sectors_acct(c, inode, &quota_res,
- new.dirty_sectors - old.dirty_sectors);
- bch2_quota_reservation_put(c, inode, &quota_res);
+ s->s[i].state = max_t(unsigned, s->s[i].state, SECTOR_DIRTY);
+ }
+
+ if (dirty_sectors)
+ i_sectors_acct(c, inode, &res->quota, dirty_sectors);
- return __set_page_dirty_nobuffers(page);
+ if (!PageDirty(page))
+ __set_page_dirty_nobuffers(page);
}
vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
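
The page-state rewrite replaces the single cmpxchg'd word with a kmalloc'd struct bch_page_state holding one bch_page_sector per 512-byte sector, so reservations and dirty tracking become per-sector rather than per-page. A standalone sketch of the reservation arithmetic (illustrative only: it rounds to 512-byte sectors where the real code rounds to the filesystem block size, and the toy_* names are invented):

#include <stdio.h>

#define TOY_PAGE_SECTORS 8	/* PAGE_SIZE >> 9 on a 4k-page system */

struct toy_sector {
	unsigned nr_replicas:3;
	unsigned replicas_reserved:3;
};

struct toy_page_state {
	struct toy_sector s[TOY_PAGE_SECTORS];
};

/* How many more sector-replicas this sector needs reserved. */
static unsigned toy_sectors_to_reserve(const struct toy_sector *s,
				       unsigned nr_replicas)
{
	int need = (int) nr_replicas - s->nr_replicas - s->replicas_reserved;

	return need > 0 ? (unsigned) need : 0;
}

/* How much to reserve for a byte range [offset, offset + len) in the page. */
static unsigned toy_reservation_for_range(const struct toy_page_state *s,
					  unsigned offset, unsigned len,
					  unsigned nr_replicas)
{
	unsigned total = 0;

	for (unsigned i = offset >> 9; i < (offset + len + 511) >> 9; i++)
		total += toy_sectors_to_reserve(&s->s[i], nr_replicas);

	return total;
}

int main(void)
{
	struct toy_page_state s = { 0 };

	/* pretend the first two sectors are already fully allocated */
	s.s[0].nr_replicas = 2;
	s.s[1].nr_replicas = 2;

	printf("reserve %u sectors\n",
	       toy_reservation_for_range(&s, 0, 4096, 2));
	return 0;
}

Tracking replicas and reservations per sector is what lets a partial-page write reserve space only for the sectors it actually touches, instead of the whole page.
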
@@ -685,8 +779,13 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
struct bch_inode_info *inode = file_bch_inode(file);
struct address_space *mapping = inode->v.i_mapping;
struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch2_page_reservation res;
+ unsigned len;
+ loff_t isize;
int ret = VM_FAULT_LOCKED;
+ bch2_page_reservation_init(c, inode, &res);
+
sb_start_pagefault(inode->v.i_sb);
file_update_time(file);
@@ -700,26 +799,35 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
pagecache_add_get(&mapping->add_lock);
lock_page(page);
- if (page->mapping != mapping ||
- page_offset(page) > i_size_read(&inode->v)) {
+ isize = i_size_read(&inode->v);
+
+ if (page->mapping != mapping || page_offset(page) >= isize) {
unlock_page(page);
ret = VM_FAULT_NOPAGE;
goto out;
}
- if (bch2_get_page_reservation(c, inode, page, true)) {
+ /* page is wholly or partially inside EOF */
+ if (((page->index + 1) << PAGE_SHIFT) <= isize)
+ len = PAGE_SIZE;
+ else
+ len = offset_in_page(isize);
+
+ if (bch2_page_reservation_get(c, inode, page, &res, 0, len, true)) {
unlock_page(page);
ret = VM_FAULT_SIGBUS;
goto out;
}
- if (!PageDirty(page))
- set_page_dirty(page);
+ bch2_set_page_dirty(c, inode, page, &res, 0, len);
wait_for_stable_page(page);
out:
if (current->pagecache_lock != &mapping->add_lock)
pagecache_add_put(&mapping->add_lock);
sb_end_pagefault(inode->v.i_sb);
+
+ bch2_page_reservation_put(c, inode, &res);
+
return ret;
}
@@ -757,53 +865,36 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage,
EBUG_ON(!PageLocked(page));
EBUG_ON(!PageLocked(newpage));
- ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
+ ret = migrate_page_move_mapping(mapping, newpage, page, mode, 0);
if (ret != MIGRATEPAGE_SUCCESS)
return ret;
if (PagePrivate(page)) {
- *page_state(newpage) = *page_state(page);
ClearPagePrivate(page);
+ get_page(newpage);
+ set_page_private(newpage, page_private(page));
+ set_page_private(page, 0);
+ put_page(page);
+ SetPagePrivate(newpage);
}
- migrate_page_copy(newpage, page);
+ if (mode != MIGRATE_SYNC_NO_COPY)
+ migrate_page_copy(newpage, page);
+ else
+ migrate_page_states(newpage, page);
return MIGRATEPAGE_SUCCESS;
}
#endif
-/* readpages/writepages: */
-
-static bool bio_can_add_page_contig(struct bio *bio, struct page *page)
-{
- sector_t offset = (sector_t) page->index << PAGE_SECTOR_SHIFT;
-
- return bio->bi_vcnt < bio->bi_max_vecs &&
- bio_end_sector(bio) == offset;
-}
-
-static int bio_add_page_contig(struct bio *bio, struct page *page)
-{
- sector_t offset = (sector_t) page->index << PAGE_SECTOR_SHIFT;
-
- EBUG_ON(!bio->bi_max_vecs);
-
- if (!bio->bi_vcnt)
- bio->bi_iter.bi_sector = offset;
- else if (!bio_can_add_page_contig(bio, page))
- return -1;
-
- __bio_add_page(bio, page, PAGE_SIZE, 0);
- return 0;
-}
-
/* readpage(s): */
static void bch2_readpages_end_io(struct bio *bio)
{
+ struct bvec_iter_all iter;
struct bio_vec *bv;
int i;
- bio_for_each_segment_all(bv, bio, i) {
+ bio_for_each_segment_all(bv, bio, i, iter) {
struct page *page = bv->bv_page;
if (!bio->bi_status) {
@@ -849,7 +940,8 @@ static int readpages_iter_init(struct readpages_iter *iter,
while (!list_empty(pages)) {
struct page *page = list_last_entry(pages, struct page, lru);
- prefetchw(&page->flags);
+ __bch2_page_state_create(page, __GFP_NOFAIL);
+
iter->pages[iter->nr_pages++] = page;
list_del(&page->lru);
}
@@ -885,6 +977,7 @@ static inline struct page *readpage_iter_next(struct readpages_iter *iter)
iter->idx++;
iter->nr_added++;
+ __bch2_page_state_release(page);
put_page(page);
}
@@ -895,7 +988,6 @@ static inline struct page *readpage_iter_next(struct readpages_iter *iter)
out:
EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx);
- page_state_init_for_read(iter->pages[iter->idx]);
return iter->pages[iter->idx];
}
@@ -903,31 +995,31 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k)
{
struct bvec_iter iter;
struct bio_vec bv;
- unsigned nr_ptrs = bch2_bkey_nr_ptrs_allocated(k);
+ unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v
+ ? 0 : bch2_bkey_nr_ptrs_allocated(k);
+ unsigned state = k.k->type == KEY_TYPE_reservation
+ ? SECTOR_RESERVED
+ : SECTOR_ALLOCATED;
bio_for_each_segment(bv, bio, iter) {
- /* brand new pages, don't need to be locked: */
-
- struct bch_page_state *s = page_state(bv.bv_page);
-
- /* sectors in @k from the start of this page: */
- unsigned k_sectors = k.k->size - (iter.bi_sector - k.k->p.offset);
-
- unsigned page_sectors = min(bv.bv_len >> 9, k_sectors);
-
- s->nr_replicas = page_sectors == PAGE_SECTORS
- ? nr_ptrs : 0;
-
- BUG_ON(s->sectors + page_sectors > PAGE_SECTORS);
- s->sectors += page_sectors;
+ struct bch_page_state *s = bch2_page_state(bv.bv_page);
+ unsigned i;
+
+ for (i = bv.bv_offset >> 9;
+ i < (bv.bv_offset + bv.bv_len) >> 9;
+ i++) {
+ s->s[i].nr_replicas = nr_ptrs;
+ s->s[i].state = state;
+ }
}
}
static void readpage_bio_extend(struct readpages_iter *iter,
- struct bio *bio, u64 offset,
+ struct bio *bio,
+ unsigned sectors_this_extent,
bool get_more)
{
- while (bio_end_sector(bio) < offset &&
+ while (bio_sectors(bio) < sectors_this_extent &&
bio->bi_vcnt < bio->bi_max_vecs) {
pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTOR_SHIFT;
struct page *page = readpage_iter_next(iter);
@@ -942,23 +1034,23 @@ static void readpage_bio_extend(struct readpages_iter *iter,
if (!get_more)
break;
- rcu_read_lock();
- page = radix_tree_lookup(&iter->mapping->i_pages, page_offset);
- rcu_read_unlock();
-
- if (page && !radix_tree_exceptional_entry(page))
+ page = xa_load(&iter->mapping->i_pages, page_offset);
+ if (page && !xa_is_value(page))
break;
page = __page_cache_alloc(readahead_gfp_mask(iter->mapping));
if (!page)
break;
- page_state_init_for_read(page);
+ if (!__bch2_page_state_create(page, 0)) {
+ put_page(page);
+ break;
+ }
ret = add_to_page_cache_lru(page, iter->mapping,
page_offset, GFP_NOFS);
if (ret) {
- ClearPagePrivate(page);
+ __bch2_page_state_release(page);
put_page(page);
break;
}
@@ -966,7 +1058,7 @@ static void readpage_bio_extend(struct readpages_iter *iter,
put_page(page);
}
- __bio_add_page(bio, page, PAGE_SIZE, 0);
+ BUG_ON(!bio_add_page(bio, page, PAGE_SIZE, 0));
}
}
@@ -975,71 +1067,82 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter,
struct readpages_iter *readpages_iter)
{
struct bch_fs *c = trans->c;
- struct bio *bio = &rbio->bio;
int flags = BCH_READ_RETRY_IF_STALE|
BCH_READ_MAY_PROMOTE;
+ int ret = 0;
rbio->c = c;
rbio->start_time = local_clock();
-
+retry:
while (1) {
BKEY_PADDED(k) tmp;
struct bkey_s_c k;
- unsigned bytes;
+ unsigned bytes, sectors, offset_into_extent;
- bch2_btree_iter_set_pos(iter, POS(inum, bio->bi_iter.bi_sector));
+ bch2_btree_iter_set_pos(iter,
+ POS(inum, rbio->bio.bi_iter.bi_sector));
k = bch2_btree_iter_peek_slot(iter);
- BUG_ON(!k.k);
-
- if (IS_ERR(k.k)) {
- int ret = btree_iter_err(iter);
- BUG_ON(!ret);
- bcache_io_error(c, bio, "btree IO error %i", ret);
- bio_endio(bio);
- return;
- }
+ ret = bkey_err(k);
+ if (ret)
+ break;
bkey_reassemble(&tmp.k, k);
- bch2_trans_unlock(trans);
k = bkey_i_to_s_c(&tmp.k);
+ offset_into_extent = iter->pos.offset -
+ bkey_start_offset(k.k);
+ sectors = k.k->size - offset_into_extent;
+
+ ret = bch2_read_indirect_extent(trans,
+ &offset_into_extent, &tmp.k);
+ if (ret)
+ break;
+
+ sectors = min(sectors, k.k->size - offset_into_extent);
+
+ bch2_trans_unlock(trans);
+
if (readpages_iter) {
bool want_full_extent = false;
if (bkey_extent_is_data(k.k)) {
- struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *i;
struct extent_ptr_decoded p;
- extent_for_each_ptr_decode(e, p, i)
+ bkey_for_each_ptr_decode(k.k, ptrs, p, i)
want_full_extent |= ((p.crc.csum_type != 0) |
(p.crc.compression_type != 0));
}
- readpage_bio_extend(readpages_iter,
- bio, k.k->p.offset,
- want_full_extent);
+ readpage_bio_extend(readpages_iter, &rbio->bio,
+ sectors, want_full_extent);
}
- bytes = (min_t(u64, k.k->p.offset, bio_end_sector(bio)) -
- bio->bi_iter.bi_sector) << 9;
- swap(bio->bi_iter.bi_size, bytes);
+ bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
+ swap(rbio->bio.bi_iter.bi_size, bytes);
- if (bytes == bio->bi_iter.bi_size)
+ if (rbio->bio.bi_iter.bi_size == bytes)
flags |= BCH_READ_LAST_FRAGMENT;
if (bkey_extent_is_allocation(k.k))
- bch2_add_page_sectors(bio, k);
+ bch2_add_page_sectors(&rbio->bio, k);
- bch2_read_extent(c, rbio, k, flags);
+ bch2_read_extent(c, rbio, k, offset_into_extent, flags);
if (flags & BCH_READ_LAST_FRAGMENT)
return;
- swap(bio->bi_iter.bi_size, bytes);
- bio_advance(bio, bytes);
+ swap(rbio->bio.bi_iter.bi_size, bytes);
+ bio_advance(&rbio->bio, bytes);
}
+
+ if (ret == -EINTR)
+ goto retry;
+
+ bcache_io_error(c, &rbio->bio, "btree IO error %i", ret);
+ bio_endio(&rbio->bio);
}
int bch2_readpages(struct file *file, struct address_space *mapping,
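
bchfs_read() now retries on -EINTR and, before issuing the read, resolves indirect extents: it computes the offset into the current key and lets bch2_read_indirect_extent() redirect that offset into the shared reflink extent. A toy sketch of the indirection step (illustrative only; the struct is a stand-in, not the real reflink_p layout):

#include <stdio.h>
#include <stdint.h>

struct toy_reflink_p {
	uint64_t start;		/* logical start of the extent */
	uint64_t size;		/* extent size, in sectors */
	uint64_t idx;		/* index into the shared (indirect) region */
};

/*
 * A reflink_p-style key carries no data pointers of its own; reads go
 * through its index into the shared region, offset by how far into the
 * extent the requested sector falls.
 */
static uint64_t toy_resolve_indirect(const struct toy_reflink_p *p,
				     uint64_t logical_sector)
{
	uint64_t offset_into_extent = logical_sector - p->start;

	return p->idx + offset_into_extent;
}

int main(void)
{
	struct toy_reflink_p p = { .start = 1000, .size = 16, .idx = 52000 };

	printf("read sector 1003 from shared sector %llu\n",
	       (unsigned long long) toy_resolve_indirect(&p, 1003));
	return 0;
}
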
@@ -1080,7 +1183,7 @@ int bch2_readpages(struct file *file, struct address_space *mapping,
bio_set_op_attrs(&rbio->bio, REQ_OP_READ, 0);
rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTOR_SHIFT;
rbio->bio.bi_end_io = bch2_readpages_end_io;
- __bio_add_page(&rbio->bio, page, PAGE_SIZE, 0);
+ BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0));
bchfs_read(&trans, iter, rbio, inode->v.i_ino,
&readpages_iter);
@@ -1101,10 +1204,12 @@ static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio,
struct btree_trans trans;
struct btree_iter *iter;
- page_state_init_for_read(page);
+ bch2_page_state_create(page, __GFP_NOFAIL);
bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC);
- bio_add_page_contig(&rbio->bio, page);
+ rbio->bio.bi_iter.bi_sector =
+ (sector_t) page->index << PAGE_SECTOR_SHIFT;
+ BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0));
bch2_trans_init(&trans, c, 0, 0);
iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN,
@@ -1188,13 +1293,22 @@ static void bch2_writepage_io_done(struct closure *cl)
struct bch_writepage_io, cl);
struct bch_fs *c = io->op.op.c;
struct bio *bio = &io->op.op.wbio.bio;
+ struct bvec_iter_all iter;
struct bio_vec *bvec;
- unsigned i;
+ unsigned i, j;
if (io->op.op.error) {
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter) {
+ struct bch_page_state *s;
+
SetPageError(bvec->bv_page);
mapping_set_error(bvec->bv_page->mapping, -EIO);
+
+ lock_page(bvec->bv_page);
+ s = bch2_page_state(bvec->bv_page);
+ for (j = 0; j < PAGE_SECTORS; j++)
+ s->s[j].nr_replicas = 0;
+ unlock_page(bvec->bv_page);
}
}
@@ -1219,8 +1333,12 @@ static void bch2_writepage_io_done(struct closure *cl)
i_sectors_acct(c, io->op.inode, NULL,
io->op.sectors_added - (s64) io->new_sectors);
- bio_for_each_segment_all(bvec, bio, i)
- end_page_writeback(bvec->bv_page);
+ bio_for_each_segment_all(bvec, bio, i, iter) {
+ struct bch_page_state *s = __bch2_page_state(bvec->bv_page);
+
+ if (atomic_dec_and_test(&s->write_count))
+ end_page_writeback(bvec->bv_page);
+ }
closure_return_with_destructor(&io->cl, bch2_writepage_io_free);
}
@@ -1241,11 +1359,10 @@ static void bch2_writepage_do_io(struct bch_writepage_state *w)
static void bch2_writepage_io_alloc(struct bch_fs *c,
struct bch_writepage_state *w,
struct bch_inode_info *inode,
- struct page *page,
+ u64 sector,
unsigned nr_replicas)
{
struct bch_write_op *op;
- u64 offset = (u64) page->index << PAGE_SECTOR_SHIFT;
w->io = container_of(bio_alloc_bioset(GFP_NOFS,
BIO_MAX_PAGES,
@@ -1259,8 +1376,8 @@ static void bch2_writepage_io_alloc(struct bch_fs *c,
op->nr_replicas = nr_replicas;
op->res.nr_replicas = nr_replicas;
op->write_point = writepoint_hashed(inode->ei_last_dirtied);
- op->pos = POS(inode->v.i_ino, offset);
- op->wbio.bio.bi_iter.bi_sector = offset;
+ op->pos = POS(inode->v.i_ino, sector);
+ op->wbio.bio.bi_iter.bi_sector = sector;
}
static int __bch2_writepage(struct page *page,
@@ -1270,10 +1387,11 @@ static int __bch2_writepage(struct page *page,
struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_writepage_state *w = data;
- struct bch_page_state new, old;
- unsigned offset, nr_replicas_this_write;
+ struct bch_page_state *s, orig;
+ unsigned i, offset, nr_replicas_this_write = U32_MAX;
loff_t i_size = i_size_read(&inode->v);
pgoff_t end_index = i_size >> PAGE_SHIFT;
+ int ret;
EBUG_ON(!PageUptodate(page));
@@ -1297,57 +1415,104 @@ static int __bch2_writepage(struct page *page,
*/
zero_user_segment(page, offset, PAGE_SIZE);
do_io:
- EBUG_ON(!PageLocked(page));
+ s = bch2_page_state_create(page, __GFP_NOFAIL);
+
+ ret = bch2_get_page_disk_reservation(c, inode, page, true);
+ if (ret) {
+ SetPageError(page);
+ mapping_set_error(page->mapping, ret);
+ unlock_page(page);
+ return 0;
+ }
+
+	/* Before unlocking the page, take a copy of the reservations: */
+ orig = *s;
+
+ for (i = 0; i < PAGE_SECTORS; i++) {
+ if (s->s[i].state < SECTOR_DIRTY)
+ continue;
- /* Before unlocking the page, transfer reservation to w->io: */
- old = page_state_cmpxchg(page_state(page), new, {
- /*
- * If we didn't get a reservation, we can only write out the
- * number of (fully allocated) replicas that currently exist,
- * and only if the entire page has been written:
- */
nr_replicas_this_write =
- max_t(unsigned,
- new.replicas_reserved,
- (new.sectors == PAGE_SECTORS
- ? new.nr_replicas : 0));
+ min_t(unsigned, nr_replicas_this_write,
+ s->s[i].nr_replicas +
+ s->s[i].replicas_reserved);
+ }
- BUG_ON(!nr_replicas_this_write);
+ for (i = 0; i < PAGE_SECTORS; i++) {
+ if (s->s[i].state < SECTOR_DIRTY)
+ continue;
- new.nr_replicas = w->opts.compression
- ? 0
- : nr_replicas_this_write;
+ s->s[i].nr_replicas = w->opts.compression
+ ? 0 : nr_replicas_this_write;
- new.replicas_reserved = 0;
+ s->s[i].replicas_reserved = 0;
+ s->s[i].state = SECTOR_ALLOCATED;
+ }
- new.sectors += new.dirty_sectors;
- BUG_ON(new.sectors != PAGE_SECTORS);
- new.dirty_sectors = 0;
- });
+ BUG_ON(atomic_read(&s->write_count));
+ atomic_set(&s->write_count, 1);
BUG_ON(PageWriteback(page));
set_page_writeback(page);
+
unlock_page(page);
- if (w->io &&
- (w->io->op.op.res.nr_replicas != nr_replicas_this_write ||
- !bio_can_add_page_contig(&w->io->op.op.wbio.bio, page)))
- bch2_writepage_do_io(w);
+ offset = 0;
+ while (1) {
+ unsigned sectors = 1, dirty_sectors = 0, reserved_sectors = 0;
+ u64 sector;
+
+ while (offset < PAGE_SECTORS &&
+ orig.s[offset].state < SECTOR_DIRTY)
+ offset++;
- if (!w->io)
- bch2_writepage_io_alloc(c, w, inode, page,
- nr_replicas_this_write);
+ if (offset == PAGE_SECTORS)
+ break;
+
+ sector = ((u64) page->index << PAGE_SECTOR_SHIFT) + offset;
- w->io->new_sectors += new.sectors - old.sectors;
+ while (offset + sectors < PAGE_SECTORS &&
+ orig.s[offset + sectors].state >= SECTOR_DIRTY)
+ sectors++;
- BUG_ON(inode != w->io->op.inode);
- BUG_ON(bio_add_page_contig(&w->io->op.op.wbio.bio, page));
+ for (i = offset; i < offset + sectors; i++) {
+ reserved_sectors += orig.s[i].replicas_reserved;
+ dirty_sectors += orig.s[i].state == SECTOR_DIRTY;
+ }
- w->io->op.op.res.sectors += old.replicas_reserved * PAGE_SECTORS;
- w->io->op.new_i_size = i_size;
+ if (w->io &&
+ (w->io->op.op.res.nr_replicas != nr_replicas_this_write ||
+ bio_full(&w->io->op.op.wbio.bio) ||
+ bio_end_sector(&w->io->op.op.wbio.bio) != sector))
+ bch2_writepage_do_io(w);
- if (wbc->sync_mode == WB_SYNC_ALL)
- w->io->op.op.wbio.bio.bi_opf |= REQ_SYNC;
+ if (!w->io)
+ bch2_writepage_io_alloc(c, w, inode, sector,
+ nr_replicas_this_write);
+
+ w->io->new_sectors += dirty_sectors;
+
+ atomic_inc(&s->write_count);
+
+ BUG_ON(inode != w->io->op.inode);
+ BUG_ON(!bio_add_page(&w->io->op.op.wbio.bio, page,
+ sectors << 9, offset << 9));
+
+ /* Check for writing past i_size: */
+ BUG_ON((bio_end_sector(&w->io->op.op.wbio.bio) << 9) >
+ round_up(i_size, block_bytes(c)));
+
+ w->io->op.op.res.sectors += reserved_sectors;
+ w->io->op.new_i_size = i_size;
+
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ w->io->op.op.wbio.bio.bi_opf |= REQ_SYNC;
+
+ offset += sectors;
+ }
+
+ if (atomic_dec_and_test(&s->write_count))
+ end_page_writeback(page);
return 0;
}
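The rewritten __bch2_writepage() first takes the minimum of nr_replicas + replicas_reserved over every dirty sector (the write can only promise what its weakest sector has), then submits one fragment per contiguous run of dirty sectors. A standalone sketch of both loops, using an invented sector array in place of bch_page_state:

#include <limits.h>
#include <stdio.h>

#define SECTORS		8	/* PAGE_SECTORS stand-in */
#define SECTOR_DIRTY	2	/* sectors >= this value need writeout */

struct sector {
	unsigned state;
	unsigned nr_replicas;
	unsigned replicas_reserved;
};

int main(void)
{
	struct sector s[SECTORS] = {
		{ 1, 0, 0 }, { 2, 1, 1 }, { 2, 2, 0 }, { 1, 0, 0 },
		{ 2, 1, 2 }, { 2, 1, 1 }, { 1, 0, 0 }, { 2, 3, 0 },
	};
	unsigned i, offset, nr_replicas_this_write = UINT_MAX;

	/* The write as a whole can only guarantee the weakest dirty sector: */
	for (i = 0; i < SECTORS; i++)
		if (s[i].state >= SECTOR_DIRTY &&
		    s[i].nr_replicas + s[i].replicas_reserved <
		    nr_replicas_this_write)
			nr_replicas_this_write =
				s[i].nr_replicas + s[i].replicas_reserved;

	printf("nr_replicas_this_write = %u\n", nr_replicas_this_write);

	/* Walk contiguous runs of dirty sectors, one fragment per run: */
	offset = 0;
	while (offset < SECTORS) {
		unsigned sectors = 1;

		while (offset < SECTORS && s[offset].state < SECTOR_DIRTY)
			offset++;
		if (offset == SECTORS)
			break;

		while (offset + sectors < SECTORS &&
		       s[offset + sectors].state >= SECTOR_DIRTY)
			sectors++;

		printf("fragment: sectors %u..%u\n", offset, offset + sectors - 1);
		offset += sectors;
	}
	return 0;
}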
@@ -1390,12 +1555,18 @@ int bch2_write_begin(struct file *file, struct address_space *mapping,
{
struct bch_inode_info *inode = to_bch_ei(mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch2_page_reservation *res;
pgoff_t index = pos >> PAGE_SHIFT;
unsigned offset = pos & (PAGE_SIZE - 1);
struct page *page;
int ret = -ENOMEM;
- BUG_ON(inode_unhashed(&inode->v));
+ res = kmalloc(sizeof(*res), GFP_KERNEL);
+ if (!res)
+ return -ENOMEM;
+
+ bch2_page_reservation_init(c, inode, res);
+ *fsdata = res;
/* Not strictly necessary - same reason as mkwrite(): */
pagecache_add_get(&mapping->add_lock);
@@ -1427,7 +1598,8 @@ readpage:
if (ret)
goto err;
out:
- ret = bch2_get_page_reservation(c, inode, page, true);
+ ret = bch2_page_reservation_get(c, inode, page, res,
+ offset, len, true);
if (ret) {
if (!PageUptodate(page)) {
/*
@@ -1450,6 +1622,8 @@ err:
*pagep = NULL;
err_unlock:
pagecache_add_put(&mapping->add_lock);
+ kfree(res);
+ *fsdata = NULL;
return ret;
}
@@ -1459,6 +1633,8 @@ int bch2_write_end(struct file *file, struct address_space *mapping,
{
struct bch_inode_info *inode = to_bch_ei(mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch2_page_reservation *res = fsdata;
+ unsigned offset = pos & (PAGE_SIZE - 1);
lockdep_assert_held(&inode->v.i_rwsem);
@@ -1481,18 +1657,19 @@ int bch2_write_end(struct file *file, struct address_space *mapping,
if (copied) {
if (!PageUptodate(page))
SetPageUptodate(page);
- if (!PageDirty(page))
- set_page_dirty(page);
+
+ bch2_set_page_dirty(c, inode, page, res, offset, copied);
inode->ei_last_dirtied = (unsigned long) current;
- } else {
- bch2_put_page_reservation(c, inode, page);
}
unlock_page(page);
put_page(page);
pagecache_add_put(&mapping->add_lock);
+ bch2_page_reservation_put(c, inode, res);
+ kfree(res);
+
return copied;
}
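bch2_write_begin() now allocates the page reservation and hands it to bch2_write_end() through the VFS fsdata cookie, which is also why the error path frees it and clears *fsdata. A toy sketch of that begin/end handshake; struct and function names here are invented, not bcachefs API:

#include <stdlib.h>
#include <stdio.h>

/* Illustrative stand-in for bch2_page_reservation; not the real struct. */
struct reservation {
	unsigned sectors;
};

/* "write_begin": allocate per-write state and pass it out via *fsdata. */
static int my_write_begin(unsigned len, void **fsdata)
{
	struct reservation *res = malloc(sizeof(*res));

	if (!res)
		return -1;

	res->sectors = (len + 511) / 512;
	*fsdata = res;
	return 0;
}

/* "write_end": consume whatever write_begin handed us, then free it. */
static void my_write_end(unsigned copied, void *fsdata)
{
	struct reservation *res = fsdata;

	printf("dirtied %u of %u reserved sectors\n",
	       (copied + 511) / 512, res->sectors);
	free(res);
}

int main(void)
{
	void *fsdata;

	if (my_write_begin(4096, &fsdata))
		return 1;
	my_write_end(1024, fsdata);
	return 0;
}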
@@ -1505,15 +1682,19 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct page *pages[WRITE_BATCH_PAGES];
+ struct bch2_page_reservation res;
unsigned long index = pos >> PAGE_SHIFT;
unsigned offset = pos & (PAGE_SIZE - 1);
unsigned nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
- unsigned i, copied = 0, nr_pages_copied = 0;
+ unsigned i, reserved = 0, set_dirty = 0;
+ unsigned copied = 0, nr_pages_copied = 0;
int ret = 0;
BUG_ON(!len);
BUG_ON(nr_pages > ARRAY_SIZE(pages));
+ bch2_page_reservation_init(c, inode, &res);
+
for (i = 0; i < nr_pages; i++) {
pages[i] = grab_cache_page_write_begin(mapping, index + i, 0);
if (!pages[i]) {
@@ -1540,19 +1721,25 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
}
}
- for (i = 0; i < nr_pages; i++) {
- ret = bch2_get_page_reservation(c, inode, pages[i], true);
-
- if (ret && !PageUptodate(pages[i])) {
- ret = bch2_read_single_page(pages[i], mapping);
- if (ret)
- goto out;
-
- ret = bch2_get_page_reservation(c, inode, pages[i], true);
+ while (reserved < len) {
+ struct page *page = pages[(offset + reserved) >> PAGE_SHIFT];
+ unsigned pg_offset = (offset + reserved) & (PAGE_SIZE - 1);
+ unsigned pg_len = min_t(unsigned, len - reserved,
+ PAGE_SIZE - pg_offset);
+retry_reservation:
+ ret = bch2_page_reservation_get(c, inode, page, &res,
+ pg_offset, pg_len, true);
+
+ if (ret && !PageUptodate(page)) {
+ ret = bch2_read_single_page(page, mapping);
+ if (!ret)
+ goto retry_reservation;
}
if (ret)
goto out;
+
+ reserved += pg_len;
}
if (mapping_writably_mapped(mapping))
@@ -1562,10 +1749,10 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
while (copied < len) {
struct page *page = pages[(offset + copied) >> PAGE_SHIFT];
unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1);
- unsigned pg_bytes = min_t(unsigned, len - copied,
- PAGE_SIZE - pg_offset);
+ unsigned pg_len = min_t(unsigned, len - copied,
+ PAGE_SIZE - pg_offset);
unsigned pg_copied = iov_iter_copy_from_user_atomic(page,
- iter, pg_offset, pg_bytes);
+ iter, pg_offset, pg_len);
if (!pg_copied)
break;
@@ -1595,23 +1782,30 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
copied -= (offset + copied) & (PAGE_SIZE - 1);
}
}
-out:
- for (i = 0; i < nr_pages_copied; i++) {
- if (!PageUptodate(pages[i]))
- SetPageUptodate(pages[i]);
- if (!PageDirty(pages[i]))
- set_page_dirty(pages[i]);
- unlock_page(pages[i]);
- put_page(pages[i]);
- }
+ while (set_dirty < copied) {
+ struct page *page = pages[(offset + set_dirty) >> PAGE_SHIFT];
+ unsigned pg_offset = (offset + set_dirty) & (PAGE_SIZE - 1);
+ unsigned pg_len = min_t(unsigned, copied - set_dirty,
+ PAGE_SIZE - pg_offset);
+
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
+
+ bch2_set_page_dirty(c, inode, page, &res, pg_offset, pg_len);
+ unlock_page(page);
+ put_page(page);
+
+ set_dirty += pg_len;
+ }
+out:
for (i = nr_pages_copied; i < nr_pages; i++) {
- if (!PageDirty(pages[i]))
- bch2_put_page_reservation(c, inode, pages[i]);
unlock_page(pages[i]);
put_page(pages[i]);
}
+ bch2_page_reservation_put(c, inode, &res);
+
return copied ?: ret;
}
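Each pass in __bch2_buffered_write() (reserve, copy, set dirty) walks the byte range the same way: clamp the current step to what is left of the page, advance, repeat. The pg_offset/pg_len arithmetic in isolation, assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_SIZE 4096u

static unsigned min_u(unsigned a, unsigned b) { return a < b ? a : b; }

int main(void)
{
	/* Byte range within the batch: starts mid-page, spans three pages. */
	unsigned offset = 1000, len = 9000, done = 0;

	while (done < len) {
		unsigned page_idx  = (offset + done) / PAGE_SIZE;
		unsigned pg_offset = (offset + done) & (PAGE_SIZE - 1);
		unsigned pg_len    = min_u(len - done, PAGE_SIZE - pg_offset);

		/* reserve/copy/dirty would operate on pages[page_idx] here */
		printf("page %u: offset %u, len %u\n", page_idx, pg_offset, pg_len);
		done += pg_len;
	}
	return 0;
}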
@@ -1816,6 +2010,7 @@ static long bch2_dio_write_loop(struct dio_write *dio)
struct address_space *mapping = req->ki_filp->f_mapping;
struct bch_inode_info *inode = dio->iop.inode;
struct bio *bio = &dio->iop.op.wbio.bio;
+ struct bvec_iter_all iter;
struct bio_vec *bv;
loff_t offset;
bool sync;
@@ -1893,7 +2088,7 @@ err_wait_io:
closure_sync(&dio->cl);
loop:
- bio_for_each_segment_all(bv, bio, i)
+ bio_for_each_segment_all(bv, bio, i, iter)
put_page(bv->bv_page);
if (!dio->iter.count || dio->iop.op.error)
break;
@@ -2093,29 +2288,25 @@ out:
/* truncate: */
-static int __bch2_fpunch(struct bch_fs *c, struct bch_inode_info *inode,
- u64 start_offset, u64 end_offset, u64 *journal_seq)
+int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos end, struct bch_inode_info *inode,
+ u64 new_i_size)
{
- struct bpos start = POS(inode->v.i_ino, start_offset);
- struct bpos end = POS(inode->v.i_ino, end_offset);
+ struct bch_fs *c = trans->c;
unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
- struct btree_trans trans;
- struct btree_iter *iter;
struct bkey_s_c k;
- int ret = 0;
-
- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
-
- iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, start,
- BTREE_ITER_INTENT);
+ int ret = 0, ret2 = 0;
while ((k = bch2_btree_iter_peek(iter)).k &&
- !(ret = bkey_err(k)) &&
bkey_cmp(iter->pos, end) < 0) {
struct disk_reservation disk_res =
bch2_disk_reservation_init(c, 0);
struct bkey_i delete;
+ ret = bkey_err(k);
+ if (ret)
+ goto btree_err;
+
bkey_init(&delete.k);
delete.k.p = iter->pos;
@@ -2123,21 +2314,51 @@ static int __bch2_fpunch(struct bch_fs *c, struct bch_inode_info *inode,
bch2_key_resize(&delete.k, max_sectors);
bch2_cut_back(end, &delete.k);
- ret = bch2_extent_update(&trans, inode,
+ bch2_trans_begin_updates(trans);
+
+ ret = bch2_extent_update(trans, inode,
&disk_res, NULL, iter, &delete,
- 0, true, true, NULL);
+ new_i_size, false, true, NULL);
bch2_disk_reservation_put(c, &disk_res);
-
- if (ret == -EINTR)
+btree_err:
+ if (ret == -EINTR) {
+ ret2 = ret;
ret = 0;
+ }
if (ret)
break;
+ }
- bch2_trans_cond_resched(&trans);
+ if (bkey_cmp(iter->pos, end) > 0) {
+ bch2_btree_iter_set_pos(iter, end);
+ ret = bch2_btree_iter_traverse(iter);
}
+ return ret ?: ret2;
+}
+
+static int __bch2_fpunch(struct bch_fs *c, struct bch_inode_info *inode,
+ u64 start_offset, u64 end_offset)
+{
+ struct btree_trans trans;
+ struct btree_iter *iter;
+ int ret = 0;
+
+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
+
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+ POS(inode->v.i_ino, start_offset),
+ BTREE_ITER_INTENT);
+
+ ret = bch2_fpunch_at(&trans, iter,
+ POS(inode->v.i_ino, end_offset),
+ inode, 0);
+
bch2_trans_exit(&trans);
+ if (ret == -EINTR)
+ ret = 0;
+
return ret;
}
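bch2_fpunch_at() deletes the range by inserting whiteout keys, each resized to at most max_sectors and cut back at the end of the range, remembering -EINTR so the caller can restart. The clamping on its own looks like this (constants invented for the sketch, not the real key machinery):

#include <stdio.h>

/* Delete [start, end) in chunks of at most max_sectors, the same
 * clamping bch2_fpunch_at() applies per key. */
int main(void)
{
	unsigned long long start = 100, end = 1000;
	unsigned long long max_sectors = 256;	/* KEY_SIZE_MAX analogue */
	unsigned long long pos = start;

	while (pos < end) {
		unsigned long long size = end - pos;

		if (size > max_sectors)
			size = max_sectors;	/* bch2_key_resize() */

		/* a whiteout key covering [pos, pos + size) would be
		 * inserted here via bch2_extent_update() */
		printf("delete [%llu, %llu)\n", pos, pos + size);
		pos += size;
	}
	return 0;
}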
@@ -2170,8 +2391,10 @@ static int __bch2_truncate_page(struct bch_inode_info *inode,
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct address_space *mapping = inode->v.i_mapping;
+ struct bch_page_state *s;
unsigned start_offset = start & (PAGE_SIZE - 1);
unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1;
+ unsigned i;
struct page *page;
int ret = 0;
@@ -2203,31 +2426,42 @@ static int __bch2_truncate_page(struct bch_inode_info *inode,
}
}
+ s = bch2_page_state_create(page, 0);
+ if (!s) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+
if (!PageUptodate(page)) {
ret = bch2_read_single_page(page, mapping);
if (ret)
goto unlock;
}
+ if (index != start >> PAGE_SHIFT)
+ start_offset = 0;
+ if (index != end >> PAGE_SHIFT)
+ end_offset = PAGE_SIZE;
+
+ for (i = round_up(start_offset, block_bytes(c)) >> 9;
+ i < round_down(end_offset, block_bytes(c)) >> 9;
+ i++) {
+ s->s[i].nr_replicas = 0;
+ s->s[i].state = SECTOR_UNALLOCATED;
+ }
+
+ zero_user_segment(page, start_offset, end_offset);
+
/*
* Bit of a hack - we don't want truncate to fail due to -ENOSPC.
*
* XXX: because we aren't currently tracking whether the page has actual
* data in it (vs. just 0s, or only partially written) this is wrong. ick.
*/
- ret = bch2_get_page_reservation(c, inode, page, false);
+ ret = bch2_get_page_disk_reservation(c, inode, page, false);
BUG_ON(ret);
- if (index == start >> PAGE_SHIFT &&
- index == end >> PAGE_SHIFT)
- zero_user_segment(page, start_offset, end_offset);
- else if (index == start >> PAGE_SHIFT)
- zero_user_segment(page, start_offset, PAGE_SIZE);
- else if (index == end >> PAGE_SHIFT)
- zero_user_segment(page, 0, end_offset);
-
- if (!PageDirty(page))
- set_page_dirty(page);
+ __set_page_dirty_nobuffers(page);
unlock:
unlock_page(page);
put_page(page);
@@ -2238,7 +2472,7 @@ out:
static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from)
{
return __bch2_truncate_page(inode, from >> PAGE_SHIFT,
- from, from + PAGE_SIZE);
+ from, round_up(from, PAGE_SIZE));
}
static int bch2_extend(struct bch_inode_info *inode, struct iattr *iattr)
@@ -2308,6 +2542,16 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
if (unlikely(ret))
goto err;
+ /*
+ * When extending, we're going to write the new i_size to disk
+	 * immediately, so we need to flush anything above the current on-disk
+ * i_size first:
+ *
+ * Also, when extending we need to flush the page that i_size currently
+	 * straddles - if it's mapped to userspace, we need to force
+	 * userspace to redirty it and call .mkwrite -> set_page_dirty
+	 * again to allocate the part of the page that was extended.
+ */
if (iattr->ia_size > inode->ei_inode.bi_size)
ret = filemap_write_and_wait_range(mapping,
inode->ei_inode.bi_size,
@@ -2329,13 +2573,9 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
truncate_setsize(&inode->v, iattr->ia_size);
- /*
- * XXX: need a comment explaining why PAGE_SIZE and not block_bytes()
- * here:
- */
ret = __bch2_fpunch(c, inode,
- round_up(iattr->ia_size, PAGE_SIZE) >> 9,
- U64_MAX, &inode->ei_journal_seq);
+ round_up(iattr->ia_size, block_bytes(c)) >> 9,
+ U64_MAX);
if (unlikely(ret))
goto err;
@@ -2356,8 +2596,8 @@ static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct address_space *mapping = inode->v.i_mapping;
- u64 discard_start = round_up(offset, PAGE_SIZE) >> 9;
- u64 discard_end = round_down(offset + len, PAGE_SIZE) >> 9;
+ u64 discard_start = round_up(offset, block_bytes(c)) >> 9;
+ u64 discard_end = round_down(offset + len, block_bytes(c)) >> 9;
int ret = 0;
inode_lock(&inode->v);
@@ -2382,8 +2622,7 @@ static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
truncate_pagecache_range(&inode->v, offset, offset + len - 1);
if (discard_start < discard_end)
- ret = __bch2_fpunch(c, inode, discard_start, discard_end,
- &inode->ei_journal_seq);
+ ret = __bch2_fpunch(c, inode, discard_start, discard_end);
err:
pagecache_block_put(&mapping->add_lock);
inode_unlock(&inode->v);
@@ -2391,16 +2630,16 @@ err:
return ret;
}
-static long bch2_fcollapse(struct bch_inode_info *inode,
- loff_t offset, loff_t len)
+static long bch2_fcollapse_finsert(struct bch_inode_info *inode,
+ loff_t offset, loff_t len,
+ bool insert)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct address_space *mapping = inode->v.i_mapping;
struct btree_trans trans;
- struct btree_iter *src, *dst;
- BKEY_PADDED(k) copy;
- struct bkey_s_c k;
- loff_t new_size;
+ struct btree_iter *src, *dst, *del = NULL;
+ loff_t shift, new_size;
+ u64 src_start;
int ret;
if ((offset | len) & (block_bytes(c) - 1))
@@ -2418,88 +2657,188 @@ static long bch2_fcollapse(struct bch_inode_info *inode,
inode_dio_wait(&inode->v);
pagecache_block_get(&mapping->add_lock);
- ret = -EINVAL;
- if (offset + len >= inode->v.i_size)
- goto err;
+ if (insert) {
+ ret = -EFBIG;
+ if (inode->v.i_sb->s_maxbytes - inode->v.i_size < len)
+ goto err;
- if (inode->v.i_size < len)
- goto err;
+ ret = -EINVAL;
+ if (offset >= inode->v.i_size)
+ goto err;
+
+ src_start = U64_MAX;
+ shift = len;
+ } else {
+ ret = -EINVAL;
+ if (offset + len >= inode->v.i_size)
+ goto err;
+
+ src_start = offset + len;
+ shift = -len;
+ }
- new_size = inode->v.i_size - len;
+ new_size = inode->v.i_size + shift;
ret = write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX);
if (ret)
goto err;
- dst = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
- POS(inode->v.i_ino, offset >> 9),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- BUG_ON(IS_ERR_OR_NULL(dst));
+ if (insert) {
+ i_size_write(&inode->v, new_size);
+ mutex_lock(&inode->ei_update_lock);
+ ret = bch2_write_inode_size(c, inode, new_size,
+ ATTR_MTIME|ATTR_CTIME);
+ mutex_unlock(&inode->ei_update_lock);
+ } else {
+ ret = __bch2_fpunch(c, inode, offset >> 9,
+ (offset + len) >> 9);
+ if (ret)
+ goto err;
+ }
src = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
- POS_MIN, BTREE_ITER_SLOTS);
+ POS(inode->v.i_ino, src_start >> 9),
+ BTREE_ITER_INTENT);
BUG_ON(IS_ERR_OR_NULL(src));
- while (bkey_cmp(dst->pos,
- POS(inode->v.i_ino,
- round_up(new_size, PAGE_SIZE) >> 9)) < 0) {
- struct disk_reservation disk_res;
+ dst = bch2_trans_copy_iter(&trans, src);
+ BUG_ON(IS_ERR_OR_NULL(dst));
+
+ while (1) {
+ struct disk_reservation disk_res =
+ bch2_disk_reservation_init(c, 0);
+ BKEY_PADDED(k) copy;
+ struct bkey_i delete;
+ struct bkey_s_c k;
+ struct bpos next_pos;
+ struct bpos move_pos = POS(inode->v.i_ino, offset >> 9);
+ struct bpos atomic_end;
+ unsigned commit_flags = BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_USE_RESERVE;
+
+ k = insert
+ ? bch2_btree_iter_peek_prev(src)
+ : bch2_btree_iter_peek(src);
+ if ((ret = bkey_err(k)))
+ goto bkey_err;
+
+ if (!k.k || k.k->p.inode != inode->v.i_ino)
+ break;
+
+ BUG_ON(bkey_cmp(src->pos, bkey_start_pos(k.k)));
+
+ if (insert &&
+ bkey_cmp(k.k->p, POS(inode->v.i_ino, offset >> 9)) <= 0)
+ break;
+reassemble:
+ bkey_reassemble(&copy.k, k);
+
+ if (insert &&
+ bkey_cmp(bkey_start_pos(k.k), move_pos) < 0) {
+ bch2_cut_front(move_pos, &copy.k);
+ bch2_btree_iter_set_pos(src, bkey_start_pos(&copy.k.k));
+ }
+
+ copy.k.k.p.offset += shift >> 9;
+ bch2_btree_iter_set_pos(dst, bkey_start_pos(&copy.k.k));
ret = bch2_btree_iter_traverse(dst);
if (ret)
goto bkey_err;
- bch2_btree_iter_set_pos(src,
- POS(dst->pos.inode, dst->pos.offset + (len >> 9)));
-
- k = bch2_btree_iter_peek_slot(src);
- if ((ret = bkey_err(k)))
+ ret = bch2_extent_atomic_end(dst, &copy.k, &atomic_end);
+ if (ret)
goto bkey_err;
- bkey_reassemble(&copy.k, k);
+ if (bkey_cmp(atomic_end, copy.k.k.p)) {
+ if (insert) {
+ move_pos = atomic_end;
+ move_pos.offset -= shift >> 9;
+ goto reassemble;
+ } else {
+ bch2_cut_back(atomic_end, &copy.k.k);
+ }
+ }
- bch2_cut_front(src->pos, &copy.k);
- copy.k.k.p.offset -= len >> 9;
+ bkey_init(&delete.k);
+ delete.k.p = src->pos;
+ bch2_key_resize(&delete.k, copy.k.k.size);
+
+ next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p;
- bch2_extent_trim_atomic(&copy.k, dst);
+ /*
+ * If the new and old keys overlap (because we're moving an
+ * extent that's bigger than the amount we're collapsing by),
+	 * we need to trim the delete key here so they don't overlap;
+	 * overlaps on insertions aren't handled before triggers are
+	 * run, so the overwrite would get double-counted by the
+	 * triggers machinery:
+ */
+ if (insert &&
+ bkey_cmp(bkey_start_pos(&copy.k.k), delete.k.p) < 0) {
+ bch2_cut_back(bkey_start_pos(&copy.k.k), &delete.k);
+ } else if (!insert &&
+ bkey_cmp(copy.k.k.p,
+ bkey_start_pos(&delete.k)) > 0) {
+ bch2_cut_front(copy.k.k.p, &delete);
+
+ del = bch2_trans_copy_iter(&trans, src);
+ BUG_ON(IS_ERR_OR_NULL(del));
+
+ bch2_btree_iter_set_pos(del,
+ bkey_start_pos(&delete.k));
+ }
- BUG_ON(bkey_cmp(dst->pos, bkey_start_pos(&copy.k.k)));
+ bch2_trans_update(&trans, BTREE_INSERT_ENTRY(dst, &copy.k));
+ bch2_trans_update(&trans,
+ BTREE_INSERT_ENTRY(del ?: src, &delete));
- ret = bch2_disk_reservation_get(c, &disk_res, copy.k.k.size,
- bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&copy.k)),
- BCH_DISK_RESERVATION_NOFAIL);
- BUG_ON(ret);
+ if (copy.k.k.size == k.k->size) {
+ /*
+ * If we're moving the entire extent, we can skip
+ * running triggers:
+ */
+ commit_flags |= BTREE_INSERT_NOMARK;
+ } else {
+ /* We might end up splitting compressed extents: */
+ unsigned nr_ptrs =
+ bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&copy.k));
+
+ ret = bch2_disk_reservation_get(c, &disk_res,
+ copy.k.k.size, nr_ptrs,
+ BCH_DISK_RESERVATION_NOFAIL);
+ BUG_ON(ret);
+ }
- ret = bch2_extent_update(&trans, inode,
- &disk_res, NULL,
- dst, &copy.k,
- 0, true, true, NULL);
+ ret = bch2_trans_commit(&trans, &disk_res,
+ &inode->ei_journal_seq,
+ commit_flags);
bch2_disk_reservation_put(c, &disk_res);
bkey_err:
+ if (del)
+ bch2_trans_iter_free(&trans, del);
+ del = NULL;
+
+ if (!ret)
+ bch2_btree_iter_set_pos(src, next_pos);
+
if (ret == -EINTR)
ret = 0;
if (ret)
goto err;
- /*
- * XXX: if we error here we've left data with multiple
- * pointers... which isn't a _super_ serious problem...
- */
bch2_trans_cond_resched(&trans);
}
bch2_trans_unlock(&trans);
- ret = __bch2_fpunch(c, inode,
- round_up(new_size, block_bytes(c)) >> 9,
- U64_MAX, &inode->ei_journal_seq);
- if (ret)
- goto err;
-
- i_size_write(&inode->v, new_size);
- mutex_lock(&inode->ei_update_lock);
- ret = bch2_write_inode_size(c, inode, new_size,
- ATTR_MTIME|ATTR_CTIME);
- mutex_unlock(&inode->ei_update_lock);
+ if (!insert) {
+ i_size_write(&inode->v, new_size);
+ mutex_lock(&inode->ei_update_lock);
+ ret = bch2_write_inode_size(c, inode, new_size,
+ ATTR_MTIME|ATTR_CTIME);
+ mutex_unlock(&inode->ei_update_lock);
+ }
err:
bch2_trans_exit(&trans);
pagecache_block_put(&mapping->add_lock);
@@ -2515,8 +2854,9 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
struct btree_trans trans;
struct btree_iter *iter;
struct bpos end_pos;
- loff_t block_start, block_end;
- loff_t end = offset + len;
+ loff_t end = offset + len;
+ loff_t block_start = round_down(offset, block_bytes(c));
+ loff_t block_end = round_up(end, block_bytes(c));
unsigned sectors;
unsigned replicas = io_opts(c, inode).data_replicas;
int ret;
@@ -2548,12 +2888,6 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
goto err;
truncate_pagecache_range(&inode->v, offset, end - 1);
-
- block_start = round_up(offset, PAGE_SIZE);
- block_end = round_down(end, PAGE_SIZE);
- } else {
- block_start = round_down(offset, PAGE_SIZE);
- block_end = round_up(end, PAGE_SIZE);
}
iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
@@ -2613,6 +2947,8 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
reservation.v.nr_replicas = disk_res.nr_replicas;
}
+ bch2_trans_begin_updates(&trans);
+
ret = bch2_extent_update(&trans, inode,
&disk_res, &quota_res,
iter, &reservation.k_i,
@@ -2671,43 +3007,157 @@ long bch2_fallocate_dispatch(struct file *file, int mode,
if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE))
return bch2_fpunch(inode, offset, len);
+ if (mode == FALLOC_FL_INSERT_RANGE)
+ return bch2_fcollapse_finsert(inode, offset, len, true);
+
if (mode == FALLOC_FL_COLLAPSE_RANGE)
- return bch2_fcollapse(inode, offset, len);
+ return bch2_fcollapse_finsert(inode, offset, len, false);
return -EOPNOTSUPP;
}
+static void mark_range_unallocated(struct bch_inode_info *inode,
+ loff_t start, loff_t end)
+{
+ pgoff_t index = start >> PAGE_SHIFT;
+ pgoff_t end_index = (end - 1) >> PAGE_SHIFT;
+ struct pagevec pvec;
+
+ pagevec_init(&pvec);
+
+ do {
+ unsigned nr_pages, i, j;
+
+ nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping,
+ &index, end_index);
+ if (nr_pages == 0)
+ break;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+ struct bch_page_state *s;
+
+ lock_page(page);
+ s = bch2_page_state(page);
+
+ if (s)
+ for (j = 0; j < PAGE_SECTORS; j++)
+ s->s[j].nr_replicas = 0;
+
+ unlock_page(page);
+ }
+ pagevec_release(&pvec);
+ } while (index <= end_index);
+}
+
+loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
+ struct file *file_dst, loff_t pos_dst,
+ loff_t len, unsigned remap_flags)
+{
+ struct bch_inode_info *src = file_bch_inode(file_src);
+ struct bch_inode_info *dst = file_bch_inode(file_dst);
+ struct bch_fs *c = src->v.i_sb->s_fs_info;
+ loff_t ret = 0;
+ loff_t aligned_len;
+
+ if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY))
+ return -EINVAL;
+
+ if (remap_flags & REMAP_FILE_DEDUP)
+ return -EOPNOTSUPP;
+
+ if ((pos_src & (block_bytes(c) - 1)) ||
+ (pos_dst & (block_bytes(c) - 1)))
+ return -EINVAL;
+
+ if (src == dst &&
+ abs(pos_src - pos_dst) < len)
+ return -EINVAL;
+
+ bch2_lock_inodes(INODE_LOCK, src, dst);
+
+ inode_dio_wait(&src->v);
+ inode_dio_wait(&dst->v);
+
+ __pagecache_block_get(&src->v.i_mapping->add_lock);
+ __pagecache_block_get(&dst->v.i_mapping->add_lock);
+
+ ret = generic_remap_file_range_prep(file_src, pos_src,
+ file_dst, pos_dst,
+ &len, remap_flags);
+ if (ret < 0 || len == 0)
+ goto out_unlock;
+
+ aligned_len = round_up(len, block_bytes(c));
+
+ ret = write_invalidate_inode_pages_range(dst->v.i_mapping,
+ pos_dst, pos_dst + aligned_len);
+ if (ret)
+ goto out_unlock;
+
+ mark_range_unallocated(src, pos_src, pos_src + aligned_len);
+
+ ret = bch2_remap_range(c, dst,
+ POS(dst->v.i_ino, pos_dst >> 9),
+ POS(src->v.i_ino, pos_src >> 9),
+ aligned_len >> 9,
+ pos_dst + len);
+ if (ret > 0)
+ ret = min(ret << 9, len);
+
+out_unlock:
+ __pagecache_block_put(&dst->v.i_mapping->add_lock);
+ __pagecache_block_put(&src->v.i_mapping->add_lock);
+
+ bch2_unlock_inodes(INODE_LOCK, src, dst);
+
+ return ret;
+}
+
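Before doing any work, bch2_remap_file_range() rejects offsets that aren't block aligned and source/destination ranges that overlap within the same file. A small sketch of those up-front checks, with a made-up block size standing in for block_bytes(c):

#include <stdio.h>
#include <stdlib.h>

static int remap_args_valid(long long pos_src, long long pos_dst,
			    long long len, int same_file,
			    unsigned block_bytes)
{
	if ((pos_src & (block_bytes - 1)) ||
	    (pos_dst & (block_bytes - 1)))
		return 0;	/* both offsets must be block aligned */

	if (same_file && llabs(pos_src - pos_dst) < len)
		return 0;	/* source and destination would overlap */

	return 1;
}

int main(void)
{
	printf("%d\n", remap_args_valid(0, 8192, 4096, 0, 4096));  /* 1: ok */
	printf("%d\n", remap_args_valid(0, 100,  4096, 0, 4096));  /* 0: misaligned dst */
	printf("%d\n", remap_args_valid(0, 2048, 4096, 1, 4096));  /* 0: overlap in same file */
	return 0;
}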
/* fseek: */
-static bool page_is_data(struct page *page)
+static int page_data_offset(struct page *page, unsigned offset)
{
- EBUG_ON(!PageLocked(page));
+ struct bch_page_state *s = bch2_page_state(page);
+ unsigned i;
+
+ if (s)
+ for (i = offset >> 9; i < PAGE_SECTORS; i++)
+ if (s->s[i].state >= SECTOR_DIRTY)
+ return i << 9;
- /* XXX: should only have to check PageDirty */
- return PagePrivate(page) &&
- (page_state(page)->sectors ||
- page_state(page)->dirty_sectors);
+ return -1;
}
-static loff_t bch2_next_pagecache_data(struct inode *vinode,
+static loff_t bch2_seek_pagecache_data(struct inode *vinode,
loff_t start_offset,
loff_t end_offset)
{
struct address_space *mapping = vinode->i_mapping;
struct page *page;
- pgoff_t index;
-
- for (index = start_offset >> PAGE_SHIFT;
- index < end_offset >> PAGE_SHIFT;
- index++) {
- if (find_get_pages(mapping, &index, 1, &page)) {
+ pgoff_t start_index = start_offset >> PAGE_SHIFT;
+ pgoff_t end_index = end_offset >> PAGE_SHIFT;
+ pgoff_t index = start_index;
+ loff_t ret;
+ int offset;
+
+ while (index <= end_index) {
+ if (find_get_pages_range(mapping, &index, end_index, 1, &page)) {
lock_page(page);
- if (page_is_data(page))
- end_offset =
- min(end_offset,
- max(start_offset,
- ((loff_t) index) << PAGE_SHIFT));
+ offset = page_data_offset(page,
+ page->index == start_index
+ ? start_offset & (PAGE_SIZE - 1)
+ : 0);
+ if (offset >= 0) {
+ ret = clamp(((loff_t) page->index << PAGE_SHIFT) +
+ offset,
+ start_offset, end_offset);
+ unlock_page(page);
+ put_page(page);
+ return ret;
+ }
+
unlock_page(page);
put_page(page);
} else {
@@ -2750,43 +3200,65 @@ static loff_t bch2_seek_data(struct file *file, u64 offset)
return ret;
if (next_data > offset)
- next_data = bch2_next_pagecache_data(&inode->v,
+ next_data = bch2_seek_pagecache_data(&inode->v,
offset, next_data);
- if (next_data > isize)
+ if (next_data >= isize)
return -ENXIO;
return vfs_setpos(file, next_data, MAX_LFS_FILESIZE);
}
-static bool page_slot_is_data(struct address_space *mapping, pgoff_t index)
+static int __page_hole_offset(struct page *page, unsigned offset)
{
+ struct bch_page_state *s = bch2_page_state(page);
+ unsigned i;
+
+ if (!s)
+ return 0;
+
+ for (i = offset >> 9; i < PAGE_SECTORS; i++)
+ if (s->s[i].state < SECTOR_DIRTY)
+ return i << 9;
+
+ return -1;
+}
+
+static loff_t page_hole_offset(struct address_space *mapping, loff_t offset)
+{
+ pgoff_t index = offset >> PAGE_SHIFT;
struct page *page;
- bool ret;
+ int pg_offset;
+ loff_t ret = -1;
page = find_lock_entry(mapping, index);
- if (!page || radix_tree_exception(page))
- return false;
+ if (!page || xa_is_value(page))
+ return offset;
+
+ pg_offset = __page_hole_offset(page, offset & (PAGE_SIZE - 1));
+ if (pg_offset >= 0)
+ ret = ((loff_t) index << PAGE_SHIFT) + pg_offset;
- ret = page_is_data(page);
unlock_page(page);
return ret;
}
-static loff_t bch2_next_pagecache_hole(struct inode *vinode,
+static loff_t bch2_seek_pagecache_hole(struct inode *vinode,
loff_t start_offset,
loff_t end_offset)
{
struct address_space *mapping = vinode->i_mapping;
- pgoff_t index;
+ loff_t offset = start_offset, hole;
- for (index = start_offset >> PAGE_SHIFT;
- index < end_offset >> PAGE_SHIFT;
- index++)
- if (!page_slot_is_data(mapping, index))
- end_offset = max(start_offset,
- ((loff_t) index) << PAGE_SHIFT);
+ while (offset < end_offset) {
+ hole = page_hole_offset(mapping, offset);
+ if (hole >= 0 && hole <= end_offset)
+ return max(start_offset, hole);
+
+ offset += PAGE_SIZE;
+ offset &= PAGE_MASK;
+ }
return end_offset;
}
@@ -2811,11 +3283,11 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset)
POS(inode->v.i_ino, offset >> 9),
BTREE_ITER_SLOTS, k, ret) {
if (k.k->p.inode != inode->v.i_ino) {
- next_hole = bch2_next_pagecache_hole(&inode->v,
+ next_hole = bch2_seek_pagecache_hole(&inode->v,
offset, MAX_LFS_FILESIZE);
break;
} else if (!bkey_extent_is_data(k.k)) {
- next_hole = bch2_next_pagecache_hole(&inode->v,
+ next_hole = bch2_seek_pagecache_hole(&inode->v,
max(offset, bkey_start_offset(k.k) << 9),
k.k->p.offset << 9);
diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h
index 88060b8785c3..a35732327e91 100644
--- a/fs/bcachefs/fs-io.h
+++ b/fs/bcachefs/fs-io.h
@@ -9,7 +9,21 @@
#include <linux/uio.h>
-int bch2_set_page_dirty(struct page *);
+struct quota_res;
+
+int bch2_extent_update(struct btree_trans *,
+ struct bch_inode_info *,
+ struct disk_reservation *,
+ struct quota_res *,
+ struct btree_iter *,
+ struct bkey_i *,
+ u64, bool, bool, s64 *);
+int bch2_fpunch_at(struct btree_trans *, struct btree_iter *,
+ struct bpos, struct bch_inode_info *, u64);
+
+int __must_check bch2_write_inode_size(struct bch_fs *,
+ struct bch_inode_info *,
+ loff_t, unsigned);
int bch2_writepage(struct page *, struct writeback_control *);
int bch2_readpage(struct file *, struct page *);
@@ -32,6 +46,9 @@ int bch2_fsync(struct file *, loff_t, loff_t, int);
int bch2_truncate(struct bch_inode_info *, struct iattr *);
long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t);
+loff_t bch2_remap_file_range(struct file *, loff_t, struct file *,
+ loff_t, loff_t, unsigned);
+
loff_t bch2_llseek(struct file *, loff_t, int);
vm_fault_t bch2_page_mkwrite(struct vm_fault *);
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 615b0be8b468..16017079157f 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -1068,16 +1068,20 @@ static int bch2_tmpfile(struct inode *vdir, struct dentry *dentry, umode_t mode)
return 0;
}
-static int bch2_fill_extent(struct fiemap_extent_info *info,
- const struct bkey_i *k, unsigned flags)
+static int bch2_fill_extent(struct bch_fs *c,
+ struct fiemap_extent_info *info,
+ struct bkey_s_c k, unsigned flags)
{
- if (bkey_extent_is_data(&k->k)) {
- struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
+ if (bkey_extent_is_data(k.k)) {
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
int ret;
- extent_for_each_ptr_decode(e, p, entry) {
+ if (k.k->type == KEY_TYPE_reflink_v)
+ flags |= FIEMAP_EXTENT_SHARED;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
int flags2 = 0;
u64 offset = p.ptr.offset;
@@ -1086,23 +1090,23 @@ static int bch2_fill_extent(struct fiemap_extent_info *info,
else
offset += p.crc.offset;
- if ((offset & (PAGE_SECTORS - 1)) ||
- (e.k->size & (PAGE_SECTORS - 1)))
+ if ((offset & (c->opts.block_size - 1)) ||
+ (k.k->size & (c->opts.block_size - 1)))
flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;
ret = fiemap_fill_next_extent(info,
- bkey_start_offset(e.k) << 9,
+ bkey_start_offset(k.k) << 9,
offset << 9,
- e.k->size << 9, flags|flags2);
+ k.k->size << 9, flags|flags2);
if (ret)
return ret;
}
return 0;
- } else if (k->k.type == KEY_TYPE_reservation) {
+ } else if (k.k->type == KEY_TYPE_reservation) {
return fiemap_fill_next_extent(info,
- bkey_start_offset(&k->k) << 9,
- 0, k->k.size << 9,
+ bkey_start_offset(k.k) << 9,
+ 0, k.k->size << 9,
flags|
FIEMAP_EXTENT_DELALLOC|
FIEMAP_EXTENT_UNWRITTEN);
@@ -1119,7 +1123,9 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
- BKEY_PADDED(k) tmp;
+ BKEY_PADDED(k) cur, prev;
+ struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
+ unsigned offset_into_extent, sectors;
bool have_extent = false;
int ret = 0;
@@ -1128,26 +1134,63 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
bch2_trans_init(&trans, c, 0, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
- POS(ei->v.i_ino, start >> 9), 0, k, ret)
- if (bkey_extent_is_data(k.k) ||
- k.k->type == KEY_TYPE_reservation) {
- if (bkey_cmp(bkey_start_pos(k.k),
- POS(ei->v.i_ino, (start + len) >> 9)) >= 0)
- break;
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+ POS(ei->v.i_ino, start >> 9), 0);
+retry:
+ while ((k = bch2_btree_iter_peek(iter)).k &&
+ !(ret = bkey_err(k)) &&
+ bkey_cmp(iter->pos, end) < 0) {
+ if (!bkey_extent_is_data(k.k) &&
+ k.k->type != KEY_TYPE_reservation) {
+ bch2_btree_iter_next(iter);
+ continue;
+ }
- if (have_extent) {
- ret = bch2_fill_extent(info, &tmp.k, 0);
- if (ret)
- break;
- }
+ bkey_reassemble(&cur.k, k);
+ k = bkey_i_to_s_c(&cur.k);
+
+ offset_into_extent = iter->pos.offset -
+ bkey_start_offset(k.k);
+ sectors = k.k->size - offset_into_extent;
- bkey_reassemble(&tmp.k, k);
- have_extent = true;
+ ret = bch2_read_indirect_extent(&trans,
+ &offset_into_extent, &cur.k);
+ if (ret)
+ break;
+
+ sectors = min(sectors, k.k->size - offset_into_extent);
+
+ if (offset_into_extent)
+ bch2_cut_front(POS(k.k->p.inode,
+ bkey_start_offset(k.k) +
+ offset_into_extent),
+ &cur.k);
+ bch2_key_resize(&cur.k.k, sectors);
+ cur.k.k.p = iter->pos;
+ cur.k.k.p.offset += cur.k.k.size;
+
+ if (have_extent) {
+ ret = bch2_fill_extent(c, info,
+ bkey_i_to_s_c(&prev.k), 0);
+ if (ret)
+ break;
}
+ bkey_copy(&prev.k, &cur.k);
+ have_extent = true;
+
+ if (k.k->type == KEY_TYPE_reflink_v)
+ bch2_btree_iter_set_pos(iter, k.k->p);
+ else
+ bch2_btree_iter_next(iter);
+ }
+
+ if (ret == -EINTR)
+ goto retry;
+
if (!ret && have_extent)
- ret = bch2_fill_extent(info, &tmp.k, FIEMAP_EXTENT_LAST);
+ ret = bch2_fill_extent(c, info, bkey_i_to_s_c(&prev.k),
+ FIEMAP_EXTENT_LAST);
ret = bch2_trans_exit(&trans) ?: ret;
return ret < 0 ? ret : 0;
@@ -1196,6 +1239,7 @@ static const struct file_operations bch_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = bch2_compat_fs_ioctl,
#endif
+ .remap_file_range = bch2_remap_file_range,
};
static const struct inode_operations bch_file_inode_operations = {
@@ -1266,7 +1310,7 @@ static const struct address_space_operations bch_address_space_operations = {
.readpage = bch2_readpage,
.writepages = bch2_writepages,
.readpages = bch2_readpages,
- .set_page_dirty = bch2_set_page_dirty,
+ .set_page_dirty = __set_page_dirty_nobuffers,
.write_begin = bch2_write_begin,
.write_end = bch2_write_end,
.invalidatepage = bch2_invalidatepage,
@@ -1412,12 +1456,6 @@ static int bch2_vfs_write_inode(struct inode *vinode,
ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
mutex_unlock(&inode->ei_update_lock);
- if (c->opts.journal_flush_disabled)
- return ret;
-
- if (!ret && wbc->sync_mode == WB_SYNC_ALL)
- ret = bch2_journal_flush_seq(&c->journal, inode->ei_journal_seq);
-
return ret;
}
@@ -1474,6 +1512,9 @@ static int bch2_sync_fs(struct super_block *sb, int wait)
{
struct bch_fs *c = sb->s_fs_info;
+ if (c->opts.journal_flush_disabled)
+ return 0;
+
if (!wait) {
bch2_journal_flush_async(&c->journal, NULL);
return 0;
@@ -1712,9 +1753,8 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
goto out;
}
- /* XXX: blocksize */
- sb->s_blocksize = PAGE_SIZE;
- sb->s_blocksize_bits = PAGE_SHIFT;
+ sb->s_blocksize = block_bytes(c);
+ sb->s_blocksize_bits = ilog2(block_bytes(c));
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_op = &bch_super_operations;
sb->s_export_op = &bch_export_ops;
@@ -1734,7 +1774,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
sb->s_bdi->congested_fn = bch2_congested;
sb->s_bdi->congested_data = c;
- sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
for_each_online_member(ca, c, i) {
struct block_device *bdev = ca->disk_sb.bdev;
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index e3738757b6a0..50a7d8c1faba 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -509,7 +509,7 @@ retry:
if (fsck_err_on(w.have_inode &&
!(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
k.k->type != KEY_TYPE_reservation &&
- k.k->p.offset > round_up(w.inode.bi_size, PAGE_SIZE) >> 9, c,
+ k.k->p.offset > round_up(w.inode.bi_size, block_bytes(c)) >> 9, c,
"extent type %u offset %llu past end of inode %llu, i_size %llu",
k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) {
bch2_trans_unlock(&trans);
diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c
index 841261b79f43..a9eda1b92b01 100644
--- a/fs/bcachefs/io.c
+++ b/fs/bcachefs/io.c
@@ -122,23 +122,23 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
{
+ struct bvec_iter_all iter;
struct bio_vec *bv;
unsigned i;
- bio_for_each_segment_all(bv, bio, i)
+ bio_for_each_segment_all(bv, bio, i, iter)
if (bv->bv_page != ZERO_PAGE(0))
mempool_free(bv->bv_page, &c->bio_bounce_pages);
bio->bi_vcnt = 0;
}
-static void bch2_bio_alloc_page_pool(struct bch_fs *c, struct bio *bio,
- bool *using_mempool)
+static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool)
{
- struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt++];
+ struct page *page;
if (likely(!*using_mempool)) {
- bv->bv_page = alloc_page(GFP_NOIO);
- if (unlikely(!bv->bv_page)) {
+ page = alloc_page(GFP_NOIO);
+ if (unlikely(!page)) {
mutex_lock(&c->bio_bounce_pages_lock);
*using_mempool = true;
goto pool_alloc;
@@ -146,57 +146,29 @@ static void bch2_bio_alloc_page_pool(struct bch_fs *c, struct bio *bio,
}
} else {
pool_alloc:
- bv->bv_page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO);
+ page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO);
}
- bv->bv_len = PAGE_SIZE;
- bv->bv_offset = 0;
+ return page;
}
void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
- size_t bytes)
+ size_t size)
{
bool using_mempool = false;
- BUG_ON(DIV_ROUND_UP(bytes, PAGE_SIZE) > bio->bi_max_vecs);
+ while (size) {
+ struct page *page = __bio_alloc_page_pool(c, &using_mempool);
+ unsigned len = min(PAGE_SIZE, size);
- bio->bi_iter.bi_size = bytes;
-
- while (bio->bi_vcnt < DIV_ROUND_UP(bytes, PAGE_SIZE))
- bch2_bio_alloc_page_pool(c, bio, &using_mempool);
+ BUG_ON(!bio_add_page(bio, page, len, 0));
+ size -= len;
+ }
if (using_mempool)
mutex_unlock(&c->bio_bounce_pages_lock);
}
-void bch2_bio_alloc_more_pages_pool(struct bch_fs *c, struct bio *bio,
- size_t bytes)
-{
- while (bio->bi_vcnt < DIV_ROUND_UP(bytes, PAGE_SIZE)) {
- struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt];
-
- BUG_ON(bio->bi_vcnt >= bio->bi_max_vecs);
-
- bv->bv_page = alloc_page(GFP_NOIO);
- if (!bv->bv_page) {
- /*
- * We already allocated from mempool, we can't allocate from it again
- * without freeing the pages we already allocated or else we could
- * deadlock:
- */
- bch2_bio_free_pages_pool(c, bio);
- bch2_bio_alloc_pages_pool(c, bio, bytes);
- return;
- }
-
- bv->bv_len = PAGE_SIZE;
- bv->bv_offset = 0;
- bio->bi_vcnt++;
- }
-
- bio->bi_iter.bi_size = bytes;
-}
-
/* Writes */
void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
@@ -287,6 +259,8 @@ int bch2_write_index_default(struct bch_write_op *op)
bch2_verify_keylist_sorted(keys);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256);
+retry:
+ bch2_trans_begin(&trans);
iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
bkey_start_pos(&bch2_keylist_front(keys)->k),
@@ -297,7 +271,9 @@ int bch2_write_index_default(struct bch_write_op *op)
bkey_copy(&split.k, bch2_keylist_front(keys));
- bch2_extent_trim_atomic(&split.k, iter);
+ ret = bch2_extent_trim_atomic(&split.k, iter);
+ if (ret)
+ break;
bch2_trans_update(&trans,
BTREE_INSERT_ENTRY(iter, &split.k));
@@ -314,6 +290,11 @@ int bch2_write_index_default(struct bch_write_op *op)
bch2_keylist_pop_front(keys);
} while (!bch2_keylist_empty(keys));
+ if (ret == -EINTR) {
+ ret = 0;
+ goto retry;
+ }
+
bch2_trans_exit(&trans);
return ret;
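bch2_write_index_default() wraps the whole keylist walk in a retry: label and restarts from bch2_trans_begin() whenever the commit returns -EINTR (a transaction restart, not a signal). Stripped of the btree machinery, the control flow reduces to the sketch below; the sentinel value is invented:

#include <stdio.h>

#define EINTR_SENTINEL (-4)	/* stands in for -EINTR from bch2_trans_commit */

static int attempts;

/* Pretend transaction body: asks for a restart twice, then succeeds. */
static int do_transaction(void)
{
	return ++attempts < 3 ? EINTR_SENTINEL : 0;
}

int main(void)
{
	int ret;
retry:
	/* bch2_trans_begin() would reset the transaction here */
	ret = do_transaction();
	if (ret == EINTR_SENTINEL) {
		ret = 0;
		goto retry;	/* lock restart: redo the whole batch */
	}

	printf("done after %d attempts, ret %d\n", attempts, ret);
	return ret;
}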
@@ -454,7 +435,7 @@ static void init_append_extent(struct bch_write_op *op,
p.ptr.cached = !ca->mi.durability ||
(op->flags & BCH_WRITE_CACHED) != 0;
p.ptr.offset += ca->mi.bucket_size - ob->sectors_free;
- bch2_extent_ptr_decoded_append(e, &p);
+ bch2_extent_ptr_decoded_append(&e->k_i, &p);
BUG_ON(crc.compressed_size > ob->sectors_free);
ob->sectors_free -= crc.compressed_size;
@@ -473,7 +454,10 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
struct bio *bio;
unsigned output_available =
min(wp->sectors_free << 9, src->bi_iter.bi_size);
- unsigned pages = DIV_ROUND_UP(output_available, PAGE_SIZE);
+ unsigned pages = DIV_ROUND_UP(output_available +
+ (buf
+ ? ((unsigned long) buf & (PAGE_SIZE - 1))
+ : 0), PAGE_SIZE);
bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write);
wbio = wbio_init(bio);
@@ -482,8 +466,7 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
wbio->bio.bi_opf = src->bi_opf;
if (buf) {
- bio->bi_iter.bi_size = output_available;
- bch2_bio_map(bio, buf);
+ bch2_bio_map(bio, buf, output_available);
return bio;
}
@@ -493,31 +476,17 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
* We can't use mempool for more than c->sb.encoded_extent_max
* worth of pages, but we'd like to allocate more if we can:
*/
- while (bio->bi_iter.bi_size < output_available) {
- unsigned len = min_t(unsigned, PAGE_SIZE,
- output_available - bio->bi_iter.bi_size);
- struct page *p;
-
- p = alloc_page(GFP_NOIO);
- if (!p) {
- unsigned pool_max =
- min_t(unsigned, output_available,
- c->sb.encoded_extent_max << 9);
-
- if (bio_sectors(bio) < pool_max)
- bch2_bio_alloc_pages_pool(c, bio, pool_max);
- break;
- }
+ bch2_bio_alloc_pages_pool(c, bio,
+ min_t(unsigned, output_available,
+ c->sb.encoded_extent_max << 9));
- bio->bi_io_vec[bio->bi_vcnt++] = (struct bio_vec) {
- .bv_page = p,
- .bv_len = len,
- .bv_offset = 0,
- };
- bio->bi_iter.bi_size += len;
- }
+ if (bio->bi_iter.bi_size < output_available)
+ *page_alloc_failed =
+ bch2_bio_alloc_pages(bio,
+ output_available -
+ bio->bi_iter.bi_size,
+ GFP_NOFS) != 0;
- *page_alloc_failed = bio->bi_vcnt < pages;
return bio;
}
@@ -821,12 +790,6 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
}
dst->bi_iter.bi_size = total_output;
-
- /* Free unneeded pages after compressing: */
- if (to_wbio(dst)->bounce)
- while (dst->bi_vcnt > DIV_ROUND_UP(dst->bi_iter.bi_size, PAGE_SIZE))
- mempool_free(dst->bi_io_vec[--dst->bi_vcnt].bv_page,
- &c->bio_bounce_pages);
do_write:
/* might have done a realloc... */
@@ -952,30 +915,39 @@ flush_io:
void bch2_write(struct closure *cl)
{
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+ struct bio *bio = &op->wbio.bio;
struct bch_fs *c = op->c;
BUG_ON(!op->nr_replicas);
BUG_ON(!op->write_point.v);
BUG_ON(!bkey_cmp(op->pos, POS_MAX));
+ if (bio_sectors(bio) & (c->opts.block_size - 1)) {
+ __bcache_io_error(c, "misaligned write");
+ op->error = -EIO;
+ goto err;
+ }
+
op->start_time = local_clock();
bch2_keylist_init(&op->insert_keys, op->inline_keys);
- wbio_init(&op->wbio.bio)->put_bio = false;
+ wbio_init(bio)->put_bio = false;
if (c->opts.nochanges ||
!percpu_ref_tryget(&c->writes)) {
__bcache_io_error(c, "read only");
op->error = -EROFS;
- if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
- bch2_disk_reservation_put(c, &op->res);
- closure_return(cl);
- return;
+ goto err;
}
- bch2_increment_clock(c, bio_sectors(&op->wbio.bio), WRITE);
+ bch2_increment_clock(c, bio_sectors(bio), WRITE);
continue_at_nobarrier(cl, __bch2_write, NULL);
+ return;
+err:
+ if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
+ bch2_disk_reservation_put(c, &op->res);
+ closure_return(cl);
}
/* Cache promotion on read */
@@ -1003,17 +975,13 @@ static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k,
struct bch_io_opts opts,
unsigned flags)
{
- if (!bkey_extent_is_data(k.k))
- return false;
-
if (!(flags & BCH_READ_MAY_PROMOTE))
return false;
if (!opts.promote_target)
return false;
- if (bch2_extent_has_target(c, bkey_s_c_to_extent(k),
- opts.promote_target))
+ if (bch2_bkey_has_target(c, k, opts.promote_target))
return false;
if (bch2_target_congested(c, opts.promote_target)) {
@@ -1077,25 +1045,22 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
noinline
static struct promote_op *__promote_alloc(struct bch_fs *c,
+ enum btree_id btree_id,
struct bpos pos,
struct extent_ptr_decoded *pick,
struct bch_io_opts opts,
- unsigned rbio_sectors,
+ unsigned sectors,
struct bch_read_bio **rbio)
{
struct promote_op *op = NULL;
struct bio *bio;
- unsigned rbio_pages = DIV_ROUND_UP(rbio_sectors, PAGE_SECTORS);
- /* data might have to be decompressed in the write path: */
- unsigned wbio_pages = DIV_ROUND_UP(pick->crc.uncompressed_size,
- PAGE_SECTORS);
+ unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
int ret;
if (!percpu_ref_tryget(&c->writes))
return NULL;
- op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * wbio_pages,
- GFP_NOIO);
+ op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO);
if (!op)
goto err;
@@ -1103,37 +1068,32 @@ static struct promote_op *__promote_alloc(struct bch_fs *c,
op->pos = pos;
/*
- * promotes require bouncing, but if the extent isn't
- * checksummed/compressed it might be too big for the mempool:
+ * We don't use the mempool here because extents that aren't
+ * checksummed or compressed can be too big for the mempool:
*/
- if (rbio_sectors > c->sb.encoded_extent_max) {
- *rbio = kzalloc(sizeof(struct bch_read_bio) +
- sizeof(struct bio_vec) * rbio_pages,
- GFP_NOIO);
- if (!*rbio)
- goto err;
-
- rbio_init(&(*rbio)->bio, opts);
- bio_init(&(*rbio)->bio, (*rbio)->bio.bi_inline_vecs,
- rbio_pages);
+ *rbio = kzalloc(sizeof(struct bch_read_bio) +
+ sizeof(struct bio_vec) * pages,
+ GFP_NOIO);
+ if (!*rbio)
+ goto err;
- (*rbio)->bio.bi_iter.bi_size = rbio_sectors << 9;
- bch2_bio_map(&(*rbio)->bio, NULL);
+ rbio_init(&(*rbio)->bio, opts);
+ bio_init(&(*rbio)->bio, (*rbio)->bio.bi_inline_vecs, pages);
- if (bch2_bio_alloc_pages(&(*rbio)->bio, GFP_NOIO))
- goto err;
+ if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9,
+ GFP_NOIO))
+ goto err;
- (*rbio)->bounce = true;
- (*rbio)->split = true;
- (*rbio)->kmalloc = true;
- }
+ (*rbio)->bounce = true;
+ (*rbio)->split = true;
+ (*rbio)->kmalloc = true;
if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
bch_promote_params))
goto err;
bio = &op->write.op.wbio.bio;
- bio_init(bio, bio->bi_inline_vecs, wbio_pages);
+ bio_init(bio, bio->bi_inline_vecs, pages);
ret = bch2_migrate_write_init(c, &op->write,
writepoint_hashed((unsigned long) current),
@@ -1142,6 +1102,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c,
(struct data_opts) {
.target = opts.promote_target
},
+ btree_id,
bkey_s_c_null);
BUG_ON(ret);
@@ -1167,8 +1128,9 @@ static inline struct promote_op *promote_alloc(struct bch_fs *c,
bool *read_full)
{
bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
+ /* data might have to be decompressed in the write path: */
unsigned sectors = promote_full
- ? pick->crc.compressed_size
+ ? max(pick->crc.compressed_size, pick->crc.live_size)
: bvec_iter_sectors(iter);
struct bpos pos = promote_full
? bkey_start_pos(k.k)
@@ -1178,7 +1140,11 @@ static inline struct promote_op *promote_alloc(struct bch_fs *c,
if (!should_promote(c, k, pos, opts, flags))
return NULL;
- promote = __promote_alloc(c, pos, pick, opts, sectors, rbio);
+ promote = __promote_alloc(c,
+ k.k->type == KEY_TYPE_reflink_v
+ ? BTREE_ID_REFLINK
+ : BTREE_ID_EXTENTS,
+ pos, pick, opts, sectors, rbio);
if (!promote)
return NULL;
@@ -1244,10 +1210,15 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
return rbio;
}
+/*
+ * Only called on a top-level bch_read_bio to complete an entire read request,
+ * not a split:
+ */
static void bch2_rbio_done(struct bch_read_bio *rbio)
{
- bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
- rbio->start_time);
+ if (rbio->start_time)
+ bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
+ rbio->start_time);
bio_endio(&rbio->bio);
}
@@ -1279,17 +1250,16 @@ retry:
k = bkey_i_to_s_c(&tmp.k);
bch2_trans_unlock(&trans);
- if (!bkey_extent_is_data(k.k) ||
- !bch2_extent_matches_ptr(c, bkey_i_to_s_c_extent(&tmp.k),
- rbio->pick.ptr,
- rbio->pos.offset -
- rbio->pick.crc.offset)) {
+ if (!bch2_bkey_matches_ptr(c, bkey_i_to_s_c(&tmp.k),
+ rbio->pick.ptr,
+ rbio->pos.offset -
+ rbio->pick.crc.offset)) {
/* extent we wanted to read no longer exists: */
rbio->hole = true;
goto out;
}
- ret = __bch2_read_extent(c, rbio, bvec_iter, k, failed, flags);
+ ret = __bch2_read_extent(c, rbio, bvec_iter, k, 0, failed, flags);
if (ret == READ_RETRY)
goto retry;
if (ret)
@@ -1312,26 +1282,40 @@ static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio,
struct bkey_s_c k;
int ret;
- bch2_trans_init(&trans, c, 0, 0);
-
flags &= ~BCH_READ_LAST_FRAGMENT;
flags |= BCH_READ_MUST_CLONE;
+
+ bch2_trans_init(&trans, c, 0, 0);
retry:
+ bch2_trans_begin(&trans);
+
for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
POS(inode, bvec_iter.bi_sector),
BTREE_ITER_SLOTS, k, ret) {
BKEY_PADDED(k) tmp;
- unsigned bytes;
+ unsigned bytes, sectors, offset_into_extent;
bkey_reassemble(&tmp.k, k);
k = bkey_i_to_s_c(&tmp.k);
+
+ offset_into_extent = iter->pos.offset -
+ bkey_start_offset(k.k);
+ sectors = k.k->size - offset_into_extent;
+
+ ret = bch2_read_indirect_extent(&trans,
+ &offset_into_extent, &tmp.k);
+ if (ret)
+ break;
+
+ sectors = min(sectors, k.k->size - offset_into_extent);
+
bch2_trans_unlock(&trans);
- bytes = min_t(unsigned, bvec_iter.bi_size,
- (k.k->p.offset - bvec_iter.bi_sector) << 9);
+ bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
swap(bvec_iter.bi_size, bytes);
- ret = __bch2_read_extent(c, rbio, bvec_iter, k, failed, flags);
+ ret = __bch2_read_extent(c, rbio, bvec_iter, k,
+ offset_into_extent, failed, flags);
switch (ret) {
case READ_RETRY:
goto retry;
@@ -1412,7 +1396,6 @@ static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
- struct bkey_i_extent *e;
BKEY_PADDED(k) new;
struct bch_extent_crc_unpacked new_crc;
u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset;
@@ -1431,34 +1414,30 @@ retry:
if (IS_ERR_OR_NULL(k.k))
goto out;
- if (!bkey_extent_is_data(k.k))
- goto out;
-
bkey_reassemble(&new.k, k);
- e = bkey_i_to_extent(&new.k);
+ k = bkey_i_to_s_c(&new.k);
- if (!bch2_extent_matches_ptr(c, extent_i_to_s_c(e),
- rbio->pick.ptr, data_offset) ||
- bversion_cmp(e->k.version, rbio->version))
+ if (bversion_cmp(k.k->version, rbio->version) ||
+ !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
goto out;
/* Extent was merged? */
- if (bkey_start_offset(&e->k) < data_offset ||
- e->k.p.offset > data_offset + rbio->pick.crc.uncompressed_size)
+ if (bkey_start_offset(k.k) < data_offset ||
+ k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
goto out;
if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
rbio->pick.crc, NULL, &new_crc,
- bkey_start_offset(&e->k) - data_offset, e->k.size,
+ bkey_start_offset(k.k) - data_offset, k.k->size,
rbio->pick.crc.csum_type)) {
bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
goto out;
}
- if (!bch2_extent_narrow_crcs(e, new_crc))
+ if (!bch2_bkey_narrow_crcs(&new.k, new_crc))
goto out;
- bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &e->k_i));
+ bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &new.k));
ret = bch2_trans_commit(&trans, NULL, NULL,
BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOFAIL|
@@ -1469,15 +1448,6 @@ out:
bch2_trans_exit(&trans);
}
-static bool should_narrow_crcs(struct bkey_s_c k,
- struct extent_ptr_decoded *pick,
- unsigned flags)
-{
- return !(flags & BCH_READ_IN_RETRY) &&
- bkey_extent_is_data(k.k) &&
- bch2_can_narrow_extent_crcs(bkey_s_c_to_extent(k), pick->crc);
-}
-
/* Inner part that may run in process context */
static void __bch2_read_endio(struct work_struct *work)
{
@@ -1512,7 +1482,7 @@ static void __bch2_read_endio(struct work_struct *work)
goto nodecode;
/* Adjust crc to point to subset of data we want: */
- crc.offset += rbio->bvec_iter.bi_sector - rbio->pos.offset;
+ crc.offset += rbio->offset_into_extent;
crc.live_size = bvec_iter_sectors(rbio->bvec_iter);
if (crc.compression_type != BCH_COMPRESSION_NONE) {
@@ -1621,8 +1591,47 @@ static void bch2_read_endio(struct bio *bio)
bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
}
+int __bch2_read_indirect_extent(struct btree_trans *trans,
+ unsigned *offset_into_extent,
+ struct bkey_i *orig_k)
+{
+ struct btree_iter *iter;
+ struct bkey_s_c k;
+ u64 reflink_offset;
+ int ret;
+
+ reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k)->v.idx) +
+ *offset_into_extent;
+
+ iter = __bch2_trans_get_iter(trans, BTREE_ID_REFLINK,
+ POS(0, reflink_offset),
+ BTREE_ITER_SLOTS, 1);
+ ret = PTR_ERR_OR_ZERO(iter);
+ if (ret)
+ return ret;
+
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_reflink_v) {
+ __bcache_io_error(trans->c,
+ "pointer to nonexistent indirect extent");
+ ret = -EIO;
+ goto err;
+ }
+
+ *offset_into_extent = iter->pos.offset - bkey_start_offset(k.k);
+ bkey_reassemble(orig_k, k);
+err:
+ bch2_trans_iter_put(trans, iter);
+ return ret;
+}
+
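A reflink_p key stores an index into the reflink btree; __bch2_read_indirect_extent() adds the reader's offset within the original extent, looks up the reflink_v covering that index, and recomputes offset_into_extent relative to the indirect extent's start. A toy model of that indirection, with invented numbers and structs:

#include <stdio.h>

struct reflink_v {
	unsigned long long start;	/* bkey_start_offset() */
	unsigned long long end;		/* k.k->p.offset */
};

int main(void)
{
	struct reflink_v indirect[] = {
		{ .start =   0, .end = 128 },
		{ .start = 128, .end = 512 },
	};
	unsigned long long reflink_p_idx = 100;	/* idx from the reflink_p key */
	unsigned offset_into_extent = 60;	/* reader's offset in the original extent */
	unsigned long long reflink_offset = reflink_p_idx + offset_into_extent;
	unsigned i;

	for (i = 0; i < 2; i++) {
		if (reflink_offset >= indirect[i].start &&
		    reflink_offset <  indirect[i].end) {
			/* same adjustment __bch2_read_indirect_extent() makes */
			offset_into_extent = reflink_offset - indirect[i].start;
			printf("read indirect extent %u at offset %u\n",
			       i, offset_into_extent);
			break;
		}
	}
	return 0;
}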
int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
struct bvec_iter iter, struct bkey_s_c k,
+ unsigned offset_into_extent,
struct bch_io_failures *failed, unsigned flags)
{
struct extent_ptr_decoded pick;
@@ -1655,7 +1664,6 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS)
goto hole;
- iter.bi_sector = pos.offset;
iter.bi_size = pick.crc.compressed_size << 9;
goto noclone;
}
@@ -1664,13 +1672,13 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
bio_flagged(&orig->bio, BIO_CHAIN))
flags |= BCH_READ_MUST_CLONE;
- narrow_crcs = should_narrow_crcs(k, &pick, flags);
+ narrow_crcs = !(flags & BCH_READ_IN_RETRY) &&
+ bch2_can_narrow_extent_crcs(k, pick.crc);
if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
flags |= BCH_READ_MUST_BOUNCE;
- EBUG_ON(bkey_start_offset(k.k) > iter.bi_sector ||
- k.k->p.offset < bvec_iter_end_sector(iter));
+ BUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
if (pick.crc.compression_type != BCH_COMPRESSION_NONE ||
(pick.crc.csum_type != BCH_CSUM_NONE &&
@@ -1691,19 +1699,30 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
(bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
bvec_iter_sectors(iter) != pick.crc.live_size ||
pick.crc.offset ||
- iter.bi_sector != pos.offset));
+ offset_into_extent));
+ pos.offset += offset_into_extent;
pick.ptr.offset += pick.crc.offset +
- (iter.bi_sector - pos.offset);
+ offset_into_extent;
+ offset_into_extent = 0;
pick.crc.compressed_size = bvec_iter_sectors(iter);
pick.crc.uncompressed_size = bvec_iter_sectors(iter);
pick.crc.offset = 0;
pick.crc.live_size = bvec_iter_sectors(iter);
- pos.offset = iter.bi_sector;
+ offset_into_extent = 0;
}
if (rbio) {
- /* promote already allocated bounce rbio */
+ /*
+ * promote already allocated bounce rbio:
+ * promote needs to allocate a bio big enough for uncompressing
+ * data in the write path, but we're not going to use it all
+ * here:
+ */
+ BUG_ON(rbio->bio.bi_iter.bi_size <
+ pick.crc.compressed_size << 9);
+ rbio->bio.bi_iter.bi_size =
+ pick.crc.compressed_size << 9;
} else if (bounce) {
unsigned sectors = pick.crc.compressed_size;
@@ -1745,6 +1764,7 @@ noclone:
else
rbio->end_io = orig->bio.bi_end_io;
rbio->bvec_iter = iter;
+	rbio->offset_into_extent = offset_into_extent;
rbio->flags = flags;
rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ);
rbio->narrow_crcs = narrow_crcs;
@@ -1863,45 +1883,67 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode)
rbio->c = c;
rbio->start_time = local_clock();
- for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
- POS(inode, rbio->bio.bi_iter.bi_sector),
- BTREE_ITER_SLOTS, k, ret) {
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+ POS(inode, rbio->bio.bi_iter.bi_sector),
+ BTREE_ITER_SLOTS);
+
+ while (1) {
BKEY_PADDED(k) tmp;
- unsigned bytes;
+ unsigned bytes, sectors, offset_into_extent;
+
+ bch2_btree_iter_set_pos(iter,
+ POS(inode, rbio->bio.bi_iter.bi_sector));
+
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ bkey_reassemble(&tmp.k, k);
+ k = bkey_i_to_s_c(&tmp.k);
+
+ offset_into_extent = iter->pos.offset -
+ bkey_start_offset(k.k);
+ sectors = k.k->size - offset_into_extent;
+
+ ret = bch2_read_indirect_extent(&trans,
+ &offset_into_extent, &tmp.k);
+ if (ret)
+ goto err;
+
+ /*
+ * With indirect extents, the amount of data to read is the min
+ * of the original extent and the indirect extent:
+ */
+ sectors = min(sectors, k.k->size - offset_into_extent);
/*
* Unlock the iterator while the btree node's lock is still in
* cache, before doing the IO:
*/
- bkey_reassemble(&tmp.k, k);
- k = bkey_i_to_s_c(&tmp.k);
bch2_trans_unlock(&trans);
- bytes = min_t(unsigned, rbio->bio.bi_iter.bi_size,
- (k.k->p.offset - rbio->bio.bi_iter.bi_sector) << 9);
+ bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
swap(rbio->bio.bi_iter.bi_size, bytes);
if (rbio->bio.bi_iter.bi_size == bytes)
flags |= BCH_READ_LAST_FRAGMENT;
- bch2_read_extent(c, rbio, k, flags);
+ bch2_read_extent(c, rbio, k, offset_into_extent, flags);
if (flags & BCH_READ_LAST_FRAGMENT)
- return;
+ break;
swap(rbio->bio.bi_iter.bi_size, bytes);
bio_advance(&rbio->bio, bytes);
}
-
- /*
- * If we get here, it better have been because there was an error
- * reading a btree node
- */
- BUG_ON(!ret);
- bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret);
-
+out:
bch2_trans_exit(&trans);
+ return;
+err:
+ bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret);
bch2_rbio_done(rbio);
+ goto out;
}
void bch2_fs_io_exit(struct bch_fs *c)
diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h
index fe82c8b81ca5..80b72dbf1a0c 100644
--- a/fs/bcachefs/io.h
+++ b/fs/bcachefs/io.h
@@ -13,7 +13,6 @@
void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
-void bch2_bio_alloc_more_pages_pool(struct bch_fs *, struct bio *, size_t);
void bch2_latency_acct(struct bch_dev *, u64, int);
@@ -96,9 +95,17 @@ struct bch_devs_mask;
struct cache_promote_op;
struct extent_ptr_decoded;
-int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
- struct bkey_s_c, struct bch_io_failures *, unsigned);
-void bch2_read(struct bch_fs *, struct bch_read_bio *, u64);
+int __bch2_read_indirect_extent(struct btree_trans *, unsigned *,
+ struct bkey_i *);
+
+static inline int bch2_read_indirect_extent(struct btree_trans *trans,
+ unsigned *offset_into_extent,
+ struct bkey_i *k)
+{
+ return k->k.type == KEY_TYPE_reflink_p
+ ? __bch2_read_indirect_extent(trans, offset_into_extent, k)
+ : 0;
+}
enum bch_read_flags {
BCH_READ_RETRY_IF_STALE = 1 << 0,
@@ -113,14 +120,22 @@ enum bch_read_flags {
BCH_READ_IN_RETRY = 1 << 7,
};
+int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *,
+ struct bvec_iter, struct bkey_s_c, unsigned,
+ struct bch_io_failures *, unsigned);
+
static inline void bch2_read_extent(struct bch_fs *c,
struct bch_read_bio *rbio,
struct bkey_s_c k,
+ unsigned offset_into_extent,
unsigned flags)
{
- __bch2_read_extent(c, rbio, rbio->bio.bi_iter, k, NULL, flags);
+ __bch2_read_extent(c, rbio, rbio->bio.bi_iter, k,
+ offset_into_extent, NULL, flags);
}
+void bch2_read(struct bch_fs *, struct bch_read_bio *, u64);
+
static inline struct bch_read_bio *rbio_init(struct bio *bio,
struct bch_io_opts opts)
{
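For context on the read-side API above: callers are now expected to resolve indirect extents before handing a key to bch2_read_extent(). A condensed sketch of that caller pattern, mirroring the bch2_read() loop earlier in this patch (the local variable names are illustrative only, not new API):

	offset_into_extent = iter->pos.offset - bkey_start_offset(k.k);

	/* a reflink_p key is swapped for the indirect extent it points at: */
	ret = bch2_read_indirect_extent(&trans, &offset_into_extent, &tmp.k);
	if (ret)
		goto err;

	bch2_read_extent(c, rbio, bkey_i_to_s_c(&tmp.k),
			 offset_into_extent, flags);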
diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h
index 04f6d9a7c9a2..2d397e5e5b9e 100644
--- a/fs/bcachefs/io_types.h
+++ b/fs/bcachefs/io_types.h
@@ -38,6 +38,8 @@ struct bch_read_bio {
*/
struct bvec_iter bvec_iter;
+ unsigned offset_into_extent;
+
u16 flags;
union {
struct {
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index af135e263a3f..387377dadab5 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -495,9 +495,8 @@ reread:
sectors_read << 9));
bio_set_dev(bio, ca->disk_sb.bdev);
bio->bi_iter.bi_sector = offset;
- bio->bi_iter.bi_size = sectors_read << 9;
bio_set_op_attrs(bio, REQ_OP_READ, 0);
- bch2_bio_map(bio, buf->data);
+ bch2_bio_map(bio, buf->data, sectors_read << 9);
ret = submit_bio_wait(bio);
bio_put(bio);
@@ -1087,12 +1086,11 @@ void bch2_journal_write(struct closure *cl)
bio_reset(bio);
bio_set_dev(bio, ca->disk_sb.bdev);
bio->bi_iter.bi_sector = ptr->offset;
- bio->bi_iter.bi_size = sectors << 9;
bio->bi_end_io = journal_write_endio;
bio->bi_private = ca;
bio_set_op_attrs(bio, REQ_OP_WRITE,
REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
- bch2_bio_map(bio, jset);
+ bch2_bio_map(bio, jset, sectors << 9);
trace_journal_write(bio);
closure_bio_submit(bio, cl);
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
index ad41f5e36a7c..dc3b03d6e627 100644
--- a/fs/bcachefs/migrate.c
+++ b/fs/bcachefs/migrate.c
@@ -34,7 +34,8 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k,
return 0;
}
-static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags,
+ enum btree_id btree_id)
{
struct btree_trans trans;
struct btree_iter *iter;
@@ -44,13 +45,12 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
- POS_MIN, BTREE_ITER_PREFETCH);
+ iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN,
+ BTREE_ITER_PREFETCH);
while ((k = bch2_btree_iter_peek(iter)).k &&
!(ret = bkey_err(k))) {
- if (!bkey_extent_is_data(k.k) ||
- !bch2_extent_has_device(bkey_s_c_to_extent(k), dev_idx)) {
+ if (!bch2_bkey_has_device(k, dev_idx)) {
ret = bch2_mark_bkey_replicas(c, k);
if (ret)
break;
@@ -99,6 +99,12 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
return ret;
}
+static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+{
+ return __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_EXTENTS) ?:
+ __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_REFLINK);
+}
+
static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
{
struct btree_trans trans;
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 4c82b345b350..0429341ef6fb 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -64,13 +64,14 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+ iter = bch2_trans_get_iter(&trans, m->btree_id,
bkey_start_pos(&bch2_keylist_front(keys)->k),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
while (1) {
struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
- struct bkey_i_extent *insert, *new =
+ struct bkey_i *insert;
+ struct bkey_i_extent *new =
bkey_i_to_extent(bch2_keylist_front(keys));
BKEY_PADDED(k) _new, _insert;
const union bch_extent_entry *entry;
@@ -83,32 +84,29 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
break;
if (bversion_cmp(k.k->version, new->k.version) ||
- !bkey_extent_is_data(k.k) ||
- !bch2_extent_matches_ptr(c, bkey_s_c_to_extent(k),
- m->ptr, m->offset))
+ !bch2_bkey_matches_ptr(c, k, m->ptr, m->offset))
goto nomatch;
if (m->data_cmd == DATA_REWRITE &&
- !bch2_extent_has_device(bkey_s_c_to_extent(k),
- m->data_opts.rewrite_dev))
+ !bch2_bkey_has_device(k, m->data_opts.rewrite_dev))
goto nomatch;
bkey_reassemble(&_insert.k, k);
- insert = bkey_i_to_extent(&_insert.k);
+ insert = &_insert.k;
bkey_copy(&_new.k, bch2_keylist_front(keys));
new = bkey_i_to_extent(&_new.k);
- bch2_cut_front(iter->pos, &insert->k_i);
+ bch2_cut_front(iter->pos, insert);
bch2_cut_back(new->k.p, &insert->k);
bch2_cut_back(insert->k.p, &new->k);
if (m->data_cmd == DATA_REWRITE)
- bch2_bkey_drop_device(extent_i_to_s(insert).s,
+ bch2_bkey_drop_device(bkey_i_to_s(insert),
m->data_opts.rewrite_dev);
extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) {
- if (bch2_extent_has_device(extent_i_to_s_c(insert), p.ptr.dev)) {
+ if (bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev)) {
/*
* raced with another move op? extent already
* has a pointer to the device we just wrote
@@ -124,18 +122,18 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
if (!did_work)
goto nomatch;
- bch2_extent_narrow_crcs(insert,
+ bch2_bkey_narrow_crcs(insert,
(struct bch_extent_crc_unpacked) { 0 });
- bch2_extent_normalize(c, extent_i_to_s(insert).s);
- bch2_extent_mark_replicas_cached(c, extent_i_to_s(insert),
- op->opts.background_target,
- op->opts.data_replicas);
+ bch2_extent_normalize(c, bkey_i_to_s(insert));
+ bch2_bkey_mark_replicas_cached(c, bkey_i_to_s(insert),
+ op->opts.background_target,
+ op->opts.data_replicas);
/*
* If we're not fully overwriting @k, and it's compressed, we
* need a reservation for all the pointers in @insert
*/
- nr = bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&insert->k_i)) -
+ nr = bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(insert)) -
m->nr_ptrs_reserved;
if (insert->k.size < k.k->size &&
@@ -151,7 +149,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
}
bch2_trans_update(&trans,
- BTREE_INSERT_ENTRY(iter, &insert->k_i));
+ BTREE_INSERT_ENTRY(iter, insert));
ret = bch2_trans_commit(&trans, &op->res,
op_journal_seq(op),
@@ -216,10 +214,12 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
struct bch_io_opts io_opts,
enum data_cmd data_cmd,
struct data_opts data_opts,
+ enum btree_id btree_id,
struct bkey_s_c k)
{
int ret;
+ m->btree_id = btree_id;
m->data_cmd = data_cmd;
m->data_opts = data_opts;
m->nr_ptrs_reserved = 0;
@@ -267,11 +267,12 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
break;
}
case DATA_REWRITE: {
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
unsigned compressed_sectors = 0;
- extent_for_each_ptr_decode(bkey_s_c_to_extent(k), p, entry)
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
if (!p.ptr.cached &&
p.crc.compression_type != BCH_COMPRESSION_NONE &&
bch2_dev_in_target(c, p.ptr.dev, data_opts.target))
@@ -301,12 +302,13 @@ static void move_free(struct closure *cl)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
struct moving_context *ctxt = io->write.ctxt;
+ struct bvec_iter_all iter;
struct bio_vec *bv;
int i;
bch2_disk_reservation_put(io->write.op.c, &io->write.op.res);
- bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i)
+ bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i, iter)
if (bv->bv_page)
__free_page(bv->bv_page);
@@ -394,14 +396,16 @@ static int bch2_move_extent(struct bch_fs *c,
struct moving_context *ctxt,
struct write_point_specifier wp,
struct bch_io_opts io_opts,
- struct bkey_s_c_extent e,
+ enum btree_id btree_id,
+ struct bkey_s_c k,
enum data_cmd data_cmd,
struct data_opts data_opts)
{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
struct moving_io *io;
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
- unsigned sectors = e.k->size, pages;
+ unsigned sectors = k.k->size, pages;
int ret = -ENOMEM;
move_ctxt_wait_event(ctxt,
@@ -413,7 +417,7 @@ static int bch2_move_extent(struct bch_fs *c,
SECTORS_IN_FLIGHT_PER_DEVICE);
/* write path might have to decompress data: */
- extent_for_each_ptr_decode(e, p, entry)
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
sectors = max_t(unsigned, sectors, p.crc.uncompressed_size);
pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
@@ -423,37 +427,37 @@ static int bch2_move_extent(struct bch_fs *c,
goto err;
io->write.ctxt = ctxt;
- io->read_sectors = e.k->size;
- io->write_sectors = e.k->size;
+ io->read_sectors = k.k->size;
+ io->write_sectors = k.k->size;
bio_init(&io->write.op.wbio.bio, io->bi_inline_vecs, pages);
bio_set_prio(&io->write.op.wbio.bio,
IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
- io->write.op.wbio.bio.bi_iter.bi_size = sectors << 9;
- bch2_bio_map(&io->write.op.wbio.bio, NULL);
- if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, GFP_KERNEL))
+ if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9,
+ GFP_KERNEL))
goto err_free;
- io->rbio.opts = io_opts;
+ io->rbio.c = c;
+ io->rbio.opts = io_opts;
bio_init(&io->rbio.bio, io->bi_inline_vecs, pages);
io->rbio.bio.bi_vcnt = pages;
bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
io->rbio.bio.bi_iter.bi_size = sectors << 9;
bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0);
- io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(e.k);
+ io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k);
io->rbio.bio.bi_end_io = move_read_endio;
ret = bch2_migrate_write_init(c, &io->write, wp, io_opts,
- data_cmd, data_opts, e.s_c);
+ data_cmd, data_opts, btree_id, k);
if (ret)
goto err_free_pages;
atomic64_inc(&ctxt->stats->keys_moved);
- atomic64_add(e.k->size, &ctxt->stats->sectors_moved);
+ atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
- trace_move_extent(e.k);
+ trace_move_extent(k.k);
atomic_add(io->read_sectors, &ctxt->read_sectors);
list_add_tail(&io->list, &ctxt->reads);
@@ -463,7 +467,7 @@ static int bch2_move_extent(struct bch_fs *c,
* ctxt when doing wakeup
*/
closure_get(&ctxt->cl);
- bch2_read_extent(c, &io->rbio, e.s_c,
+ bch2_read_extent(c, &io->rbio, k, 0,
BCH_READ_NODECODE|
BCH_READ_LAST_FRAGMENT);
return 0;
@@ -472,20 +476,21 @@ err_free_pages:
err_free:
kfree(io);
err:
- trace_move_alloc_fail(e.k);
+ trace_move_alloc_fail(k.k);
return ret;
}
-int bch2_move_data(struct bch_fs *c,
- struct bch_ratelimit *rate,
- struct write_point_specifier wp,
- struct bpos start,
- struct bpos end,
- move_pred_fn pred, void *arg,
- struct bch_move_stats *stats)
+static int __bch2_move_data(struct bch_fs *c,
+ struct moving_context *ctxt,
+ struct bch_ratelimit *rate,
+ struct write_point_specifier wp,
+ struct bpos start,
+ struct bpos end,
+ move_pred_fn pred, void *arg,
+ struct bch_move_stats *stats,
+ enum btree_id btree_id)
{
bool kthread = (current->flags & PF_KTHREAD) != 0;
- struct moving_context ctxt = { .stats = stats };
struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
BKEY_PADDED(k) tmp;
struct btree_trans trans;
@@ -496,17 +501,13 @@ int bch2_move_data(struct bch_fs *c,
u64 delay, cur_inum = U64_MAX;
int ret = 0, ret2;
- closure_init_stack(&ctxt.cl);
- INIT_LIST_HEAD(&ctxt.reads);
- init_waitqueue_head(&ctxt.wait);
-
bch2_trans_init(&trans, c, 0, 0);
stats->data_type = BCH_DATA_USER;
- stats->btree_id = BTREE_ID_EXTENTS;
+ stats->btree_id = btree_id;
stats->pos = POS_MIN;
- iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, start,
+ iter = bch2_trans_get_iter(&trans, btree_id, start,
BTREE_ITER_PREFETCH);
if (rate)
@@ -531,7 +532,7 @@ int bch2_move_data(struct bch_fs *c,
if (unlikely(freezing(current))) {
bch2_trans_unlock(&trans);
- move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads));
+ move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
try_to_freeze();
}
} while (delay);
@@ -548,7 +549,7 @@ peek:
if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
break;
- if (!bkey_extent_is_data(k.k))
+ if (!bkey_extent_is_direct_data(k.k))
goto next_nondata;
if (cur_inum != k.k->p.inode) {
@@ -582,13 +583,12 @@ peek:
k = bkey_i_to_s_c(&tmp.k);
bch2_trans_unlock(&trans);
- ret2 = bch2_move_extent(c, &ctxt, wp, io_opts,
- bkey_s_c_to_extent(k),
+ ret2 = bch2_move_extent(c, ctxt, wp, io_opts, btree_id, k,
data_cmd, data_opts);
if (ret2) {
if (ret2 == -ENOMEM) {
/* memory allocation failure, wait for some IO to finish */
- bch2_move_ctxt_wait_for_io(&ctxt);
+ bch2_move_ctxt_wait_for_io(ctxt);
continue;
}
@@ -606,7 +606,32 @@ next_nondata:
bch2_trans_cond_resched(&trans);
}
out:
- bch2_trans_exit(&trans);
+ ret = bch2_trans_exit(&trans) ?: ret;
+
+ return ret;
+}
+
+int bch2_move_data(struct bch_fs *c,
+ struct bch_ratelimit *rate,
+ struct write_point_specifier wp,
+ struct bpos start,
+ struct bpos end,
+ move_pred_fn pred, void *arg,
+ struct bch_move_stats *stats)
+{
+ struct moving_context ctxt = { .stats = stats };
+ int ret;
+
+ closure_init_stack(&ctxt.cl);
+ INIT_LIST_HEAD(&ctxt.reads);
+ init_waitqueue_head(&ctxt.wait);
+
+ stats->data_type = BCH_DATA_USER;
+
+ ret = __bch2_move_data(c, &ctxt, rate, wp, start, end,
+ pred, arg, stats, BTREE_ID_EXTENTS) ?:
+ __bch2_move_data(c, &ctxt, rate, wp, start, end,
+ pred, arg, stats, BTREE_ID_REFLINK);
move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads));
closure_sync(&ctxt.cl);
diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h
index 71b3d2b2ddb6..0acd1720d4f8 100644
--- a/fs/bcachefs/move.h
+++ b/fs/bcachefs/move.h
@@ -25,6 +25,7 @@ struct data_opts {
};
struct migrate_write {
+ enum btree_id btree_id;
enum data_cmd data_cmd;
struct data_opts data_opts;
@@ -44,7 +45,7 @@ int bch2_migrate_write_init(struct bch_fs *, struct migrate_write *,
struct write_point_specifier,
struct bch_io_opts,
enum data_cmd, struct data_opts,
- struct bkey_s_c);
+ enum btree_id, struct bkey_s_c);
typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *,
struct bkey_s_c,
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index b13af5662f22..710296044194 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -69,26 +69,19 @@ static bool __copygc_pred(struct bch_dev *ca,
struct bkey_s_c k)
{
copygc_heap *h = &ca->copygc_heap;
+ const struct bch_extent_ptr *ptr =
+ bch2_bkey_has_device(k, ca->dev_idx);
- switch (k.k->type) {
- case KEY_TYPE_extent: {
- struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
- const struct bch_extent_ptr *ptr =
- bch2_extent_has_device(e, ca->dev_idx);
+ if (ptr) {
+ struct copygc_heap_entry search = { .offset = ptr->offset };
- if (ptr) {
- struct copygc_heap_entry search = { .offset = ptr->offset };
+ ssize_t i = eytzinger0_find_le(h->data, h->used,
+ sizeof(h->data[0]),
+ bucket_offset_cmp, &search);
- ssize_t i = eytzinger0_find_le(h->data, h->used,
- sizeof(h->data[0]),
- bucket_offset_cmp, &search);
-
- return (i >= 0 &&
- ptr->offset < h->data[i].offset + ca->mi.bucket_size &&
- ptr->gen == h->data[i].gen);
- }
- break;
- }
+ return (i >= 0 &&
+ ptr->offset < h->data[i].offset + ca->mi.bucket_size &&
+ ptr->gen == h->data[i].gen);
}
return false;
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index c6ec9f7effe5..97a782f44f6e 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -258,6 +258,11 @@ enum opt_type {
OPT_BOOL(), \
NO_SB_OPT, false, \
NULL, "Don\'t start filesystem, only open devices") \
+ x(reconstruct_alloc, u8, \
+ OPT_MOUNT, \
+ OPT_BOOL(), \
+ NO_SB_OPT, false, \
+ NULL, "Reconstruct alloc btree") \
x(version_upgrade, u8, \
OPT_MOUNT, \
OPT_BOOL(), \
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index 6bdd68177ac9..84b3fb6eb101 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -38,20 +38,15 @@ void bch2_rebalance_add_key(struct bch_fs *c,
struct bkey_s_c k,
struct bch_io_opts *io_opts)
{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
- struct bkey_s_c_extent e;
-
- if (!bkey_extent_is_data(k.k))
- return;
if (!io_opts->background_target &&
!io_opts->background_compression)
return;
- e = bkey_s_c_to_extent(k);
-
- extent_for_each_ptr_decode(e, p, entry)
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
if (rebalance_ptr_pred(c, p, io_opts)) {
struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
@@ -74,30 +69,26 @@ static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg,
struct bch_io_opts *io_opts,
struct data_opts *data_opts)
{
- switch (k.k->type) {
- case KEY_TYPE_extent: {
- struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
-
- /* Make sure we have room to add a new pointer: */
- if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX >
- BKEY_EXTENT_VAL_U64s_MAX)
- return DATA_SKIP;
-
- extent_for_each_ptr_decode(e, p, entry)
- if (rebalance_ptr_pred(c, p, io_opts))
- goto found;
-
- return DATA_SKIP;
-found:
- data_opts->target = io_opts->background_target;
- data_opts->btree_insert_flags = 0;
- return DATA_ADD_REPLICAS;
- }
- default:
- return DATA_SKIP;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ unsigned nr_replicas = 0;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ nr_replicas += !p.ptr.cached;
+
+ if (rebalance_ptr_pred(c, p, io_opts))
+ goto found;
}
+
+ if (nr_replicas < io_opts->data_replicas)
+ goto found;
+
+ return DATA_SKIP;
+found:
+ data_opts->target = io_opts->background_target;
+ data_opts->btree_insert_flags = 0;
+ return DATA_ADD_REPLICAS;
}
struct rebalance_work {
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index e0df2c0a4fdf..98d9a1432e50 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -24,6 +24,42 @@
#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
+/* iterate over keys read from the journal: */
+
+struct journal_iter bch2_journal_iter_init(struct journal_keys *keys,
+ enum btree_id id)
+{
+ return (struct journal_iter) {
+ .keys = keys,
+ .k = keys->d,
+ .btree_id = id,
+ };
+}
+
+struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
+{
+ while (1) {
+ if (iter->k == iter->keys->d + iter->keys->nr)
+ return bkey_s_c_null;
+
+ if (iter->k->btree_id == iter->btree_id)
+ return bkey_i_to_s_c(iter->k->k);
+
+ iter->k++;
+ }
+}
+
+struct bkey_s_c bch2_journal_iter_next(struct journal_iter *iter)
+{
+ if (iter->k == iter->keys->d + iter->keys->nr)
+ return bkey_s_c_null;
+
+ iter->k++;
+ return bch2_journal_iter_peek(iter);
+}
+
/* sort and dedup all keys in the journal: */
static void journal_entries_free(struct list_head *list)
@@ -200,7 +236,8 @@ static void replay_now_at(struct journal *j, u64 seq)
bch2_journal_pin_put(j, j->replay_journal_seq++);
}
-static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k)
+static int bch2_extent_replay_key(struct bch_fs *c, enum btree_id btree_id,
+ struct bkey_i *k)
{
struct btree_trans trans;
struct btree_iter *iter, *split_iter;
@@ -211,14 +248,21 @@ static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k)
struct disk_reservation disk_res =
bch2_disk_reservation_init(c, 0);
struct bkey_i *split;
- bool split_compressed = false;
+ struct bpos atomic_end;
+ /*
+ * Some extents aren't equivalent - w.r.t. what the triggers do
+ * - if they're split:
+ */
+ bool remark_if_split = bch2_extent_is_compressed(bkey_i_to_s_c(k)) ||
+ k->k.type == KEY_TYPE_reflink_p;
+ bool remark = false;
int ret;
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
retry:
bch2_trans_begin(&trans);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+ iter = bch2_trans_get_iter(&trans, btree_id,
bkey_start_pos(&k->k),
BTREE_ITER_INTENT);
@@ -237,29 +281,33 @@ retry:
if (ret)
goto err;
- if (!split_compressed &&
- bch2_extent_is_compressed(bkey_i_to_s_c(k)) &&
- !bch2_extent_is_atomic(k, split_iter)) {
+ ret = bch2_extent_atomic_end(split_iter, k, &atomic_end);
+ if (ret)
+ goto err;
+
+ if (!remark &&
+ remark_if_split &&
+ bkey_cmp(atomic_end, k->k.p) < 0) {
ret = bch2_disk_reservation_add(c, &disk_res,
k->k.size *
bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(k)),
BCH_DISK_RESERVATION_NOFAIL);
BUG_ON(ret);
- split_compressed = true;
+ remark = true;
}
bkey_copy(split, k);
bch2_cut_front(split_iter->pos, split);
- bch2_extent_trim_atomic(split, split_iter);
+ bch2_cut_back(atomic_end, &split->k);
bch2_trans_update(&trans, BTREE_INSERT_ENTRY(split_iter, split));
bch2_btree_iter_set_pos(iter, split->k.p);
} while (bkey_cmp(iter->pos, k->k.p) < 0);
- if (split_compressed) {
+ if (remark) {
ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(k),
- -((s64) k->k.size),
+ 0, -((s64) k->k.size),
BCH_BUCKET_MARK_OVERWRITE) ?:
bch2_trans_commit(&trans, &disk_res, NULL,
BTREE_INSERT_ATOMIC|
@@ -299,22 +347,17 @@ static int bch2_journal_replay(struct bch_fs *c,
for_each_journal_key(keys, i) {
replay_now_at(j, keys.journal_seq_base + i->journal_seq);
- switch (i->btree_id) {
- case BTREE_ID_ALLOC:
+ if (i->btree_id == BTREE_ID_ALLOC)
ret = bch2_alloc_replay_key(c, i->k);
- break;
- case BTREE_ID_EXTENTS:
- ret = bch2_extent_replay_key(c, i->k);
- break;
- default:
+ else if (btree_node_type_is_extents(i->btree_id))
+ ret = bch2_extent_replay_key(c, i->btree_id, i->k);
+ else
ret = bch2_btree_insert(c, i->btree_id, i->k,
NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW|
BTREE_INSERT_JOURNAL_REPLAY|
BTREE_INSERT_NOMARK);
- break;
- }
if (ret) {
bch_err(c, "journal replay: error %d while replaying key",
@@ -615,7 +658,7 @@ static int read_btree_roots(struct bch_fs *c)
continue;
if (i == BTREE_ID_ALLOC &&
- test_reconstruct_alloc(c)) {
+ c->opts.reconstruct_alloc) {
c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
continue;
}
@@ -892,7 +935,9 @@ out:
ret = 0;
err:
fsck_err:
+ set_bit(BCH_FS_FSCK_DONE, &c->flags);
bch2_flush_fsck_errs(c);
+
journal_keys_free(&journal_keys);
journal_entries_free(&journal_entries);
kfree(clean);
diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h
index a69260d6165a..479ea46f8dcb 100644
--- a/fs/bcachefs/recovery.h
+++ b/fs/bcachefs/recovery.h
@@ -18,6 +18,17 @@ struct journal_keys {
#define for_each_journal_key(keys, i) \
for (i = (keys).d; i < (keys).d + (keys).nr; (i)++)
+struct journal_iter {
+ struct journal_keys *keys;
+ struct journal_key *k;
+ enum btree_id btree_id;
+};
+
+struct journal_iter bch2_journal_iter_init(struct journal_keys *,
+ enum btree_id);
+struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *);
+struct bkey_s_c bch2_journal_iter_next(struct journal_iter *);
+
int bch2_fs_recovery(struct bch_fs *);
int bch2_fs_initialize(struct bch_fs *);
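As a usage sketch for the journal iterator declared above, walking every journal key that belongs to one btree looks roughly like the following (the helper itself is hypothetical, not part of this patch):

static void walk_journal_keys(struct journal_keys *keys, enum btree_id id)
{
	struct journal_iter iter = bch2_journal_iter_init(keys, id);
	struct bkey_s_c k;

	/* peek returns bkey_s_c_null (k.k == NULL) once the keys are exhausted */
	for (k = bch2_journal_iter_peek(&iter);
	     k.k;
	     k = bch2_journal_iter_next(&iter)) {
		/* process k */
	}
}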
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
new file mode 100644
index 000000000000..dcca9c1d0f47
--- /dev/null
+++ b/fs/bcachefs/reflink.c
@@ -0,0 +1,300 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "btree_update.h"
+#include "extents.h"
+#include "fs.h"
+#include "fs-io.h"
+#include "reflink.h"
+
+#include <linux/sched/signal.h>
+
+/* reflink pointers */
+
+const char *bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+
+ if (bkey_val_bytes(p.k) != sizeof(*p.v))
+ return "incorrect value size";
+
+ return NULL;
+}
+
+void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+
+ pr_buf(out, "idx %llu", le64_to_cpu(p.v->idx));
+}
+
+enum merge_result bch2_reflink_p_merge(struct bch_fs *c,
+ struct bkey_s _l, struct bkey_s _r)
+{
+ struct bkey_s_reflink_p l = bkey_s_to_reflink_p(_l);
+ struct bkey_s_reflink_p r = bkey_s_to_reflink_p(_r);
+
+ if (le64_to_cpu(l.v->idx) + l.k->size != le64_to_cpu(r.v->idx))
+ return BCH_MERGE_NOMERGE;
+
+ if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) {
+ bch2_key_resize(l.k, KEY_SIZE_MAX);
+ __bch2_cut_front(l.k->p, _r);
+ return BCH_MERGE_PARTIAL;
+ }
+
+ bch2_key_resize(l.k, l.k->size + r.k->size);
+
+ return BCH_MERGE_MERGE;
+}
+
+/* indirect extents */
+
+const char *bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
+
+ if (bkey_val_bytes(r.k) < sizeof(*r.v))
+ return "incorrect value size";
+
+ return bch2_bkey_ptrs_invalid(c, k);
+}
+
+void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
+
+ pr_buf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount));
+
+ bch2_bkey_ptrs_to_text(out, c, k);
+}
+
+/*
+ * bch2_remap_range() depends on bch2_extent_update(), which depends on various
+ * things tied to the linux vfs for inode updates, for now:
+ */
+#ifndef NO_BCACHEFS_FS
+
+static int bch2_make_extent_indirect(struct btree_trans *trans,
+ struct btree_iter *extent_iter,
+ struct bkey_i_extent *e)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter *reflink_iter;
+ struct bkey_s_c k;
+ struct bkey_i_reflink_v *r_v;
+ struct bkey_i_reflink_p *r_p;
+ int ret;
+
+ for_each_btree_key(trans, reflink_iter, BTREE_ID_REFLINK,
+ POS(0, c->reflink_hint),
+ BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) {
+ if (reflink_iter->pos.inode) {
+ bch2_btree_iter_set_pos(reflink_iter, POS_MIN);
+ continue;
+ }
+
+ if (bkey_deleted(k.k) && e->k.size <= k.k->size)
+ break;
+ }
+
+ if (ret)
+ goto err;
+
+ /* rewind iter to start of hole, if necessary: */
+ bch2_btree_iter_set_pos(reflink_iter, bkey_start_pos(k.k));
+
+ r_v = bch2_trans_kmalloc(trans, sizeof(*r_v) + bkey_val_bytes(&e->k));
+ ret = PTR_ERR_OR_ZERO(r_v);
+ if (ret)
+ goto err;
+
+ bkey_reflink_v_init(&r_v->k_i);
+ r_v->k.p = reflink_iter->pos;
+ bch2_key_resize(&r_v->k, e->k.size);
+ r_v->k.version = e->k.version;
+
+ set_bkey_val_u64s(&r_v->k, bkey_val_u64s(&r_v->k) +
+ bkey_val_u64s(&e->k));
+ r_v->v.refcount = 0;
+ memcpy(r_v->v.start, e->v.start, bkey_val_bytes(&e->k));
+
+ bch2_trans_update(trans, BTREE_INSERT_ENTRY(reflink_iter, &r_v->k_i));
+
+ r_p = bch2_trans_kmalloc(trans, sizeof(*r_p));
+ ret = PTR_ERR_OR_ZERO(r_p);
+ if (ret)
+ goto err;
+
+ e->k.type = KEY_TYPE_reflink_p;
+ r_p = bkey_i_to_reflink_p(&e->k_i);
+ set_bkey_val_bytes(&r_p->k, sizeof(r_p->v));
+ r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k));
+
+ bch2_trans_update(trans, BTREE_INSERT_ENTRY(extent_iter, &r_p->k_i));
+err:
+ if (!IS_ERR(reflink_iter)) {
+ c->reflink_hint = reflink_iter->pos.offset;
+ bch2_trans_iter_put(trans, reflink_iter);
+ }
+
+ return ret;
+}
+
+static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end)
+{
+ struct bkey_s_c k = bch2_btree_iter_peek(iter);
+
+ while (1) {
+ if (bkey_err(k))
+ return k;
+
+ if (bkey_cmp(iter->pos, end) >= 0)
+ return bkey_s_c_null;
+
+ if (k.k->type == KEY_TYPE_extent ||
+ k.k->type == KEY_TYPE_reflink_p)
+ return k;
+
+ k = bch2_btree_iter_next(iter);
+ }
+}
+
+s64 bch2_remap_range(struct bch_fs *c,
+ struct bch_inode_info *dst_inode,
+ struct bpos dst_start, struct bpos src_start,
+ u64 remap_sectors, u64 new_i_size)
+{
+ struct btree_trans trans;
+ struct btree_iter *dst_iter, *src_iter;
+ struct bkey_s_c src_k;
+ BKEY_PADDED(k) new_dst, new_src;
+ struct bpos dst_end = dst_start, src_end = src_start;
+ struct bpos dst_want, src_want;
+ u64 src_done, dst_done;
+ int ret = 0;
+
+ if (!(c->sb.features & (1ULL << BCH_FEATURE_REFLINK))) {
+ mutex_lock(&c->sb_lock);
+ if (!(c->sb.features & (1ULL << BCH_FEATURE_REFLINK))) {
+ c->disk_sb.sb->features[0] |=
+ cpu_to_le64(1ULL << BCH_FEATURE_REFLINK);
+
+ bch2_write_super(c);
+ }
+ mutex_unlock(&c->sb_lock);
+ }
+
+ dst_end.offset += remap_sectors;
+ src_end.offset += remap_sectors;
+
+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096);
+
+ src_iter = __bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, src_start,
+ BTREE_ITER_INTENT, 1);
+ dst_iter = __bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, dst_start,
+ BTREE_ITER_INTENT, 2);
+
+ while (1) {
+ bch2_trans_begin_updates(&trans);
+ trans.mem_top = 0;
+
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ goto err;
+ }
+
+ src_k = get_next_src(src_iter, src_end);
+ ret = bkey_err(src_k);
+ if (ret)
+ goto btree_err;
+
+ src_done = bpos_min(src_iter->pos, src_end).offset -
+ src_start.offset;
+ dst_want = POS(dst_start.inode, dst_start.offset + src_done);
+
+ if (bkey_cmp(dst_iter->pos, dst_want) < 0) {
+ ret = bch2_fpunch_at(&trans, dst_iter, dst_want,
+ dst_inode, new_i_size);
+ if (ret)
+ goto btree_err;
+ continue;
+ }
+
+ BUG_ON(bkey_cmp(dst_iter->pos, dst_want));
+
+ if (!bkey_cmp(dst_iter->pos, dst_end))
+ break;
+
+ if (src_k.k->type == KEY_TYPE_extent) {
+ bkey_reassemble(&new_src.k, src_k);
+ src_k = bkey_i_to_s_c(&new_src.k);
+
+ bch2_cut_front(src_iter->pos, &new_src.k);
+ bch2_cut_back(src_end, &new_src.k.k);
+
+ ret = bch2_make_extent_indirect(&trans, src_iter,
+ bkey_i_to_extent(&new_src.k));
+ if (ret)
+ goto btree_err;
+
+ BUG_ON(src_k.k->type != KEY_TYPE_reflink_p);
+ }
+
+ if (src_k.k->type == KEY_TYPE_reflink_p) {
+ struct bkey_s_c_reflink_p src_p =
+ bkey_s_c_to_reflink_p(src_k);
+ struct bkey_i_reflink_p *dst_p =
+ bkey_reflink_p_init(&new_dst.k);
+
+ u64 offset = le64_to_cpu(src_p.v->idx) +
+ (src_iter->pos.offset -
+ bkey_start_offset(src_k.k));
+
+ dst_p->v.idx = cpu_to_le64(offset);
+ } else {
+ BUG();
+ }
+
+ new_dst.k.k.p = dst_iter->pos;
+ bch2_key_resize(&new_dst.k.k,
+ min(src_k.k->p.offset - src_iter->pos.offset,
+ dst_end.offset - dst_iter->pos.offset));
+
+ ret = bch2_extent_update(&trans, dst_inode, NULL, NULL,
+ dst_iter, &new_dst.k,
+ new_i_size, false, true, NULL);
+ if (ret)
+ goto btree_err;
+
+ dst_done = dst_iter->pos.offset - dst_start.offset;
+ src_want = POS(src_start.inode, src_start.offset + dst_done);
+ bch2_btree_iter_set_pos(src_iter, src_want);
+btree_err:
+ if (ret == -EINTR)
+ ret = 0;
+ if (ret)
+ goto err;
+ }
+
+ BUG_ON(bkey_cmp(dst_iter->pos, dst_end));
+err:
+ BUG_ON(bkey_cmp(dst_iter->pos, dst_end) > 0);
+
+ dst_done = dst_iter->pos.offset - dst_start.offset;
+ new_i_size = min(dst_iter->pos.offset << 9, new_i_size);
+
+ ret = bch2_trans_exit(&trans) ?: ret;
+
+ mutex_lock(&dst_inode->ei_update_lock);
+ if (dst_inode->v.i_size < new_i_size) {
+ i_size_write(&dst_inode->v, new_i_size);
+ ret = bch2_write_inode_size(c, dst_inode, new_i_size,
+ ATTR_MTIME|ATTR_CTIME);
+ }
+ mutex_unlock(&dst_inode->ei_update_lock);
+
+ return dst_done ?: ret;
+}
+
+#endif /* NO_BCACHEFS_FS */
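A note on the indirection implemented above: a reflink_p value stores only an index into the shared reflink btree (whose keys all live at inode 0), so resolving an offset within the pointer is a single addition, the same arithmetic bch2_remap_range() uses when copying a reflink_p. A minimal sketch, with a hypothetical helper name:

/*
 * Position in BTREE_ID_REFLINK of the data backing a given offset within
 * a reflink pointer:
 */
static struct bpos reflink_p_pos(struct bkey_s_c_reflink_p p,
				 unsigned offset_into_extent)
{
	return POS(0, le64_to_cpu(p.v->idx) + offset_into_extent);
}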
diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h
new file mode 100644
index 000000000000..327618c36d33
--- /dev/null
+++ b/fs/bcachefs/reflink.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REFLINK_H
+#define _BCACHEFS_REFLINK_H
+
+const char *bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *,
+ struct bkey_s_c);
+enum merge_result bch2_reflink_p_merge(struct bch_fs *,
+ struct bkey_s, struct bkey_s);
+
+#define bch2_bkey_ops_reflink_p (struct bkey_ops) { \
+ .key_invalid = bch2_reflink_p_invalid, \
+ .val_to_text = bch2_reflink_p_to_text, \
+ .key_merge = bch2_reflink_p_merge, \
+}
+
+const char *bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *,
+ struct bkey_s_c);
+
+#define bch2_bkey_ops_reflink_v (struct bkey_ops) { \
+ .key_invalid = bch2_reflink_v_invalid, \
+ .val_to_text = bch2_reflink_v_to_text, \
+}
+
+#ifndef NO_BCACHEFS_FS
+s64 bch2_remap_range(struct bch_fs *, struct bch_inode_info *,
+ struct bpos, struct bpos, u64, u64);
+#endif /* NO_BCACHEFS_FS */
+
+#endif /* _BCACHEFS_REFLINK_H */
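The bch2_bkey_ops_reflink_* macros above follow the naming convention the bkey dispatch table depends on: bkey_methods.c builds its ops array from the BCH_BKEY_TYPES() x-macro, roughly as below (shown from memory for context only; the exact table is not part of this hunk):

const struct bkey_ops bch2_bkey_ops[] = {
#define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name,
	BCH_BKEY_TYPES()
#undef x
};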
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index 4818453c015a..bb9da2bb5a92 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -16,11 +16,16 @@ static inline int u8_cmp(u8 l, u8 r)
return cmp_int(l, r);
}
-static void verify_replicas_entry_sorted(struct bch_replicas_entry *e)
+static void verify_replicas_entry(struct bch_replicas_entry *e)
{
-#ifdef CONFIG_BCACHES_DEBUG
+#ifdef CONFIG_BCACHEFS_DEBUG
unsigned i;
+ BUG_ON(e->data_type >= BCH_DATA_NR);
+ BUG_ON(!e->nr_devs);
+ BUG_ON(e->nr_required > 1 &&
+ e->nr_required >= e->nr_devs);
+
for (i = 0; i + 1 < e->nr_devs; i++)
BUG_ON(e->devs[i] >= e->devs[i + 1]);
#endif
@@ -80,7 +85,7 @@ static void extent_to_replicas(struct bkey_s_c k,
continue;
if (p.ec_nr) {
- r->nr_devs = 0;
+ r->nr_required = 0;
break;
}
@@ -113,6 +118,7 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e,
extent_to_replicas(k, e);
break;
case KEY_TYPE_extent:
+ case KEY_TYPE_reflink_v:
e->data_type = BCH_DATA_USER;
extent_to_replicas(k, e);
break;
@@ -157,7 +163,7 @@ cpu_replicas_add_entry(struct bch_replicas_cpu *old,
};
BUG_ON(!new_entry->data_type);
- verify_replicas_entry_sorted(new_entry);
+ verify_replicas_entry(new_entry);
new.entries = kcalloc(new.nr, new.entry_size, GFP_NOIO);
if (!new.entries)
@@ -184,7 +190,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
if (unlikely(entry_size > r->entry_size))
return -1;
- verify_replicas_entry_sorted(search);
+ verify_replicas_entry(search);
#define entry_cmp(_l, _r, size) memcmp(_l, _r, entry_size)
idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
@@ -215,7 +221,7 @@ static bool bch2_replicas_marked_locked(struct bch_fs *c,
if (!search->nr_devs)
return true;
- verify_replicas_entry_sorted(search);
+ verify_replicas_entry(search);
return __replicas_has_entry(&c->replicas, search) &&
(!check_gc_replicas ||
@@ -359,6 +365,8 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c,
struct bch_replicas_cpu new_r, new_gc;
int ret = -ENOMEM;
+ verify_replicas_entry(new_entry);
+
memset(&new_r, 0, sizeof(new_r));
memset(&new_gc, 0, sizeof(new_gc));
@@ -874,9 +882,8 @@ static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_fi
goto err;
err = "invalid replicas entry: bad nr_required";
- if (!e->nr_required ||
- (e->nr_required > 1 &&
- e->nr_required >= e->nr_devs))
+ if (e->nr_required > 1 &&
+ e->nr_required >= e->nr_devs)
goto err;
err = "invalid replicas entry: invalid device";
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index 1779f755b21d..091bf7a89577 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -72,7 +72,7 @@ static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx,
ctx->crc32c = crc32c(~0, &info->crc_key, sizeof(info->crc_key));
break;
case BCH_STR_HASH_CRC64:
- ctx->crc64 = bch2_crc64_update(~0, &info->crc_key, sizeof(info->crc_key));
+ ctx->crc64 = crc64_be(~0, &info->crc_key, sizeof(info->crc_key));
break;
case BCH_STR_HASH_SIPHASH:
SipHash24_Init(&ctx->siphash, &info->siphash_key);
@@ -91,7 +91,7 @@ static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx,
ctx->crc32c = crc32c(ctx->crc32c, data, len);
break;
case BCH_STR_HASH_CRC64:
- ctx->crc64 = bch2_crc64_update(ctx->crc64, data, len);
+ ctx->crc64 = crc64_be(ctx->crc64, data, len);
break;
case BCH_STR_HASH_SIPHASH:
SipHash24_Update(&ctx->siphash, data, len);
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index 5e1ae7e425ff..3043def884ab 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -470,9 +470,8 @@ reread:
bio_reset(sb->bio);
bio_set_dev(sb->bio, sb->bdev);
sb->bio->bi_iter.bi_sector = offset;
- sb->bio->bi_iter.bi_size = PAGE_SIZE << sb->page_order;
bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
- bch2_bio_map(sb->bio, sb->sb);
+ bch2_bio_map(sb->bio, sb->sb, PAGE_SIZE << sb->page_order);
if (submit_bio_wait(sb->bio))
return "IO error";
@@ -574,13 +573,12 @@ int bch2_read_super(const char *path, struct bch_opts *opts,
bio_reset(sb->bio);
bio_set_dev(sb->bio, sb->bdev);
sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR;
- sb->bio->bi_iter.bi_size = sizeof(struct bch_sb_layout);
bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
/*
* use sb buffer to read layout, since sb buffer is page aligned but
* layout won't be:
*/
- bch2_bio_map(sb->bio, sb->sb);
+ bch2_bio_map(sb->bio, sb->sb, sizeof(struct bch_sb_layout));
err = "IO error";
if (submit_bio_wait(sb->bio))
@@ -650,11 +648,10 @@ static void read_back_super(struct bch_fs *c, struct bch_dev *ca)
bio_reset(bio);
bio_set_dev(bio, ca->disk_sb.bdev);
bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]);
- bio->bi_iter.bi_size = PAGE_SIZE;
bio->bi_end_io = write_super_endio;
bio->bi_private = ca;
bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC|REQ_META);
- bch2_bio_map(bio, ca->sb_read_scratch);
+ bch2_bio_map(bio, ca->sb_read_scratch, PAGE_SIZE);
this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_SB],
bio_sectors(bio));
@@ -677,13 +674,12 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
bio_reset(bio);
bio_set_dev(bio, ca->disk_sb.bdev);
bio->bi_iter.bi_sector = le64_to_cpu(sb->offset);
- bio->bi_iter.bi_size =
- roundup((size_t) vstruct_bytes(sb),
- bdev_logical_block_size(ca->disk_sb.bdev));
bio->bi_end_io = write_super_endio;
bio->bi_private = ca;
bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META);
- bch2_bio_map(bio, sb);
+ bch2_bio_map(bio, sb,
+ roundup((size_t) vstruct_bytes(sb),
+ bdev_logical_block_size(ca->disk_sb.bdev)));
this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_SB],
bio_sectors(bio));
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 7e1b1bf43c31..4145832f4856 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -309,6 +309,7 @@ void bch2_fs_read_only(struct bch_fs *c)
*/
percpu_ref_kill(&c->writes);
+ cancel_work_sync(&c->ec_stripe_delete_work);
cancel_delayed_work(&c->pd_controllers_update);
/*
@@ -398,6 +399,8 @@ static int bch2_fs_read_write_late(struct bch_fs *c)
schedule_delayed_work(&c->pd_controllers_update, 5 * HZ);
+ schedule_work(&c->ec_stripe_delete_work);
+
return 0;
}
@@ -491,6 +494,7 @@ static void bch2_fs_free(struct bch_fs *c)
bch2_fs_ec_exit(c);
bch2_fs_encryption_exit(c);
bch2_fs_io_exit(c);
+ bch2_fs_btree_iter_exit(c);
bch2_fs_btree_cache_exit(c);
bch2_fs_journal_exit(&c->journal);
bch2_io_clock_exit(&c->io_clock[WRITE]);
@@ -502,7 +506,6 @@ static void bch2_fs_free(struct bch_fs *c)
free_percpu(c->usage[0]);
kfree(c->usage_base);
free_percpu(c->pcpu);
- mempool_exit(&c->btree_iters_pool);
mempool_exit(&c->btree_bounce_pool);
bioset_exit(&c->btree_bio);
mempool_exit(&c->btree_interior_update_pool);
@@ -755,15 +758,12 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
!(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
btree_bytes(c)) ||
- mempool_init_kmalloc_pool(&c->btree_iters_pool, 1,
- sizeof(struct btree_iter) * BTREE_ITER_MAX +
- sizeof(struct btree_insert_entry) *
- (BTREE_ITER_MAX + 4)) ||
bch2_io_clock_init(&c->io_clock[READ]) ||
bch2_io_clock_init(&c->io_clock[WRITE]) ||
bch2_fs_journal_init(&c->journal) ||
bch2_fs_replicas_init(c) ||
bch2_fs_btree_cache_init(c) ||
+ bch2_fs_btree_iter_init(c) ||
bch2_fs_io_init(c) ||
bch2_fs_encryption_init(c) ||
bch2_fs_compress_init(c) ||
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index 2aa3097aeedb..2cc433ec0e3a 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -504,48 +504,32 @@ size_t bch2_pd_controller_print_debug(struct bch_pd_controller *pd, char *buf)
/* misc: */
-void bch2_bio_map(struct bio *bio, void *base)
+void bch2_bio_map(struct bio *bio, void *base, size_t size)
{
- size_t size = bio->bi_iter.bi_size;
- struct bio_vec *bv = bio->bi_io_vec;
-
- BUG_ON(!bio->bi_iter.bi_size);
- BUG_ON(bio->bi_vcnt);
- BUG_ON(!bio->bi_max_vecs);
-
- bv->bv_offset = base ? offset_in_page(base) : 0;
- goto start;
-
- for (; size; bio->bi_vcnt++, bv++) {
- BUG_ON(bio->bi_vcnt >= bio->bi_max_vecs);
-
- bv->bv_offset = 0;
-start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset,
- size);
- if (base) {
- bv->bv_page = is_vmalloc_addr(base)
+ while (size) {
+ struct page *page = is_vmalloc_addr(base)
? vmalloc_to_page(base)
: virt_to_page(base);
+ unsigned offset = offset_in_page(base);
+ unsigned len = min_t(size_t, PAGE_SIZE - offset, size);
- base += bv->bv_len;
- }
-
- size -= bv->bv_len;
+ BUG_ON(!bio_add_page(bio, page, len, offset));
+ size -= len;
+ base += len;
}
}
-int bch2_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
+int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask)
{
- struct bio_vec *bv;
- int i;
+ while (size) {
+ struct page *page = alloc_page(gfp_mask);
+ unsigned len = min(PAGE_SIZE, size);
- bio_for_each_segment_all(bv, bio, i) {
- bv->bv_page = alloc_page(gfp_mask);
- if (!bv->bv_page) {
- while (--bv >= bio->bi_io_vec)
- __free_page(bv->bv_page);
+ if (!page)
return -ENOMEM;
- }
+
+ BUG_ON(!bio_add_page(bio, page, len, 0));
+ size -= len;
}
return 0;
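With the reworked helpers above, callers now pass the size explicitly instead of pre-setting bio->bi_iter.bi_size (see the journal_io.c and super-io.c hunks). A minimal caller sketch, using a hypothetical function that is not part of this patch:

/* map an existing buffer into a bio, or allocate fresh pages if there is none: */
static int bio_fill(struct bio *bio, void *buf, size_t len)
{
	if (buf) {
		bch2_bio_map(bio, buf, len);
		return 0;
	}

	return bch2_bio_alloc_pages(bio, len, GFP_KERNEL);
}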
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index 310e958c6cdf..fa3a991453e9 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -511,8 +511,8 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
return x;
}
-void bch2_bio_map(struct bio *bio, void *base);
-int bch2_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask);
+void bch2_bio_map(struct bio *bio, void *base, size_t);
+int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t);
static inline sector_t bdev_sectors(struct block_device *bdev)
{
@@ -628,35 +628,6 @@ static inline void memmove_u64s(void *dst, const void *src,
__memmove_u64s_up(dst, src, u64s);
}
-static inline struct bio_vec next_contig_bvec(struct bio *bio,
- struct bvec_iter *iter)
-{
- struct bio_vec bv = bio_iter_iovec(bio, *iter);
-
- bio_advance_iter(bio, iter, bv.bv_len);
-#ifndef CONFIG_HIGHMEM
- while (iter->bi_size) {
- struct bio_vec next = bio_iter_iovec(bio, *iter);
-
- if (page_address(bv.bv_page) + bv.bv_offset + bv.bv_len !=
- page_address(next.bv_page) + next.bv_offset)
- break;
-
- bv.bv_len += next.bv_len;
- bio_advance_iter(bio, iter, next.bv_len);
- }
-#endif
- return bv;
-}
-
-#define __bio_for_each_contig_segment(bv, bio, iter, start) \
- for (iter = (start); \
- (iter).bi_size && \
- ((bv = next_contig_bvec((bio), &(iter))), 1);)
-
-#define bio_for_each_contig_segment(bv, bio, iter) \
- __bio_for_each_contig_segment(bv, bio, iter, (bio)->bi_iter)
-
void sort_cmp_size(void *base, size_t num, size_t size,
int (*cmp_func)(const void *, const void *, size_t),
void (*swap_func)(void *, void *, size_t));