summary refs log tree commit diff
diff options
context:
space:
mode:
author Kent Overstreet <kent.overstreet@linux.dev> 2024-12-08 22:31:09 -0500
committer Kent Overstreet <kent.overstreet@linux.dev> 2024-12-29 14:57:48 -0500
commit 1055935ffe151de39c45e33ea13d3370e46c8fbd (patch)
tree daf0f052febd666eb5b0be3675226d14b4dde8e3
parent 634c812a1ed05de8e3d1dc146eed95b942e1e38d (diff)
Update bcachefs sources to 864591728963 bcachefs: Dropped superblock write is no longer a fatal error
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
-rw-r--r-- .bcachefs_revision 2
-rw-r--r-- Makefile.compiler 2
-rw-r--r-- include/linux/blk_types.h 1
-rw-r--r-- include/linux/kmemleak.h 4
-rw-r--r-- include/linux/kobject.h 1
-rw-r--r-- libbcachefs/alloc_background.c 2
-rw-r--r-- libbcachefs/alloc_foreground.c 4
-rw-r--r-- libbcachefs/backpointers.c 142
-rw-r--r-- libbcachefs/bcachefs.h 7
-rw-r--r-- libbcachefs/bcachefs_format.h 51
-rw-r--r-- libbcachefs/btree_gc.c 29
-rw-r--r-- libbcachefs/btree_gc.h 4
-rw-r--r-- libbcachefs/btree_io.c 32
-rw-r--r-- libbcachefs/btree_io.h 6
-rw-r--r-- libbcachefs/btree_iter.c 86
-rw-r--r-- libbcachefs/btree_iter.h 29
-rw-r--r-- libbcachefs/btree_key_cache.c 58
-rw-r--r-- libbcachefs/btree_locking.c 14
-rw-r--r-- libbcachefs/btree_locking.h 44
-rw-r--r-- libbcachefs/btree_node_scan.c 2
-rw-r--r-- libbcachefs/btree_trans_commit.c 89
-rw-r--r-- libbcachefs/btree_types.h 39
-rw-r--r-- libbcachefs/btree_update.c 13
-rw-r--r-- libbcachefs/btree_update.h 1
-rw-r--r-- libbcachefs/btree_update_interior.c 36
-rw-r--r-- libbcachefs/btree_write_buffer.c 10
-rw-r--r-- libbcachefs/buckets.c 57
-rw-r--r-- libbcachefs/buckets.h 9
-rw-r--r-- libbcachefs/clock.c 25
-rw-r--r-- libbcachefs/data_update.c 2
-rw-r--r-- libbcachefs/disk_accounting.c 48
-rw-r--r-- libbcachefs/disk_accounting.h 4
-rw-r--r-- libbcachefs/ec.c 25
-rw-r--r-- libbcachefs/errcode.h 5
-rw-r--r-- libbcachefs/fs-common.c 28
-rw-r--r-- libbcachefs/fs-io-buffered.c 9
-rw-r--r-- libbcachefs/fsck.c 93
-rw-r--r-- libbcachefs/inode.c 31
-rw-r--r-- libbcachefs/inode_format.h 6
-rw-r--r-- libbcachefs/io_write.c 6
-rw-r--r-- libbcachefs/journal_io.c 4
-rw-r--r-- libbcachefs/move.c 4
-rw-r--r-- libbcachefs/opts.h 7
-rw-r--r-- libbcachefs/recovery.c 33
-rw-r--r-- libbcachefs/recovery_passes_types.h 2
-rw-r--r-- libbcachefs/sb-counters_format.h 165
-rw-r--r-- libbcachefs/sb-downgrade.c 1
-rw-r--r-- libbcachefs/six.c 17
-rw-r--r-- libbcachefs/six.h 1
-rw-r--r-- libbcachefs/snapshot.c 455
-rw-r--r-- libbcachefs/snapshot.h 10
-rw-r--r-- libbcachefs/str_hash.c 127
-rw-r--r-- libbcachefs/str_hash.h 25
-rw-r--r-- libbcachefs/subvolume.c 50
-rw-r--r-- libbcachefs/subvolume_types.h 2
-rw-r--r-- libbcachefs/super-io.c 11
-rw-r--r-- libbcachefs/super.c 8
-rw-r--r-- libbcachefs/sysfs.c 14
-rw-r--r-- libbcachefs/trace.h 43
-rw-r--r-- libbcachefs/util.h 10
-rw-r--r-- libbcachefs/varint.c 5
61 files changed, 1171 insertions, 879 deletions
diff --git a/.bcachefs_revision b/.bcachefs_revision
index 64016eff..5d86ae9c 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-55a65a994ed5fba038fda00f78416faf6f308bb8
+864591728963d416c49e502bfee56a283eda31a5
diff --git a/Makefile.compiler b/Makefile.compiler
index e0842496..8c102968 100644
--- a/Makefile.compiler
+++ b/Makefile.compiler
@@ -13,7 +13,7 @@ cc-cross-prefix = $(firstword $(foreach c, $(1), \
$(if $(shell command -v -- $(c)gcc 2>/dev/null), $(c))))
# output directory for tests below
-TMPOUT = $(if $(KBUILD_EXTMOD),$(firstword $(KBUILD_EXTMOD))/).tmp_$$$$
+TMPOUT = .tmp_$$$$
# try-run
# Usage: option = $(call try-run, $(CC)...-o "$$TMP",option-ok,otherwise)
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 3cbf8c9e..2384a5e3 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -6,6 +6,7 @@
#define __LINUX_BLK_TYPES_H
#include <linux/atomic.h>
+#include <linux/backing-dev.h>
#include <linux/types.h>
#include <linux/bvec.h>
#include <linux/kobject.h>
diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h
index 6a3cd1bf..93a73c07 100644
--- a/include/linux/kmemleak.h
+++ b/include/linux/kmemleak.h
@@ -26,6 +26,7 @@ extern void kmemleak_free_part(const void *ptr, size_t size) __ref;
extern void kmemleak_free_percpu(const void __percpu *ptr) __ref;
extern void kmemleak_update_trace(const void *ptr) __ref;
extern void kmemleak_not_leak(const void *ptr) __ref;
+extern void kmemleak_transient_leak(const void *ptr) __ref;
extern void kmemleak_ignore(const void *ptr) __ref;
extern void kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) __ref;
extern void kmemleak_no_scan(const void *ptr) __ref;
@@ -93,6 +94,9 @@ static inline void kmemleak_update_trace(const void *ptr)
static inline void kmemleak_not_leak(const void *ptr)
{
}
+static inline void kmemleak_transient_leak(const void *ptr)
+{
+}
static inline void kmemleak_ignore(const void *ptr)
{
}
diff --git a/include/linux/kobject.h b/include/linux/kobject.h
index c33b2126..24096a62 100644
--- a/include/linux/kobject.h
+++ b/include/linux/kobject.h
@@ -20,6 +20,7 @@
#include <linux/bug.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
+#include <linux/slab.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/workqueue.h>
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index 7641a3b4..94e7bc88 100644
--- a/libbcachefs/alloc_background.c
+++ b/libbcachefs/alloc_background.c
@@ -933,8 +933,6 @@ int bch2_trigger_alloc(struct btree_trans *trans,
if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
u64 transaction_seq = trans->journal_res.seq;
BUG_ON(!transaction_seq);
- BUG_ON(transaction_seq < new_a->journal_seq_nonempty);
- BUG_ON(transaction_seq < new_a->journal_seq_empty);
if (log_fsck_err_on(transaction_seq && new_a->journal_seq_nonempty > transaction_seq,
trans, alloc_key_journal_seq_in_future,
diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c
index 57d5f14c..6df41c33 100644
--- a/libbcachefs/alloc_foreground.c
+++ b/libbcachefs/alloc_foreground.c
@@ -107,14 +107,10 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
return;
}
- percpu_down_read(&c->mark_lock);
spin_lock(&ob->lock);
-
ob->valid = false;
ob->data_type = 0;
-
spin_unlock(&ob->lock);
- percpu_up_read(&c->mark_lock);
spin_lock(&c->freelist_lock);
bch2_open_bucket_hash_remove(c, ob);
diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c
index 72311dc7..ebeb6a5f 100644
--- a/libbcachefs/backpointers.c
+++ b/libbcachefs/backpointers.c
@@ -178,7 +178,7 @@ static int bch2_backpointer_del(struct btree_trans *trans, struct bpos pos)
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}
-static int bch2_backpointers_maybe_flush(struct btree_trans *trans,
+static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans,
struct bkey_s_c visiting_k,
struct bkey_buf *last_flushed)
{
@@ -201,17 +201,30 @@ static int backpointer_target_not_found(struct btree_trans *trans,
* looking at may have already been deleted - failure to find what it
* pointed to is not an error:
*/
- ret = bch2_backpointers_maybe_flush(trans, bp.s_c, last_flushed);
+ ret = last_flushed
+ ? bch2_backpointers_maybe_flush(trans, bp.s_c, last_flushed)
+ : 0;
if (ret)
return ret;
prt_printf(&buf, "backpointer doesn't match %s it points to:\n ",
bp.v->level ? "btree node" : "extent");
bch2_bkey_val_to_text(&buf, c, bp.s_c);
- prt_printf(&buf, "\n ");
+ prt_printf(&buf, "\n ");
bch2_bkey_val_to_text(&buf, c, target_k);
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(target_k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ bkey_for_each_ptr_decode(target_k.k, ptrs, p, entry)
+ if (p.ptr.dev == bp.k->p.inode) {
+ prt_printf(&buf, "\n ");
+ struct bkey_i_backpointer bp2;
+ bch2_extent_ptr_to_bp(c, bp.v->btree_id, bp.v->level, target_k, p, entry, &bp2);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp2.k_i));
+ }
+
if (fsck_err(trans, backpointer_to_missing_ptr,
"%s", buf.buf))
ret = bch2_backpointer_del(trans, bp.k->p);
@@ -491,7 +504,7 @@ check_existing_bp:
struct bkey_s_c_backpointer other_bp = bkey_s_c_to_backpointer(bp_k);
struct bkey_s_c other_extent =
- bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, &s->last_flushed);
+ bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL);
ret = bkey_err(other_extent);
if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
ret = 0;
@@ -553,11 +566,11 @@ check_existing_bp:
goto err;
missing:
printbuf_reset(&buf);
- prt_str(&buf, "missing backpointer ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp->k_i));
- prt_newline(&buf);
+ prt_str(&buf, "missing backpointer\n for: ");
bch2_bkey_val_to_text(&buf, c, orig_k);
- prt_printf(&buf, "\n got: ");
+ prt_printf(&buf, "\n want: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp->k_i));
+ prt_printf(&buf, "\n got: ");
bch2_bkey_val_to_text(&buf, c, bp_k);
if (fsck_err(trans, ptr_to_missing_backpointer, "%s", buf.buf))
@@ -586,12 +599,16 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
rcu_read_lock();
struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev);
bool check = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_mismatches);
+ bool empty = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_empty);
rcu_read_unlock();
- if (check) {
+ if (check || empty) {
struct bkey_i_backpointer bp;
bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp);
- int ret = check_bp_exists(trans, s, &bp, k);
+
+ int ret = check
+ ? check_bp_exists(trans, s, &bp, k)
+ : bch2_bucket_backpointer_mod(trans, k, &bp, true);
if (ret)
return ret;
}
@@ -825,12 +842,15 @@ static enum alloc_sector_counter data_type_to_alloc_counter(enum bch_data_type t
}
}
-static int check_bucket_backpointer_mismatch_one(struct btree_trans *trans, struct bkey_s_c alloc_k,
- struct bkey_buf *last_flushed)
+static int check_bucket_backpointers_to_extents(struct btree_trans *, struct bch_dev *, struct bpos);
+
+static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct bkey_s_c alloc_k,
+ struct bkey_buf *last_flushed)
{
- int ret = 0;
+ struct bch_fs *c = trans->c;
struct bch_alloc_v4 a_convert;
const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert);
+ bool need_commit = false;
if (a->data_type == BCH_DATA_sb ||
a->data_type == BCH_DATA_journal ||
@@ -846,6 +866,7 @@ static int check_bucket_backpointer_mismatch_one(struct btree_trans *trans, stru
struct btree_iter iter;
struct bkey_s_c bp_k;
+ int ret = 0;
for_each_btree_key_max_norestart(trans, iter, BTREE_ID_backpointers,
bucket_pos_to_bp_start(ca, alloc_k.k->p),
bucket_pos_to_bp_end(ca, alloc_k.k->p), 0, bp_k, ret) {
@@ -854,6 +875,17 @@ static int check_bucket_backpointer_mismatch_one(struct btree_trans *trans, stru
struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k);
+ if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen &&
+ (bp.v->bucket_gen != a->gen ||
+ bp.v->pad)) {
+ ret = bch2_backpointer_del(trans, bp_k.k->p);
+ if (ret)
+ break;
+
+ need_commit = true;
+ continue;
+ }
+
if (bp.v->bucket_gen != a->gen)
continue;
@@ -863,30 +895,40 @@ static int check_bucket_backpointer_mismatch_one(struct btree_trans *trans, stru
if (ret)
goto err;
+ if (need_commit) {
+ ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+ if (ret)
+ goto err;
+ }
+
/* Cached pointers don't have backpointers: */
- if (sectors[ALLOC_dirty] != a->dirty_sectors ||
+ if (sectors[ALLOC_dirty] != a->dirty_sectors ||
sectors[ALLOC_stripe] != a->stripe_sectors) {
- ret = bch2_backpointers_maybe_flush(trans, alloc_k, last_flushed);
- if (ret)
+ if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen) {
+ ret = bch2_backpointers_maybe_flush(trans, alloc_k, last_flushed);
+ if (ret)
+ goto err;
+ }
+
+ if (sectors[ALLOC_dirty] > a->dirty_sectors ||
+ sectors[ALLOC_stripe] > a->stripe_sectors) {
+ ret = check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p) ?:
+ -BCH_ERR_transaction_restart_nested;
goto err;
+ }
- __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_mismatches);
+ if (!sectors[ALLOC_dirty] &&
+ !sectors[ALLOC_stripe])
+ __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_empty);
+ else
+ __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_mismatches);
}
err:
bch2_dev_put(ca);
return ret;
}
-static int check_bucket_backpointer_mismatches(struct btree_trans *trans,
- struct bkey_buf *last_flushed)
-{
- return for_each_btree_key(trans, iter, BTREE_ID_alloc,
- POS_MIN, BTREE_ITER_prefetch, k, ({
- check_bucket_backpointer_mismatch_one(trans, k, last_flushed);
- }));
-}
-
static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k)
{
switch (k.k->type) {
@@ -896,6 +938,9 @@ static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k)
rcu_read_lock();
struct bpos pos = bkey_s_c_to_btree_ptr_v2(k).v->min_key;
while (pos.inode <= k.k->p.inode) {
+ if (pos.inode >= c->sb.nr_devices)
+ break;
+
struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode);
if (!ca)
goto next;
@@ -941,7 +986,7 @@ err:
}
static int bch2_pin_backpointer_nodes_with_missing(struct btree_trans *trans,
- struct bpos start, struct bpos *end)
+ struct bpos start, struct bpos *end)
{
struct bch_fs *c = trans->c;
int ret = 0;
@@ -1022,7 +1067,11 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
ca->bucket_backpointer_mismatches = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets),
sizeof(unsigned long),
GFP_KERNEL);
- if (!ca->bucket_backpointer_mismatches) {
+ ca->bucket_backpointer_empty = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets),
+ sizeof(unsigned long),
+ GFP_KERNEL);
+ if (!ca->bucket_backpointer_mismatches ||
+ !ca->bucket_backpointer_empty) {
bch2_dev_put(ca);
ret = -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap;
goto err_free_bitmaps;
@@ -1035,21 +1084,25 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
bch2_bkey_buf_init(&s.last_flushed);
bkey_init(&s.last_flushed.k->k);
- ret = check_bucket_backpointer_mismatches(trans, &s.last_flushed);
+ ret = for_each_btree_key(trans, iter, BTREE_ID_alloc,
+ POS_MIN, BTREE_ITER_prefetch, k, ({
+ check_bucket_backpointer_mismatch(trans, k, &s.last_flushed);
+ }));
if (ret)
goto err;
- u64 nr_buckets = 0, nr_mismatches = 0;
+ u64 nr_buckets = 0, nr_mismatches = 0, nr_empty = 0;
for_each_member_device(c, ca) {
- nr_buckets += ca->mi.nbuckets;
- nr_mismatches += bitmap_weight(ca->bucket_backpointer_mismatches, ca->mi.nbuckets);
+ nr_buckets += ca->mi.nbuckets;
+ nr_mismatches += bitmap_weight(ca->bucket_backpointer_mismatches, ca->mi.nbuckets);
+ nr_empty += bitmap_weight(ca->bucket_backpointer_empty, ca->mi.nbuckets);
}
- if (!nr_mismatches)
+ if (!nr_mismatches && !nr_empty)
goto err;
bch_info(c, "scanning for missing backpointers in %llu/%llu buckets",
- nr_mismatches, nr_buckets);
+ nr_mismatches + nr_empty, nr_buckets);
while (1) {
ret = bch2_pin_backpointer_nodes_with_missing(trans, s.bp_start, &s.bp_end);
@@ -1086,6 +1139,8 @@ err:
bch2_btree_cache_unpin(c);
err_free_bitmaps:
for_each_member_device(c, ca) {
+ kvfree(ca->bucket_backpointer_empty);
+ ca->bucket_backpointer_empty = NULL;
kvfree(ca->bucket_backpointer_mismatches);
ca->bucket_backpointer_mismatches = NULL;
}
@@ -1122,6 +1177,25 @@ static int check_one_backpointer(struct btree_trans *trans,
return ret;
}
+static int check_bucket_backpointers_to_extents(struct btree_trans *trans,
+ struct bch_dev *ca, struct bpos bucket)
+{
+ u32 restart_count = trans->restart_count;
+ struct bkey_buf last_flushed;
+ bch2_bkey_buf_init(&last_flushed);
+ bkey_init(&last_flushed.k->k);
+
+ int ret = for_each_btree_key_max(trans, iter, BTREE_ID_backpointers,
+ bucket_pos_to_bp_start(ca, bucket),
+ bucket_pos_to_bp_end(ca, bucket),
+ 0, k,
+ check_one_backpointer(trans, BBPOS_MIN, BBPOS_MAX, k, &last_flushed)
+ );
+
+ bch2_bkey_buf_exit(&last_flushed, trans->c);
+ return ret ?: trans_was_restarted(trans, restart_count);
+}
+
static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
struct bbpos start,
struct bbpos end)
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index d27550ef..161cf2f0 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -547,17 +547,16 @@ struct bch_dev {
/*
* Buckets:
- * Per-bucket arrays are protected by c->mark_lock, bucket_lock and
- * gc_gens_lock, for device resize - holding any is sufficient for
- * access: Or rcu_read_lock(), but only for dev_ptr_stale():
+ * Per-bucket arrays are protected by either rcu_read_lock or
+ * state_lock, for device resize.
*/
GENRADIX(struct bucket) buckets_gc;
struct bucket_gens __rcu *bucket_gens;
u8 *oldest_gen;
unsigned long *buckets_nouse;
- struct rw_semaphore bucket_lock;
unsigned long *bucket_backpointer_mismatches;
+ unsigned long *bucket_backpointer_empty;
struct bch_dev_usage __percpu *usage;
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 09e53bef..06809305 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -684,7 +684,8 @@ struct bch_sb_field_ext {
x(disk_accounting_big_endian, BCH_VERSION(1, 15)) \
x(reflink_p_may_update_opts, BCH_VERSION(1, 16)) \
x(inode_depth, BCH_VERSION(1, 17)) \
- x(persistent_inode_cursors, BCH_VERSION(1, 18))
+ x(persistent_inode_cursors, BCH_VERSION(1, 18)) \
+ x(autofix_errors, BCH_VERSION(1, 19))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
@@ -1287,14 +1288,18 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6);
/* Btree: */
enum btree_id_flags {
- BTREE_ID_EXTENTS = BIT(0),
- BTREE_ID_SNAPSHOTS = BIT(1),
- BTREE_ID_SNAPSHOT_FIELD = BIT(2),
- BTREE_ID_DATA = BIT(3),
+ BTREE_IS_extents = BIT(0),
+ BTREE_IS_snapshots = BIT(1),
+ BTREE_IS_snapshot_field = BIT(2),
+ BTREE_IS_data = BIT(3),
+ BTREE_IS_write_buffer = BIT(4),
};
#define BCH_BTREE_IDS() \
- x(extents, 0, BTREE_ID_EXTENTS|BTREE_ID_SNAPSHOTS|BTREE_ID_DATA,\
+ x(extents, 0, \
+ BTREE_IS_extents| \
+ BTREE_IS_snapshots| \
+ BTREE_IS_data, \
BIT_ULL(KEY_TYPE_whiteout)| \
BIT_ULL(KEY_TYPE_error)| \
BIT_ULL(KEY_TYPE_cookie)| \
@@ -1302,17 +1307,20 @@ enum btree_id_flags {
BIT_ULL(KEY_TYPE_reservation)| \
BIT_ULL(KEY_TYPE_reflink_p)| \
BIT_ULL(KEY_TYPE_inline_data)) \
- x(inodes, 1, BTREE_ID_SNAPSHOTS, \
+ x(inodes, 1, \
+ BTREE_IS_snapshots, \
BIT_ULL(KEY_TYPE_whiteout)| \
BIT_ULL(KEY_TYPE_inode)| \
BIT_ULL(KEY_TYPE_inode_v2)| \
BIT_ULL(KEY_TYPE_inode_v3)| \
BIT_ULL(KEY_TYPE_inode_generation)) \
- x(dirents, 2, BTREE_ID_SNAPSHOTS, \
+ x(dirents, 2, \
+ BTREE_IS_snapshots, \
BIT_ULL(KEY_TYPE_whiteout)| \
BIT_ULL(KEY_TYPE_hash_whiteout)| \
BIT_ULL(KEY_TYPE_dirent)) \
- x(xattrs, 3, BTREE_ID_SNAPSHOTS, \
+ x(xattrs, 3, \
+ BTREE_IS_snapshots, \
BIT_ULL(KEY_TYPE_whiteout)| \
BIT_ULL(KEY_TYPE_cookie)| \
BIT_ULL(KEY_TYPE_hash_whiteout)| \
@@ -1326,7 +1334,9 @@ enum btree_id_flags {
BIT_ULL(KEY_TYPE_quota)) \
x(stripes, 6, 0, \
BIT_ULL(KEY_TYPE_stripe)) \
- x(reflink, 7, BTREE_ID_EXTENTS|BTREE_ID_DATA, \
+ x(reflink, 7, \
+ BTREE_IS_extents| \
+ BTREE_IS_data, \
BIT_ULL(KEY_TYPE_reflink_v)| \
BIT_ULL(KEY_TYPE_indirect_inline_data)| \
BIT_ULL(KEY_TYPE_error)) \
@@ -1334,29 +1344,38 @@ enum btree_id_flags {
BIT_ULL(KEY_TYPE_subvolume)) \
x(snapshots, 9, 0, \
BIT_ULL(KEY_TYPE_snapshot)) \
- x(lru, 10, 0, \
+ x(lru, 10, \
+ BTREE_IS_write_buffer, \
BIT_ULL(KEY_TYPE_set)) \
- x(freespace, 11, BTREE_ID_EXTENTS, \
+ x(freespace, 11, \
+ BTREE_IS_extents, \
BIT_ULL(KEY_TYPE_set)) \
x(need_discard, 12, 0, \
BIT_ULL(KEY_TYPE_set)) \
- x(backpointers, 13, 0, \
+ x(backpointers, 13, \
+ BTREE_IS_write_buffer, \
BIT_ULL(KEY_TYPE_backpointer)) \
x(bucket_gens, 14, 0, \
BIT_ULL(KEY_TYPE_bucket_gens)) \
x(snapshot_trees, 15, 0, \
BIT_ULL(KEY_TYPE_snapshot_tree)) \
- x(deleted_inodes, 16, BTREE_ID_SNAPSHOT_FIELD, \
+ x(deleted_inodes, 16, \
+ BTREE_IS_snapshot_field| \
+ BTREE_IS_write_buffer, \
BIT_ULL(KEY_TYPE_set)) \
x(logged_ops, 17, 0, \
BIT_ULL(KEY_TYPE_logged_op_truncate)| \
BIT_ULL(KEY_TYPE_logged_op_finsert)| \
BIT_ULL(KEY_TYPE_inode_alloc_cursor)) \
- x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \
+ x(rebalance_work, 18, \
+ BTREE_IS_snapshot_field| \
+ BTREE_IS_write_buffer, \
BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) \
x(subvolume_children, 19, 0, \
BIT_ULL(KEY_TYPE_set)) \
- x(accounting, 20, BTREE_ID_SNAPSHOT_FIELD, \
+ x(accounting, 20, \
+ BTREE_IS_snapshot_field| \
+ BTREE_IS_write_buffer, \
BIT_ULL(KEY_TYPE_accounting)) \
enum btree_id {
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 24f2f3bd..dd1d9b74 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -733,16 +733,8 @@ static int bch2_gc_btrees(struct bch_fs *c)
continue;
ret = bch2_gc_btree(trans, btree, true);
-
- if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO),
- trans, btree_node_read_error,
- "btree node read error for %s",
- (printbuf_reset(&buf),
- bch2_btree_id_to_text(&buf, btree),
- buf.buf)))
- ret = bch2_btree_lost_data(c, btree);
}
-fsck_err:
+
printbuf_exit(&buf);
bch2_trans_put(trans);
bch_err_fn(c, ret);
@@ -811,7 +803,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
old = bch2_alloc_to_v4(k, &old_convert);
gc = new = *old;
- percpu_down_read(&c->mark_lock);
__bucket_m_to_alloc(&gc, *gc_bucket(ca, iter->pos.offset));
old_gc = gc;
@@ -822,7 +813,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
gc.data_type = old->data_type;
gc.dirty_sectors = old->dirty_sectors;
}
- percpu_up_read(&c->mark_lock);
/*
* gc.data_type doesn't yet include need_discard & need_gc_gen states -
@@ -840,11 +830,9 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
* safe w.r.t. transaction restarts, so fixup the gc_bucket so
* we don't run it twice:
*/
- percpu_down_read(&c->mark_lock);
struct bucket *gc_m = gc_bucket(ca, iter->pos.offset);
gc_m->data_type = gc.data_type;
gc_m->dirty_sectors = gc.dirty_sectors;
- percpu_up_read(&c->mark_lock);
}
if (fsck_err_on(new.data_type != gc.data_type,
@@ -1088,7 +1076,6 @@ static int gc_btree_gens_key(struct btree_trans *trans,
if (unlikely(test_bit(BCH_FS_going_ro, &c->flags)))
return -EROFS;
- percpu_down_read(&c->mark_lock);
rcu_read_lock();
bkey_for_each_ptr(ptrs, ptr) {
struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
@@ -1097,7 +1084,6 @@ static int gc_btree_gens_key(struct btree_trans *trans,
if (dev_ptr_stale(ca, ptr) > 16) {
rcu_read_unlock();
- percpu_up_read(&c->mark_lock);
goto update;
}
}
@@ -1112,7 +1098,6 @@ static int gc_btree_gens_key(struct btree_trans *trans,
*gen = ptr->gen;
}
rcu_read_unlock();
- percpu_up_read(&c->mark_lock);
return 0;
update:
u = bch2_bkey_make_mut(trans, iter, &k, 0);
@@ -1141,7 +1126,6 @@ static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct bch_dev
return ret;
a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset];
- alloc_data_type_set(&a_mut->v, a_mut->v.data_type);
return bch2_trans_update(trans, iter, &a_mut->k_i, 0);
}
@@ -1254,9 +1238,16 @@ void bch2_gc_gens_async(struct bch_fs *c)
bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens);
}
-void bch2_fs_gc_init(struct bch_fs *c)
+void bch2_fs_btree_gc_exit(struct bch_fs *c)
{
- seqcount_init(&c->gc_pos_lock);
+}
+int bch2_fs_btree_gc_init(struct bch_fs *c)
+{
+ seqcount_init(&c->gc_pos_lock);
INIT_WORK(&c->gc_gens_work, bch2_gc_gens_work);
+
+ init_rwsem(&c->gc_lock);
+ mutex_init(&c->gc_gens_lock);
+ return 0;
}
diff --git a/libbcachefs/btree_gc.h b/libbcachefs/btree_gc.h
index 8a47e8bd..9693a90a 100644
--- a/libbcachefs/btree_gc.h
+++ b/libbcachefs/btree_gc.h
@@ -82,6 +82,8 @@ void bch2_gc_pos_to_text(struct printbuf *, struct gc_pos *);
int bch2_gc_gens(struct bch_fs *);
void bch2_gc_gens_async(struct bch_fs *);
-void bch2_fs_gc_init(struct bch_fs *);
+
+void bch2_fs_btree_gc_exit(struct bch_fs *);
+int bch2_fs_btree_gc_init(struct bch_fs *);
#endif /* _BCACHEFS_BTREE_GC_H */
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index d99f8a78..e371e60e 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -489,8 +489,8 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
if (b->nsets == MAX_BSETS &&
!btree_node_write_in_flight(b) &&
should_compact_all(c, b)) {
- bch2_btree_node_write(c, b, SIX_LOCK_write,
- BTREE_WRITE_init_next_bset);
+ bch2_btree_node_write_trans(trans, b, SIX_LOCK_write,
+ BTREE_WRITE_init_next_bset);
reinit_iter = true;
}
@@ -2345,6 +2345,34 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
}
}
+void bch2_btree_node_write_trans(struct btree_trans *trans, struct btree *b,
+ enum six_lock_type lock_type_held,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+
+ if (lock_type_held == SIX_LOCK_intent ||
+ (lock_type_held == SIX_LOCK_read &&
+ six_lock_tryupgrade(&b->c.lock))) {
+ __bch2_btree_node_write(c, b, flags);
+
+ /* don't cycle lock unnecessarily: */
+ if (btree_node_just_written(b) &&
+ six_trylock_write(&b->c.lock)) {
+ bch2_btree_post_write_cleanup(c, b);
+ __bch2_btree_node_unlock_write(trans, b);
+ }
+
+ if (lock_type_held == SIX_LOCK_read)
+ six_lock_downgrade(&b->c.lock);
+ } else {
+ __bch2_btree_node_write(c, b, flags);
+ if (lock_type_held == SIX_LOCK_write &&
+ btree_node_just_written(b))
+ bch2_btree_post_write_cleanup(c, b);
+ }
+}
+
static bool __bch2_btree_flush_all(struct bch_fs *c, unsigned flag)
{
struct bucket_table *tbl;
diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h
index 9b01ca3d..6f9e4a6d 100644
--- a/libbcachefs/btree_io.h
+++ b/libbcachefs/btree_io.h
@@ -144,11 +144,13 @@ enum btree_write_flags {
void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned);
void bch2_btree_node_write(struct bch_fs *, struct btree *,
enum six_lock_type, unsigned);
+void bch2_btree_node_write_trans(struct btree_trans *, struct btree *,
+ enum six_lock_type, unsigned);
-static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b,
+static inline void btree_node_write_if_need(struct btree_trans *trans, struct btree *b,
enum six_lock_type lock_held)
{
- bch2_btree_node_write(c, b, lock_held, BTREE_WRITE_ONLY_IF_NEED);
+ bch2_btree_node_write_trans(trans, b, lock_held, BTREE_WRITE_ONLY_IF_NEED);
}
bool bch2_btree_flush_all_reads(struct bch_fs *);
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index 9c54891c..c291d495 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -699,6 +699,19 @@ void bch2_trans_node_add(struct btree_trans *trans,
bch2_trans_revalidate_updates_in_node(trans, b);
}
+void bch2_trans_node_drop(struct btree_trans *trans,
+ struct btree *b)
+{
+ struct btree_path *path;
+ unsigned i, level = b->c.level;
+
+ trans_for_each_path(trans, path, i)
+ if (path->l[level].b == b) {
+ btree_node_unlock(trans, path, level);
+ path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
+ }
+}
+
/*
* A btree node has been modified in such a way as to invalidate iterators - fix
* them:
@@ -1854,7 +1867,7 @@ struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *
!bkey_eq(path->pos, ck->key.pos));
*u = ck->k->k;
- k = bkey_i_to_s_c(ck->k);
+ k = (struct bkey_s_c) { u, &ck->k->v };
}
return k;
@@ -2144,21 +2157,18 @@ struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans,
}
static noinline
-struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
+void btree_trans_peek_journal(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c *k)
{
struct btree_path *path = btree_iter_path(trans, iter);
struct bkey_i *next_journal =
bch2_btree_journal_peek(trans, iter,
- k.k ? k.k->p : path_l(path)->b->key.k.p);
-
+ k->k ? k->k->p : path_l(path)->b->key.k.p);
if (next_journal) {
iter->k = next_journal->k;
- k = bkey_i_to_s_c(next_journal);
+ *k = bkey_i_to_s_c(next_journal);
}
-
- return k;
}
static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans,
@@ -2175,21 +2185,19 @@ static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans,
}
static noinline
-struct bkey_s_c btree_trans_peek_prev_journal(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
+void btree_trans_peek_prev_journal(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c *k)
{
struct btree_path *path = btree_iter_path(trans, iter);
struct bkey_i *next_journal =
bch2_btree_journal_peek_prev(trans, iter,
- k.k ? k.k->p : path_l(path)->b->key.k.p);
+ k->k ? k->k->p : path_l(path)->b->key.k.p);
if (next_journal) {
iter->k = next_journal->k;
- k = bkey_i_to_s_c(next_journal);
+ *k = bkey_i_to_s_c(next_journal);
}
-
- return k;
}
/*
@@ -2234,10 +2242,15 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos
btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path);
k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u);
- if (k.k && !bkey_err(k)) {
- iter->k = u;
- k.k = &iter->k;
- }
+ if (!k.k)
+ return k;
+
+ if ((iter->flags & BTREE_ITER_all_snapshots) &&
+ !bpos_eq(pos, k.k->p))
+ return bkey_s_c_null;
+
+ iter->k = u;
+ k.k = &iter->k;
return k;
}
@@ -2260,7 +2273,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
/* ensure that iter->k is consistent with iter->pos: */
bch2_btree_iter_set_pos(iter, iter->pos);
k = bkey_s_c_err(ret);
- goto out;
+ break;
}
struct btree_path *path = btree_iter_path(trans, iter);
@@ -2270,7 +2283,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
/* No btree nodes at requested level: */
bch2_btree_iter_set_pos(iter, SPOS_MAX);
k = bkey_s_c_null;
- goto out;
+ break;
}
btree_path_set_should_be_locked(trans, path);
@@ -2281,15 +2294,14 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
k.k &&
(k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
k = k2;
- ret = bkey_err(k);
- if (ret) {
+ if (bkey_err(k)) {
bch2_btree_iter_set_pos(iter, iter->pos);
- goto out;
+ break;
}
}
if (unlikely(iter->flags & BTREE_ITER_with_journal))
- k = btree_trans_peek_journal(trans, iter, k);
+ btree_trans_peek_journal(trans, iter, &k);
if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
trans->nr_updates))
@@ -2318,12 +2330,11 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
/* End of btree: */
bch2_btree_iter_set_pos(iter, SPOS_MAX);
k = bkey_s_c_null;
- goto out;
+ break;
}
}
-out:
- bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify(iter);
return k;
}
@@ -2424,7 +2435,8 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en
continue;
}
- if (bkey_whiteout(k.k)) {
+ if (bkey_whiteout(k.k) &&
+ !(iter->flags & BTREE_ITER_key_cache_fill)) {
search_key = bkey_successor(iter, k.k->p);
continue;
}
@@ -2547,7 +2559,7 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, stru
}
if (unlikely(iter->flags & BTREE_ITER_with_journal))
- k = btree_trans_peek_prev_journal(trans, iter, k);
+ btree_trans_peek_prev_journal(trans, iter, &k);
if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
trans->nr_updates))
@@ -2784,6 +2796,11 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
k = bch2_btree_path_peek_slot(trans->paths + iter->path, &iter->k);
if (unlikely(!k.k))
goto out_no_locked;
+
+ if (unlikely(k.k->type == KEY_TYPE_whiteout &&
+ (iter->flags & BTREE_ITER_filter_snapshots) &&
+ !(iter->flags & BTREE_ITER_key_cache_fill)))
+ iter->k.type = KEY_TYPE_deleted;
} else {
struct bpos next;
struct bpos end = iter->pos;
@@ -3028,7 +3045,7 @@ void bch2_trans_iter_init_outlined(struct btree_trans *trans,
unsigned flags)
{
bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
- bch2_btree_iter_flags(trans, btree_id, flags),
+ bch2_btree_iter_flags(trans, btree_id, 0, flags),
_RET_IP_);
}
@@ -3044,8 +3061,11 @@ void bch2_trans_node_iter_init(struct btree_trans *trans,
flags |= BTREE_ITER_snapshot_field;
flags |= BTREE_ITER_all_snapshots;
+ if (!depth && btree_id_cached(trans->c, btree_id))
+ flags |= BTREE_ITER_with_key_cache;
+
bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth,
- __bch2_btree_iter_flags(trans, btree_id, flags),
+ bch2_btree_iter_flags(trans, btree_id, depth, flags),
_RET_IP_);
iter->min_depth = depth;
diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h
index 3477fc8c..b9538e6e 100644
--- a/libbcachefs/btree_iter.h
+++ b/libbcachefs/btree_iter.h
@@ -372,6 +372,7 @@ static inline void bch2_btree_path_downgrade(struct btree_trans *trans,
void bch2_trans_downgrade(struct btree_trans *);
void bch2_trans_node_add(struct btree_trans *trans, struct btree_path *, struct btree *);
+void bch2_trans_node_drop(struct btree_trans *trans, struct btree *);
void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *);
int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter);
@@ -446,10 +447,17 @@ static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 sna
void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *);
-static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans,
- unsigned btree_id,
- unsigned flags)
+static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans,
+ unsigned btree_id,
+ unsigned level,
+ unsigned flags)
{
+ if (level || !btree_id_cached(trans->c, btree_id)) {
+ flags &= ~BTREE_ITER_cached;
+ flags &= ~BTREE_ITER_with_key_cache;
+ } else if (!(flags & BTREE_ITER_cached))
+ flags |= BTREE_ITER_with_key_cache;
+
if (!(flags & (BTREE_ITER_all_snapshots|BTREE_ITER_not_extents)) &&
btree_id_is_extents(btree_id))
flags |= BTREE_ITER_is_extents;
@@ -468,19 +476,6 @@ static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans,
return flags;
}
-static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans,
- unsigned btree_id,
- unsigned flags)
-{
- if (!btree_id_cached(trans->c, btree_id)) {
- flags &= ~BTREE_ITER_cached;
- flags &= ~BTREE_ITER_with_key_cache;
- } else if (!(flags & BTREE_ITER_cached))
- flags |= BTREE_ITER_with_key_cache;
-
- return __bch2_btree_iter_flags(trans, btree_id, flags);
-}
-
static inline void bch2_trans_iter_init_common(struct btree_trans *trans,
struct btree_iter *iter,
unsigned btree_id, struct bpos pos,
@@ -517,7 +512,7 @@ static inline void bch2_trans_iter_init(struct btree_trans *trans,
if (__builtin_constant_p(btree_id) &&
__builtin_constant_p(flags))
bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
- bch2_btree_iter_flags(trans, btree_id, flags),
+ bch2_btree_iter_flags(trans, btree_id, 0, flags),
_THIS_IP_);
else
bch2_trans_iter_init_outlined(trans, iter, btree_id, pos, flags);
diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c
index 3bd40ea0..7636a5e9 100644
--- a/libbcachefs/btree_key_cache.c
+++ b/libbcachefs/btree_key_cache.c
@@ -197,7 +197,9 @@ out:
return ck;
}
-static int btree_key_cache_create(struct btree_trans *trans, struct btree_path *path,
+static int btree_key_cache_create(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree_path *ck_path,
struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
@@ -217,7 +219,7 @@ static int btree_key_cache_create(struct btree_trans *trans, struct btree_path *
key_u64s = min(256U, (key_u64s * 3) / 2);
key_u64s = roundup_pow_of_two(key_u64s);
- struct bkey_cached *ck = bkey_cached_alloc(trans, path, key_u64s);
+ struct bkey_cached *ck = bkey_cached_alloc(trans, ck_path, key_u64s);
int ret = PTR_ERR_OR_ZERO(ck);
if (ret)
return ret;
@@ -226,19 +228,19 @@ static int btree_key_cache_create(struct btree_trans *trans, struct btree_path *
ck = bkey_cached_reuse(bc);
if (unlikely(!ck)) {
bch_err(c, "error allocating memory for key cache item, btree %s",
- bch2_btree_id_str(path->btree_id));
+ bch2_btree_id_str(ck_path->btree_id));
return -BCH_ERR_ENOMEM_btree_key_cache_create;
}
}
ck->c.level = 0;
- ck->c.btree_id = path->btree_id;
- ck->key.btree_id = path->btree_id;
- ck->key.pos = path->pos;
+ ck->c.btree_id = ck_path->btree_id;
+ ck->key.btree_id = ck_path->btree_id;
+ ck->key.pos = ck_path->pos;
ck->flags = 1U << BKEY_CACHED_ACCESSED;
if (unlikely(key_u64s > ck->u64s)) {
- mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
+ mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED);
struct bkey_i *new_k = allocate_dropping_locks(trans, ret,
kmalloc(key_u64s * sizeof(u64), _gfp));
@@ -258,22 +260,29 @@ static int btree_key_cache_create(struct btree_trans *trans, struct btree_path *
bkey_reassemble(ck->k, k);
+ ret = bch2_btree_node_lock_write(trans, path, &path_l(path)->b->c);
+ if (unlikely(ret))
+ goto err;
+
ret = rhashtable_lookup_insert_fast(&bc->table, &ck->hash, bch2_btree_key_cache_params);
+
+ bch2_btree_node_unlock_write(trans, path, path_l(path)->b);
+
if (unlikely(ret)) /* raced with another fill? */
goto err;
atomic_long_inc(&bc->nr_keys);
six_unlock_write(&ck->c.lock);
- enum six_lock_type lock_want = __btree_lock_want(path, 0);
+ enum six_lock_type lock_want = __btree_lock_want(ck_path, 0);
if (lock_want == SIX_LOCK_read)
six_lock_downgrade(&ck->c.lock);
- btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want);
- path->uptodate = BTREE_ITER_UPTODATE;
+ btree_path_cached_set(trans, ck_path, ck, (enum btree_node_locked_type) lock_want);
+ ck_path->uptodate = BTREE_ITER_UPTODATE;
return 0;
err:
bkey_cached_free(bc, ck);
- mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
+ mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED);
return ret;
}
@@ -293,6 +302,7 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans,
int ret;
bch2_trans_iter_init(trans, &iter, ck_path->btree_id, ck_path->pos,
+ BTREE_ITER_intent|
BTREE_ITER_key_cache_fill|
BTREE_ITER_cached_nofill);
iter.flags &= ~BTREE_ITER_with_journal;
@@ -306,9 +316,19 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans,
if (unlikely(ret))
goto out;
- ret = btree_key_cache_create(trans, ck_path, k);
+ ret = btree_key_cache_create(trans, btree_iter_path(trans, &iter), ck_path, k);
if (ret)
goto err;
+
+ if (trace_key_cache_fill_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bpos_to_text(&buf, ck_path->pos);
+ prt_char(&buf, ' ');
+ bch2_bkey_val_to_text(&buf, trans->c, k);
+ trace_key_cache_fill(trans, buf.buf);
+ printbuf_exit(&buf);
+ }
out:
/* We're not likely to need this iterator again: */
bch2_set_btree_iter_dontneed(&iter);
@@ -593,8 +613,18 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans,
bkey_cached_free(bc, ck);
mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
- path->should_be_locked = false;
+
+ struct btree_path *path2;
+ unsigned i;
+ trans_for_each_path(trans, path2, i)
+ if (path2->l[0].b == (void *) ck) {
+ __bch2_btree_path_unlock(trans, path2);
+ path2->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_drop);
+ path2->should_be_locked = false;
+ btree_path_set_dirty(path2, BTREE_ITER_NEED_TRAVERSE);
+ }
+
+ bch2_trans_verify_locks(trans);
}
static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
diff --git a/libbcachefs/btree_locking.c b/libbcachefs/btree_locking.c
index d343df9f..85039314 100644
--- a/libbcachefs/btree_locking.c
+++ b/libbcachefs/btree_locking.c
@@ -818,6 +818,17 @@ void bch2_trans_unlock_long(struct btree_trans *trans)
bch2_trans_srcu_unlock(trans);
}
+void bch2_trans_unlock_write(struct btree_trans *trans)
+{
+ struct btree_path *path;
+ unsigned i;
+
+ trans_for_each_path(trans, path, i)
+ for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++)
+ if (btree_node_write_locked(path, l))
+ bch2_btree_node_unlock_write(trans, path, path->l[l].b);
+}
+
int __bch2_trans_mutex_lock(struct btree_trans *trans,
struct mutex *lock)
{
@@ -856,6 +867,9 @@ void bch2_btree_path_verify_locks(struct btree_path *path)
(want == BTREE_NODE_UNLOCKED ||
have != BTREE_NODE_WRITE_LOCKED) &&
want != have);
+
+ BUG_ON(btree_node_locked(path, l) &&
+ path->l[l].lock_seq != six_lock_seq(&path->l[l].b->c.lock));
}
}
diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h
index 7474ab6c..b54ef48e 100644
--- a/libbcachefs/btree_locking.h
+++ b/libbcachefs/btree_locking.h
@@ -16,6 +16,7 @@
void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags);
void bch2_trans_unlock_noassert(struct btree_trans *);
+void bch2_trans_unlock_write(struct btree_trans *);
static inline bool is_btree_node(struct btree_path *path, unsigned l)
{
@@ -75,13 +76,6 @@ static inline void mark_btree_node_locked_noreset(struct btree_path *path,
path->nodes_locked |= (type + 1) << (level << 1);
}
-static inline void mark_btree_node_unlocked(struct btree_path *path,
- unsigned level)
-{
- EBUG_ON(btree_node_write_locked(path, level));
- mark_btree_node_locked_noreset(path, level, BTREE_NODE_UNLOCKED);
-}
-
static inline void mark_btree_node_locked(struct btree_trans *trans,
struct btree_path *path,
unsigned level,
@@ -124,19 +118,25 @@ static void btree_trans_lock_hold_time_update(struct btree_trans *trans,
/* unlock: */
+void bch2_btree_node_unlock_write(struct btree_trans *,
+ struct btree_path *, struct btree *);
+
static inline void btree_node_unlock(struct btree_trans *trans,
struct btree_path *path, unsigned level)
{
int lock_type = btree_node_locked_type(path, level);
EBUG_ON(level >= BTREE_MAX_DEPTH);
- EBUG_ON(lock_type == BTREE_NODE_WRITE_LOCKED);
if (lock_type != BTREE_NODE_UNLOCKED) {
+ if (unlikely(lock_type == BTREE_NODE_WRITE_LOCKED)) {
+ bch2_btree_node_unlock_write(trans, path, path->l[level].b);
+ lock_type = BTREE_NODE_INTENT_LOCKED;
+ }
six_unlock_type(&path->l[level].b->c.lock, lock_type);
btree_trans_lock_hold_time_update(trans, path, level);
+ mark_btree_node_locked_noreset(path, level, BTREE_NODE_UNLOCKED);
}
- mark_btree_node_unlocked(path, level);
}
static inline int btree_path_lowest_level_locked(struct btree_path *path)
@@ -163,27 +163,31 @@ static inline void __bch2_btree_path_unlock(struct btree_trans *trans,
* succeed:
*/
static inline void
+__bch2_btree_node_unlock_write(struct btree_trans *trans, struct btree *b)
+{
+ if (!b->c.lock.write_lock_recurse) {
+ struct btree_path *linked;
+ unsigned i;
+
+ trans_for_each_path_with_node(trans, b, linked, i)
+ linked->l[b->c.level].lock_seq++;
+ }
+
+ six_unlock_write(&b->c.lock);
+}
+
+static inline void
bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path,
struct btree *b)
{
- struct btree_path *linked;
- unsigned i;
-
EBUG_ON(path->l[b->c.level].b != b);
EBUG_ON(path->l[b->c.level].lock_seq != six_lock_seq(&b->c.lock));
EBUG_ON(btree_node_locked_type(path, b->c.level) != SIX_LOCK_write);
mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);
-
- trans_for_each_path_with_node(trans, b, linked, i)
- linked->l[b->c.level].lock_seq++;
-
- six_unlock_write(&b->c.lock);
+ __bch2_btree_node_unlock_write(trans, b);
}
-void bch2_btree_node_unlock_write(struct btree_trans *,
- struct btree_path *, struct btree *);
-
int bch2_six_check_for_deadlock(struct six_lock *lock, void *p);
/* lock: */
diff --git a/libbcachefs/btree_node_scan.c b/libbcachefs/btree_node_scan.c
index a5282ff1..a7f06dee 100644
--- a/libbcachefs/btree_node_scan.c
+++ b/libbcachefs/btree_node_scan.c
@@ -152,7 +152,7 @@ static inline void found_btree_node_swap(void *_l, void *_r, void *arg)
swap(*l, *r);
}
-const struct min_heap_callbacks found_btree_node_heap_cbs = {
+static const struct min_heap_callbacks found_btree_node_heap_cbs = {
.less = found_btree_node_cmp_pos_less,
.swp = found_btree_node_swap,
};
diff --git a/libbcachefs/btree_trans_commit.c b/libbcachefs/btree_trans_commit.c
index 9011cc3f..6b79b672 100644
--- a/libbcachefs/btree_trans_commit.c
+++ b/libbcachefs/btree_trans_commit.c
@@ -133,7 +133,7 @@ static inline int bch2_trans_lock_write(struct btree_trans *trans)
return 0;
}
-static inline void bch2_trans_unlock_write(struct btree_trans *trans)
+static inline void bch2_trans_unlock_updates_write(struct btree_trans *trans)
{
if (likely(trans->write_locked)) {
trans_for_each_update(trans, i)
@@ -249,7 +249,7 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
new |= 1 << BTREE_NODE_need_write;
} while (!try_cmpxchg(&b->flags, &old, new));
- btree_node_write_if_need(c, b, SIX_LOCK_read);
+ btree_node_write_if_need(trans, b, SIX_LOCK_read);
six_unlock_read(&b->c.lock);
bch2_trans_put(trans);
@@ -384,7 +384,7 @@ btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags,
struct bkey_i *new_k;
int ret;
- bch2_trans_unlock_write(trans);
+ bch2_trans_unlock_updates_write(trans);
bch2_trans_unlock(trans);
new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
@@ -479,8 +479,7 @@ static int run_one_mem_trigger(struct btree_trans *trans,
old, flags);
}
-static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i,
- bool overwrite)
+static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i)
{
verify_update_old_key(trans, i);
@@ -507,10 +506,10 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_
return bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(i->k),
BTREE_TRIGGER_insert|
BTREE_TRIGGER_overwrite|flags) ?: 1;
- } else if (overwrite && !i->overwrite_trigger_run) {
+ } else if (!i->overwrite_trigger_run) {
i->overwrite_trigger_run = true;
return bch2_key_trigger_old(trans, i->btree_id, i->level, old, flags) ?: 1;
- } else if (!overwrite && !i->insert_trigger_run) {
+ } else if (!i->insert_trigger_run) {
i->insert_trigger_run = true;
return bch2_key_trigger_new(trans, i->btree_id, i->level, bkey_i_to_s(i->k), flags) ?: 1;
} else {
@@ -519,39 +518,45 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_
}
static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
- unsigned btree_id_start)
+ unsigned *btree_id_updates_start)
{
- for (int overwrite = 1; overwrite >= 0; --overwrite) {
- bool trans_trigger_run;
+ bool trans_trigger_run;
- /*
- * Running triggers will append more updates to the list of updates as
- * we're walking it:
- */
- do {
- trans_trigger_run = false;
-
- for (unsigned i = btree_id_start;
- i < trans->nr_updates && trans->updates[i].btree_id <= btree_id;
- i++) {
- if (trans->updates[i].btree_id != btree_id)
- continue;
+ /*
+ * Running triggers will append more updates to the list of updates as
+ * we're walking it:
+ */
+ do {
+ trans_trigger_run = false;
- int ret = run_one_trans_trigger(trans, trans->updates + i, overwrite);
- if (ret < 0)
- return ret;
- if (ret)
- trans_trigger_run = true;
+ for (unsigned i = *btree_id_updates_start;
+ i < trans->nr_updates && trans->updates[i].btree_id <= btree_id;
+ i++) {
+ if (trans->updates[i].btree_id < btree_id) {
+ *btree_id_updates_start = i;
+ continue;
}
- } while (trans_trigger_run);
- }
+
+ int ret = run_one_trans_trigger(trans, trans->updates + i);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ trans_trigger_run = true;
+ }
+ } while (trans_trigger_run);
+
+ trans_for_each_update(trans, i)
+ BUG_ON(!(i->flags & BTREE_TRIGGER_norun) &&
+ i->btree_id == btree_id &&
+ btree_node_type_has_trans_triggers(i->bkey_type) &&
+ (!i->insert_trigger_run || !i->overwrite_trigger_run));
return 0;
}
static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
{
- unsigned btree_id = 0, btree_id_start = 0;
+ unsigned btree_id = 0, btree_id_updates_start = 0;
int ret = 0;
/*
@@ -565,27 +570,15 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
if (btree_id == BTREE_ID_alloc)
continue;
- while (btree_id_start < trans->nr_updates &&
- trans->updates[btree_id_start].btree_id < btree_id)
- btree_id_start++;
-
- ret = run_btree_triggers(trans, btree_id, btree_id_start);
+ ret = run_btree_triggers(trans, btree_id, &btree_id_updates_start);
if (ret)
return ret;
}
- for (unsigned idx = 0; idx < trans->nr_updates; idx++) {
- struct btree_insert_entry *i = trans->updates + idx;
-
- if (i->btree_id > BTREE_ID_alloc)
- break;
- if (i->btree_id == BTREE_ID_alloc) {
- ret = run_btree_triggers(trans, BTREE_ID_alloc, idx);
- if (ret)
- return ret;
- break;
- }
- }
+ btree_id_updates_start = 0;
+ ret = run_btree_triggers(trans, BTREE_ID_alloc, &btree_id_updates_start);
+ if (ret)
+ return ret;
#ifdef CONFIG_BCACHEFS_DEBUG
trans_for_each_update(trans, i)
@@ -875,7 +868,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
if (!ret && unlikely(trans->journal_replay_not_finished))
bch2_drop_overwrites_from_journal(trans);
- bch2_trans_unlock_write(trans);
+ bch2_trans_unlock_updates_write(trans);
if (!ret && trans->journal_pin)
bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index baab5288..a6f251eb 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -790,53 +790,64 @@ static inline bool btree_node_type_has_triggers(enum btree_node_type type)
return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_TRIGGERS;
}
-static inline bool btree_node_type_is_extents(enum btree_node_type type)
+static inline bool btree_id_is_extents(enum btree_id btree)
{
const u64 mask = 0
-#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_EXTENTS)) << (nr + 1))
+#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_extents)) << nr)
BCH_BTREE_IDS()
#undef x
;
- return BIT_ULL(type) & mask;
+ return BIT_ULL(btree) & mask;
}
-static inline bool btree_id_is_extents(enum btree_id btree)
+static inline bool btree_node_type_is_extents(enum btree_node_type type)
+{
+ return type != BKEY_TYPE_btree && btree_id_is_extents(type - 1);
+}
+
+static inline bool btree_type_has_snapshots(enum btree_id btree)
{
- return btree_node_type_is_extents(__btree_node_type(0, btree));
+ const u64 mask = 0
+#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_snapshots)) << nr)
+ BCH_BTREE_IDS()
+#undef x
+ ;
+
+ return BIT_ULL(btree) & mask;
}
-static inline bool btree_type_has_snapshots(enum btree_id id)
+static inline bool btree_type_has_snapshot_field(enum btree_id btree)
{
const u64 mask = 0
-#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_SNAPSHOTS)) << nr)
+#define x(name, nr, flags, ...) |((!!((flags) & (BTREE_IS_snapshot_field|BTREE_IS_snapshots))) << nr)
BCH_BTREE_IDS()
#undef x
;
- return BIT_ULL(id) & mask;
+ return BIT_ULL(btree) & mask;
}
-static inline bool btree_type_has_snapshot_field(enum btree_id id)
+static inline bool btree_type_has_ptrs(enum btree_id btree)
{
const u64 mask = 0
-#define x(name, nr, flags, ...) |((!!((flags) & (BTREE_ID_SNAPSHOT_FIELD|BTREE_ID_SNAPSHOTS))) << nr)
+#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_data)) << nr)
BCH_BTREE_IDS()
#undef x
;
- return BIT_ULL(id) & mask;
+ return BIT_ULL(btree) & mask;
}
-static inline bool btree_type_has_ptrs(enum btree_id id)
+static inline bool btree_type_uses_write_buffer(enum btree_id btree)
{
const u64 mask = 0
-#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_DATA)) << nr)
+#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_write_buffer)) << nr)
BCH_BTREE_IDS()
#undef x
;
- return BIT_ULL(id) & mask;
+ return BIT_ULL(btree) & mask;
}
struct btree_root {
diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c
index b1da6dba..13d794f2 100644
--- a/libbcachefs/btree_update.c
+++ b/libbcachefs/btree_update.c
@@ -823,10 +823,17 @@ int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree,
return bch2_trans_update_buffered(trans, btree, &k);
}
-static int __bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf, unsigned u64s)
+int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf)
{
+ unsigned u64s = DIV_ROUND_UP(buf->pos, sizeof(u64));
+ prt_chars(buf, '\0', u64s * sizeof(u64) - buf->pos);
+
+ int ret = buf->allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0;
+ if (ret)
+ return ret;
+
struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(u64s));
- int ret = PTR_ERR_OR_ZERO(e);
+ ret = PTR_ERR_OR_ZERO(e);
if (ret)
return ret;
@@ -862,7 +869,7 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
c->journal.early_journal_entries.nr += jset_u64s(u64s);
} else {
ret = bch2_trans_commit_do(c, NULL, NULL, commit_flags,
- __bch2_trans_log_msg(trans, &buf, u64s));
+ bch2_trans_log_msg(trans, &buf));
}
err:
printbuf_exit(&buf);
diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h
index 58df2019..8f22ef9a 100644
--- a/libbcachefs/btree_update.h
+++ b/libbcachefs/btree_update.h
@@ -159,6 +159,7 @@ void bch2_trans_commit_hook(struct btree_trans *,
struct btree_trans_commit_hook *);
int __bch2_trans_commit(struct btree_trans *, unsigned);
+int bch2_trans_log_msg(struct btree_trans *, struct printbuf *);
__printf(2, 3) int bch2_fs_log_msg(struct bch_fs *, const char *, ...);
__printf(2, 3) int bch2_journal_log_msg(struct bch_fs *, const char *, ...);
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 7d9dab95..f4aeadbe 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -238,7 +238,6 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans,
struct btree *b)
{
struct bch_fs *c = trans->c;
- unsigned i, level = b->c.level;
bch2_btree_node_lock_write_nofail(trans, path, &b->c);
@@ -249,13 +248,9 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans,
mutex_unlock(&c->btree_cache.lock);
six_unlock_write(&b->c.lock);
- mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED);
+ mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);
- trans_for_each_path(trans, path, i)
- if (path->l[level].b == b) {
- btree_node_unlock(trans, path, level);
- path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
- }
+ bch2_trans_node_drop(trans, b);
}
static void bch2_btree_node_free_never_used(struct btree_update *as,
@@ -264,8 +259,6 @@ static void bch2_btree_node_free_never_used(struct btree_update *as,
{
struct bch_fs *c = as->c;
struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL];
- struct btree_path *path;
- unsigned i, level = b->c.level;
BUG_ON(!list_empty(&b->write_blocked));
BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as));
@@ -287,11 +280,7 @@ static void bch2_btree_node_free_never_used(struct btree_update *as,
six_unlock_intent(&b->c.lock);
- trans_for_each_path(trans, path, i)
- if (path->l[level].b == b) {
- btree_node_unlock(trans, path, level);
- path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
- }
+ bch2_trans_node_drop(trans, b);
}
static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
@@ -803,7 +792,7 @@ err:
mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);
six_unlock_write(&b->c.lock);
- btree_node_write_if_need(c, b, SIX_LOCK_intent);
+ btree_node_write_if_need(trans, b, SIX_LOCK_intent);
btree_node_unlock(trans, path, b->c.level);
bch2_path_put(trans, path_idx, true);
}
@@ -824,7 +813,7 @@ err:
b = as->new_nodes[i];
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
- btree_node_write_if_need(c, b, SIX_LOCK_read);
+ btree_node_write_if_need(trans, b, SIX_LOCK_read);
six_unlock_read(&b->c.lock);
}
@@ -1709,14 +1698,14 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
if (n3) {
bch2_btree_update_get_open_buckets(as, n3);
- bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0);
+ bch2_btree_node_write_trans(trans, n3, SIX_LOCK_intent, 0);
}
if (n2) {
bch2_btree_update_get_open_buckets(as, n2);
- bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0);
+ bch2_btree_node_write_trans(trans, n2, SIX_LOCK_intent, 0);
}
bch2_btree_update_get_open_buckets(as, n1);
- bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0);
+ bch2_btree_node_write_trans(trans, n1, SIX_LOCK_intent, 0);
/*
* The old node must be freed (in memory) _before_ unlocking the new
@@ -1911,7 +1900,7 @@ static void __btree_increase_depth(struct btree_update *as, struct btree_trans *
BUG_ON(ret);
bch2_btree_update_get_open_buckets(as, n);
- bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
+ bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0);
bch2_trans_node_add(trans, path, n);
six_unlock_intent(&n->c.lock);
@@ -2104,7 +2093,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
bch2_trans_verify_paths(trans);
bch2_btree_update_get_open_buckets(as, n);
- bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
+ bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0);
bch2_btree_node_free_inmem(trans, trans->paths + path, b);
bch2_btree_node_free_inmem(trans, trans->paths + sib_path, m);
@@ -2181,7 +2170,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
bch2_btree_interior_update_will_free_node(as, b);
bch2_btree_update_get_open_buckets(as, n);
- bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
+ bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0);
bch2_btree_node_free_inmem(trans, btree_iter_path(trans, iter), b);
@@ -2291,7 +2280,8 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
bool now = false, pending = false;
spin_lock(&c->btree_node_rewrites_lock);
- if (bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) {
+ if (c->curr_recovery_pass > BCH_RECOVERY_PASS_journal_replay &&
+ bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) {
list_add(&a->list, &c->btree_node_rewrites);
now = true;
} else if (!test_bit(BCH_FS_may_go_rw, &c->flags)) {
diff --git a/libbcachefs/btree_write_buffer.c b/libbcachefs/btree_write_buffer.c
index 49ce2d1e..b56c4987 100644
--- a/libbcachefs/btree_write_buffer.c
+++ b/libbcachefs/btree_write_buffer.c
@@ -312,6 +312,8 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
darray_for_each(wb->sorted, i) {
struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx];
+ BUG_ON(!btree_type_uses_write_buffer(k->btree));
+
for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++)
prefetch(&wb->flushing.keys.data[n->idx]);
@@ -632,6 +634,14 @@ int bch2_btree_write_buffer_maybe_flush(struct btree_trans *trans,
bch2_bkey_buf_init(&tmp);
if (!bkey_and_val_eq(referring_k, bkey_i_to_s_c(last_flushed->k))) {
+ if (trace_write_buffer_maybe_flush_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, referring_k);
+ trace_write_buffer_maybe_flush(trans, _RET_IP_, buf.buf);
+ printbuf_exit(&buf);
+ }
+
bch2_bkey_buf_reassemble(&tmp, c, referring_k);
if (bkey_is_btree_ptr(referring_k.k)) {
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index 1e43ece2..345b117a 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -262,8 +262,6 @@ int bch2_check_fix_ptrs(struct btree_trans *trans,
struct printbuf buf = PRINTBUF;
int ret = 0;
- percpu_down_read(&c->mark_lock);
-
bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) {
ret = bch2_check_fix_ptr(trans, k, p, entry_c, &do_update);
if (ret)
@@ -364,7 +362,6 @@ found:
bch_info(c, "new key %s", buf.buf);
}
- percpu_up_read(&c->mark_lock);
struct btree_iter iter;
bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level,
BTREE_ITER_intent|BTREE_ITER_all_snapshots);
@@ -373,8 +370,6 @@ found:
BTREE_UPDATE_internal_snapshot_node|
BTREE_TRIGGER_norun);
bch2_trans_iter_exit(trans, &iter);
- percpu_down_read(&c->mark_lock);
-
if (ret)
goto err;
@@ -382,7 +377,6 @@ found:
bch2_btree_node_update_key_early(trans, btree, level - 1, k, new);
}
err:
- percpu_up_read(&c->mark_lock);
printbuf_exit(&buf);
return ret;
}
@@ -547,7 +541,8 @@ static int __mark_pointer(struct btree_trans *trans, struct bch_dev *ca,
struct bkey_s_c k,
const struct extent_ptr_decoded *p,
s64 sectors, enum bch_data_type ptr_data_type,
- struct bch_alloc_v4 *a)
+ struct bch_alloc_v4 *a,
+ bool insert)
{
u32 *dst_sectors = p->has_ec ? &a->stripe_sectors :
!p->ptr.cached ? &a->dirty_sectors :
@@ -557,8 +552,8 @@ static int __mark_pointer(struct btree_trans *trans, struct bch_dev *ca,
if (ret)
return ret;
-
- alloc_data_type_set(a, ptr_data_type);
+ if (insert)
+ alloc_data_type_set(a, ptr_data_type);
return 0;
}
@@ -591,7 +586,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
if (flags & BTREE_TRIGGER_transactional) {
struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0);
ret = PTR_ERR_OR_ZERO(a) ?:
- __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &a->v);
+ __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &a->v, insert);
if (ret)
goto err;
@@ -603,22 +598,19 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
}
if (flags & BTREE_TRIGGER_gc) {
- percpu_down_read(&c->mark_lock);
struct bucket *g = gc_bucket(ca, bucket.offset);
if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s",
p.ptr.dev,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
ret = -BCH_ERR_trigger_pointer;
- goto err_unlock;
+ goto err;
}
bucket_lock(g);
struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old;
- ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &new);
+ ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &new, insert);
alloc_to_bucket(g, new);
bucket_unlock(g);
-err_unlock:
- percpu_up_read(&c->mark_lock);
if (!ret)
ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags);
@@ -996,11 +988,10 @@ static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev *
struct bch_fs *c = trans->c;
int ret = 0;
- percpu_down_read(&c->mark_lock);
struct bucket *g = gc_bucket(ca, b);
if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u when marking metadata type %s",
ca->dev_idx, bch2_data_type_str(data_type)))
- goto err_unlock;
+ goto err;
bucket_lock(g);
struct bch_alloc_v4 old = bucket_m_to_alloc(*g);
@@ -1010,26 +1001,24 @@ static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev *
"different types of data in same bucket: %s, %s",
bch2_data_type_str(g->data_type),
bch2_data_type_str(data_type)))
- goto err;
+ goto err_unlock;
if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c,
"bucket %u:%llu gen %u data type %s sector count overflow: %u + %u > bucket size",
ca->dev_idx, b, g->gen,
bch2_data_type_str(g->data_type ?: data_type),
g->dirty_sectors, sectors))
- goto err;
+ goto err_unlock;
g->data_type = data_type;
g->dirty_sectors += sectors;
struct bch_alloc_v4 new = bucket_m_to_alloc(*g);
bucket_unlock(g);
- percpu_up_read(&c->mark_lock);
ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags);
return ret;
-err:
- bucket_unlock(g);
err_unlock:
- percpu_up_read(&c->mark_lock);
+ bucket_unlock(g);
+err:
return -BCH_ERR_metadata_bucket_inconsistency;
}
@@ -1269,7 +1258,7 @@ int bch2_buckets_nouse_alloc(struct bch_fs *c)
for_each_member_device(c, ca) {
BUG_ON(ca->buckets_nouse);
- ca->buckets_nouse = kvmalloc(BITS_TO_LONGS(ca->mi.nbuckets) *
+ ca->buckets_nouse = bch2_kvmalloc(BITS_TO_LONGS(ca->mi.nbuckets) *
sizeof(unsigned long),
GFP_KERNEL|__GFP_ZERO);
if (!ca->buckets_nouse) {
@@ -1295,10 +1284,14 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
bool resize = ca->bucket_gens != NULL;
int ret;
- BUG_ON(resize && ca->buckets_nouse);
+ if (resize)
+ lockdep_assert_held(&c->state_lock);
+
+ if (resize && ca->buckets_nouse)
+ return -BCH_ERR_no_resize_with_buckets_nouse;
- bucket_gens = kvmalloc(struct_size(bucket_gens, b, nbuckets),
- GFP_KERNEL|__GFP_ZERO);
+ bucket_gens = bch2_kvmalloc(struct_size(bucket_gens, b, nbuckets),
+ GFP_KERNEL|__GFP_ZERO);
if (!bucket_gens) {
ret = -BCH_ERR_ENOMEM_bucket_gens;
goto err;
@@ -1309,11 +1302,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
bucket_gens->nbuckets_minus_first =
bucket_gens->nbuckets - bucket_gens->first_bucket;
- if (resize) {
- down_write(&ca->bucket_lock);
- percpu_down_write(&c->mark_lock);
- }
-
old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1);
if (resize) {
@@ -1331,11 +1319,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
nbuckets = ca->mi.nbuckets;
- if (resize) {
- percpu_up_write(&c->mark_lock);
- up_write(&ca->bucket_lock);
- }
-
ret = 0;
err:
if (bucket_gens)
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index 3bebc4c3..a9acdd6c 100644
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -82,16 +82,15 @@ static inline void bucket_lock(struct bucket *b)
static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b)
{
- return genradix_ptr(&ca->buckets_gc, b);
+ return bucket_valid(ca, b)
+ ? genradix_ptr(&ca->buckets_gc, b)
+ : NULL;
}
static inline struct bucket_gens *bucket_gens(struct bch_dev *ca)
{
return rcu_dereference_check(ca->bucket_gens,
- !ca->fs ||
- percpu_rwsem_is_held(&ca->fs->mark_lock) ||
- lockdep_is_held(&ca->fs->state_lock) ||
- lockdep_is_held(&ca->bucket_lock));
+ lockdep_is_held(&ca->fs->state_lock));
}
static inline u8 *bucket_gen(struct bch_dev *ca, size_t b)
diff --git a/libbcachefs/clock.c b/libbcachefs/clock.c
index 1d6b691e..1f8e035d 100644
--- a/libbcachefs/clock.c
+++ b/libbcachefs/clock.c
@@ -14,21 +14,13 @@ static inline bool io_timer_cmp(const void *l, const void *r, void __always_unus
return (*_l)->expire < (*_r)->expire;
}
-static inline void io_timer_swp(void *l, void *r, void __always_unused *args)
-{
- struct io_timer **_l = (struct io_timer **)l;
- struct io_timer **_r = (struct io_timer **)r;
-
- swap(*_l, *_r);
-}
+static const struct min_heap_callbacks callbacks = {
+ .less = io_timer_cmp,
+ .swp = NULL,
+};
void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer)
{
- const struct min_heap_callbacks callbacks = {
- .less = io_timer_cmp,
- .swp = io_timer_swp,
- };
-
spin_lock(&clock->timer_lock);
if (time_after_eq64((u64) atomic64_read(&clock->now), timer->expire)) {
@@ -48,11 +40,6 @@ out:
void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer)
{
- const struct min_heap_callbacks callbacks = {
- .less = io_timer_cmp,
- .swp = io_timer_swp,
- };
-
spin_lock(&clock->timer_lock);
for (size_t i = 0; i < clock->timers.nr; i++)
@@ -142,10 +129,6 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock,
static struct io_timer *get_expired_timer(struct io_clock *clock, u64 now)
{
struct io_timer *ret = NULL;
- const struct min_heap_callbacks callbacks = {
- .less = io_timer_cmp,
- .swp = io_timer_swp,
- };
if (clock->timers.nr &&
time_after_eq64(now, clock->timers.data[0]->expire)) {
diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c
index 31b2aeb0..58521493 100644
--- a/libbcachefs/data_update.c
+++ b/libbcachefs/data_update.c
@@ -620,7 +620,7 @@ int bch2_data_update_init(struct btree_trans *trans,
* and we have to check for this because we go rw before repairing the
* snapshots table - just skip it, we can move it later.
*/
- if (unlikely(k.k->p.snapshot && !bch2_snapshot_equiv(c, k.k->p.snapshot)))
+ if (unlikely(k.k->p.snapshot && !bch2_snapshot_exists(c, k.k->p.snapshot)))
return -BCH_ERR_data_update_done;
if (!bkey_get_dev_refs(c, k))
diff --git a/libbcachefs/disk_accounting.c b/libbcachefs/disk_accounting.c
index a0e49cbe..b32e91ba 100644
--- a/libbcachefs/disk_accounting.c
+++ b/libbcachefs/disk_accounting.c
@@ -79,6 +79,8 @@ static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_
memcpy_u64s_small(acc->v.d, d, nr);
}
+static int bch2_accounting_update_sb_one(struct bch_fs *, struct bpos);
+
int bch2_disk_accounting_mod(struct btree_trans *trans,
struct disk_accounting_pos *k,
s64 *d, unsigned nr, bool gc)
@@ -96,9 +98,16 @@ int bch2_disk_accounting_mod(struct btree_trans *trans,
accounting_key_init(&k_i.k, k, d, nr);
- return likely(!gc)
- ? bch2_trans_update_buffered(trans, BTREE_ID_accounting, &k_i.k)
- : bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true);
+ if (unlikely(gc)) {
+ int ret = bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true);
+ if (ret == -BCH_ERR_btree_insert_need_mark_replicas)
+ ret = drop_locks_do(trans,
+ bch2_accounting_update_sb_one(trans->c, disk_accounting_pos_to_bpos(k))) ?:
+ bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true);
+ return ret;
+ } else {
+ return bch2_trans_update_buffered(trans, BTREE_ID_accounting, &k_i.k);
+ }
}
int bch2_mod_dev_cached_sectors(struct btree_trans *trans,
@@ -471,32 +480,6 @@ int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned acc
return ret;
}
-void bch2_fs_accounting_to_text(struct printbuf *out, struct bch_fs *c)
-{
- struct bch_accounting_mem *acc = &c->accounting;
-
- percpu_down_read(&c->mark_lock);
- out->atomic++;
-
- eytzinger0_for_each(i, acc->k.nr) {
- struct disk_accounting_pos acc_k;
- bpos_to_disk_accounting_pos(&acc_k, acc->k.data[i].pos);
-
- bch2_accounting_key_to_text(out, &acc_k);
-
- u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
- bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false);
-
- prt_str(out, ":");
- for (unsigned j = 0; j < acc->k.data[i].nr_counters; j++)
- prt_printf(out, " %llu", v[j]);
- prt_newline(out);
- }
-
- --out->atomic;
- percpu_up_read(&c->mark_lock);
-}
-
static void bch2_accounting_free_counters(struct bch_accounting_mem *acc, bool gc)
{
darray_for_each(acc->k, e) {
@@ -931,10 +914,13 @@ void bch2_verify_accounting_clean(struct bch_fs *c)
bpos_to_disk_accounting_pos(&acc_k, k.k->p);
if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
- continue;
+ break;
- if (acc_k.type == BCH_DISK_ACCOUNTING_inum)
+ if (!bch2_accounting_is_mem(acc_k)) {
+ struct disk_accounting_pos next = { .type = acc_k.type + 1 };
+ bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next));
continue;
+ }
bch2_accounting_mem_read(c, k.k->p, v, nr);
diff --git a/libbcachefs/disk_accounting.h b/libbcachefs/disk_accounting.h
index 4a3d9ff8..5360cbb3 100644
--- a/libbcachefs/disk_accounting.h
+++ b/libbcachefs/disk_accounting.h
@@ -138,7 +138,8 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
bpos_to_disk_accounting_pos(&acc_k, a.k->p);
bool gc = mode == BCH_ACCOUNTING_gc;
- EBUG_ON(gc && !acc->gc_running);
+ if (gc && !acc->gc_running)
+ return 0;
if (!bch2_accounting_is_mem(acc_k))
return 0;
@@ -255,7 +256,6 @@ static inline void bch2_accounting_trans_commit_revert(struct btree_trans *trans
int bch2_fs_replicas_usage_read(struct bch_fs *, darray_char *);
int bch2_fs_accounting_read(struct bch_fs *, darray_char *, unsigned);
-void bch2_fs_accounting_to_text(struct printbuf *, struct bch_fs *);
int bch2_gc_accounting_start(struct bch_fs *);
int bch2_gc_accounting_done(struct bch_fs *);
diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c
index 6b297f90..d2a5e76e 100644
--- a/libbcachefs/ec.c
+++ b/libbcachefs/ec.c
@@ -305,13 +305,12 @@ static int mark_stripe_bucket(struct btree_trans *trans,
}
if (flags & BTREE_TRIGGER_gc) {
- percpu_down_read(&c->mark_lock);
struct bucket *g = gc_bucket(ca, bucket.offset);
if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s",
ptr->dev,
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
ret = -BCH_ERR_mark_stripe;
- goto err_unlock;
+ goto err;
}
bucket_lock(g);
@@ -319,8 +318,7 @@ static int mark_stripe_bucket(struct btree_trans *trans,
ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags);
alloc_to_bucket(g, new);
bucket_unlock(g);
-err_unlock:
- percpu_up_read(&c->mark_lock);
+
if (!ret)
ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags);
}
@@ -1058,6 +1056,11 @@ static inline void ec_stripes_heap_swap(void *l, void *r, void *h)
ec_stripes_heap_set_backpointer(_h, j);
}
+static const struct min_heap_callbacks callbacks = {
+ .less = ec_stripes_heap_cmp,
+ .swp = ec_stripes_heap_swap,
+};
+
static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
{
ec_stripes_heap *h = &c->ec_stripes_heap;
@@ -1070,11 +1073,6 @@ static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
void bch2_stripes_heap_del(struct bch_fs *c,
struct stripe *m, size_t idx)
{
- const struct min_heap_callbacks callbacks = {
- .less = ec_stripes_heap_cmp,
- .swp = ec_stripes_heap_swap,
- };
-
mutex_lock(&c->ec_stripes_heap_lock);
heap_verify_backpointer(c, idx);
@@ -1085,11 +1083,6 @@ void bch2_stripes_heap_del(struct bch_fs *c,
void bch2_stripes_heap_insert(struct bch_fs *c,
struct stripe *m, size_t idx)
{
- const struct min_heap_callbacks callbacks = {
- .less = ec_stripes_heap_cmp,
- .swp = ec_stripes_heap_swap,
- };
-
mutex_lock(&c->ec_stripes_heap_lock);
BUG_ON(min_heap_full(&c->ec_stripes_heap));
@@ -1108,10 +1101,6 @@ void bch2_stripes_heap_insert(struct bch_fs *c,
void bch2_stripes_heap_update(struct bch_fs *c,
struct stripe *m, size_t idx)
{
- const struct min_heap_callbacks callbacks = {
- .less = ec_stripes_heap_cmp,
- .swp = ec_stripes_heap_swap,
- };
ec_stripes_heap *h = &c->ec_stripes_heap;
bool do_deletes;
size_t i;
diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h
index 775425df..4590cd0c 100644
--- a/libbcachefs/errcode.h
+++ b/libbcachefs/errcode.h
@@ -118,6 +118,7 @@
x(ENOENT, ENOENT_dev_not_found) \
x(ENOENT, ENOENT_dev_idx_not_found) \
x(ENOENT, ENOENT_inode_no_backpointer) \
+ x(ENOENT, ENOENT_no_snapshot_tree_subvol) \
x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \
x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \
x(EEXIST, EEXIST_str_hash_set) \
@@ -196,6 +197,9 @@
x(EINVAL, opt_parse_error) \
x(EINVAL, remove_with_metadata_missing_unimplemented)\
x(EINVAL, remove_would_lose_data) \
+ x(EINVAL, no_resize_with_buckets_nouse) \
+ x(EINVAL, inode_unpack_error) \
+ x(EINVAL, varint_decode_error) \
x(EROFS, erofs_trans_commit) \
x(EROFS, erofs_no_writes) \
x(EROFS, erofs_journal_err) \
@@ -313,6 +317,7 @@ static inline long bch2_err_class(long err)
#define BLK_STS_REMOVED ((__force blk_status_t)128)
+#include <linux/blk_types.h>
const char *bch2_blk_status_to_str(blk_status_t);
#endif /* _BCACHFES_ERRCODE_H */
diff --git a/libbcachefs/fs-common.c b/libbcachefs/fs-common.c
index 7d279f21..2c3d46ac 100644
--- a/libbcachefs/fs-common.c
+++ b/libbcachefs/fs-common.c
@@ -574,6 +574,11 @@ static inline void prt_bytes_reversed(struct printbuf *out, const void *b, unsig
printbuf_nul_terminate(out);
}
+static inline void prt_str_reversed(struct printbuf *out, const char *s)
+{
+ prt_bytes_reversed(out, s, strlen(s));
+}
+
static inline void reverse_bytes(void *b, size_t n)
{
char *e = b + n, *s = b;
@@ -596,17 +601,20 @@ int bch2_inum_to_path(struct btree_trans *trans, subvol_inum inum, struct printb
struct bch_inode_unpacked inode;
ret = bch2_inode_find_by_inum_trans(trans, inum, &inode);
if (ret)
- goto err;
+ goto disconnected;
if (!inode.bi_dir && !inode.bi_dir_offset) {
ret = -BCH_ERR_ENOENT_inode_no_backpointer;
- goto err;
+ goto disconnected;
}
+ inum.subvol = inode.bi_parent_subvol ?: inum.subvol;
+ inum.inum = inode.bi_dir;
+
u32 snapshot;
ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
if (ret)
- goto err;
+ goto disconnected;
struct btree_iter d_iter;
struct bkey_s_c_dirent d = bch2_bkey_get_iter_typed(trans, &d_iter,
@@ -614,23 +622,19 @@ int bch2_inum_to_path(struct btree_trans *trans, subvol_inum inum, struct printb
0, dirent);
ret = bkey_err(d.s_c);
if (ret)
- goto err;
+ goto disconnected;
struct qstr dirent_name = bch2_dirent_get_name(d);
prt_bytes_reversed(path, dirent_name.name, dirent_name.len);
prt_char(path, '/');
- if (d.v->d_type == DT_SUBVOL)
- inum.subvol = le32_to_cpu(d.v->d_parent_subvol);
- inum.inum = d.k->p.inode;
-
bch2_trans_iter_exit(trans, &d_iter);
}
if (orig_pos == path->pos)
prt_char(path, '/');
-
+out:
ret = path->allocation_failure ? -ENOMEM : 0;
if (ret)
goto err;
@@ -639,4 +643,10 @@ int bch2_inum_to_path(struct btree_trans *trans, subvol_inum inum, struct printb
return 0;
err:
return ret;
+disconnected:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto err;
+
+ prt_str_reversed(path, "(disconnected)");
+ goto out;
}
diff --git a/libbcachefs/fs-io-buffered.c b/libbcachefs/fs-io-buffered.c
index ff8b8df5..ab1d5db2 100644
--- a/libbcachefs/fs-io-buffered.c
+++ b/libbcachefs/fs-io-buffered.c
@@ -625,15 +625,6 @@ do_io:
BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio,
sectors << 9, offset << 9));
- /* Check for writing past i_size: */
- WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) >
- round_up(i_size, block_bytes(c)) &&
- !test_bit(BCH_FS_emergency_ro, &c->flags),
- "writing past i_size: %llu > %llu (unrounded %llu)\n",
- bio_end_sector(&w->io->op.wbio.bio) << 9,
- round_up(i_size, block_bytes(c)),
- i_size);
-
w->io->op.res.sectors += reserved_sectors;
w->io->op.i_sectors_delta -= dirty_sectors;
w->io->op.new_i_size = i_size;
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index a436c072..206fc046 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -205,6 +205,36 @@ err:
return ret;
}
+/*
+ * Find any subvolume associated with a tree of snapshots
+ * We can't rely on master_subvol - it might have been deleted.
+ */
+static int find_snapshot_tree_subvol(struct btree_trans *trans,
+ u32 tree_id, u32 *subvol)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, ret) {
+ if (k.k->type != KEY_TYPE_snapshot)
+ continue;
+
+ struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
+ if (le32_to_cpu(s.v->tree) != tree_id)
+ continue;
+
+ if (s.v->subvol) {
+ *subvol = le32_to_cpu(s.v->subvol);
+ goto found;
+ }
+ }
+ ret = -BCH_ERR_ENOENT_no_snapshot_tree_subvol;
+found:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
/* Get lost+found, create if it doesn't exist: */
static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
struct bch_inode_unpacked *lostfound,
@@ -223,19 +253,24 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
if (ret)
return ret;
- subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) };
+ u32 subvolid;
+ ret = find_snapshot_tree_subvol(trans,
+ bch2_snapshot_tree(c, snapshot), &subvolid);
+ bch_err_msg(c, ret, "finding subvol associated with snapshot tree %u",
+ bch2_snapshot_tree(c, snapshot));
+ if (ret)
+ return ret;
struct bch_subvolume subvol;
- ret = bch2_subvolume_get(trans, le32_to_cpu(st.master_subvol), false, &subvol);
- bch_err_msg(c, ret, "looking up root subvol %u for snapshot %u",
- le32_to_cpu(st.master_subvol), snapshot);
+ ret = bch2_subvolume_get(trans, subvolid, false, &subvol);
+ bch_err_msg(c, ret, "looking up subvol %u for snapshot %u", subvolid, snapshot);
if (ret)
return ret;
if (!subvol.inode) {
struct btree_iter iter;
struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter,
- BTREE_ID_subvolumes, POS(0, le32_to_cpu(st.master_subvol)),
+ BTREE_ID_subvolumes, POS(0, subvolid),
0, subvolume);
ret = PTR_ERR_OR_ZERO(subvol);
if (ret)
@@ -245,13 +280,16 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
bch2_trans_iter_exit(trans, &iter);
}
- root_inum.inum = le64_to_cpu(subvol.inode);
+ subvol_inum root_inum = {
+ .subvol = subvolid,
+ .inum = le64_to_cpu(subvol.inode)
+ };
struct bch_inode_unpacked root_inode;
struct bch_hash_info root_hash_info;
ret = lookup_inode(trans, root_inum.inum, snapshot, &root_inode);
bch_err_msg(c, ret, "looking up root inode %llu for subvol %u",
- root_inum.inum, le32_to_cpu(st.master_subvol));
+ root_inum.inum, subvolid);
if (ret)
return ret;
@@ -458,7 +496,9 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *
continue;
struct bch_inode_unpacked child_inode;
- bch2_inode_unpack(k, &child_inode);
+ ret = bch2_inode_unpack(k, &child_inode);
+ if (ret)
+ break;
if (!inode_should_reattach(&child_inode)) {
ret = maybe_delete_dirent(trans,
@@ -809,9 +849,8 @@ static int add_inode(struct bch_fs *c, struct inode_walker *w,
{
struct bch_inode_unpacked u;
- BUG_ON(bch2_inode_unpack(inode, &u));
-
- return darray_push(&w->inodes, ((struct inode_walker_entry) {
+ return bch2_inode_unpack(inode, &u) ?:
+ darray_push(&w->inodes, ((struct inode_walker_entry) {
.inode = u,
.snapshot = inode.k->p.snapshot,
}));
@@ -1065,7 +1104,7 @@ static int get_snapshot_root_inode(struct btree_trans *trans,
goto err;
BUG();
found_root:
- BUG_ON(bch2_inode_unpack(k, root));
+ ret = bch2_inode_unpack(k, root);
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -1096,7 +1135,9 @@ static int check_inode(struct btree_trans *trans,
if (!bkey_is_inode(k.k))
return 0;
- BUG_ON(bch2_inode_unpack(k, &u));
+ ret = bch2_inode_unpack(k, &u);
+ if (ret)
+ goto err;
if (snapshot_root->bi_inum != u.bi_inum) {
ret = get_snapshot_root_inode(trans, snapshot_root, u.bi_inum);
@@ -1107,7 +1148,7 @@ static int check_inode(struct btree_trans *trans,
if (fsck_err_on(u.bi_hash_seed != snapshot_root->bi_hash_seed ||
INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root),
trans, inode_snapshot_mismatch,
- "inodes in different snapshots don't match")) {
+ "inode hash info in different snapshots don't match")) {
u.bi_hash_seed = snapshot_root->bi_hash_seed;
SET_INODE_STR_HASH(&u, INODE_STR_HASH(snapshot_root));
do_update = true;
@@ -1318,7 +1359,9 @@ static int find_oldest_inode_needs_reattach(struct btree_trans *trans,
break;
struct bch_inode_unpacked parent_inode;
- bch2_inode_unpack(k, &parent_inode);
+ ret = bch2_inode_unpack(k, &parent_inode);
+ if (ret)
+ break;
if (!inode_should_reattach(&parent_inode))
break;
@@ -1341,7 +1384,9 @@ static int check_unreachable_inode(struct btree_trans *trans,
return 0;
struct bch_inode_unpacked inode;
- BUG_ON(bch2_inode_unpack(k, &inode));
+ ret = bch2_inode_unpack(k, &inode);
+ if (ret)
+ return ret;
if (!inode_should_reattach(&inode))
return 0;
@@ -2296,7 +2341,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
*hash_info = bch2_hash_info_init(c, &i->inode);
dir->first_this_inode = false;
- ret = bch2_str_hash_check_key(trans, s, bch2_dirent_hash_desc, hash_info, iter, k);
+ ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info, iter, k);
if (ret < 0)
goto err;
if (ret) {
@@ -2410,7 +2455,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
*hash_info = bch2_hash_info_init(c, &i->inode);
inode->first_this_inode = false;
- ret = bch2_str_hash_check_key(trans, NULL, bch2_xattr_hash_desc, hash_info, iter, k);
+ ret = bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info, iter, k);
bch_err_fn(c, ret);
return ret;
}
@@ -2653,7 +2698,9 @@ static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k)
int ret = 0;
struct bch_inode_unpacked inode;
- BUG_ON(bch2_inode_unpack(inode_k, &inode));
+ ret = bch2_inode_unpack(inode_k, &inode);
+ if (ret)
+ return ret;
while (!inode.bi_subvol) {
struct btree_iter dirent_iter;
@@ -2864,7 +2911,9 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c,
/* Should never fail, checked by bch2_inode_invalid: */
struct bch_inode_unpacked u;
- BUG_ON(bch2_inode_unpack(k, &u));
+ _ret3 = bch2_inode_unpack(k, &u);
+ if (_ret3)
+ break;
/*
* Backpointer and directory structure checks are sufficient for
@@ -2942,7 +2991,9 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite
if (!bkey_is_inode(k.k))
return 0;
- BUG_ON(bch2_inode_unpack(k, &u));
+ ret = bch2_inode_unpack(k, &u);
+ if (ret)
+ return ret;
if (S_ISDIR(u.bi_mode))
return 0;
diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c
index d1fc44a7..04ec0520 100644
--- a/libbcachefs/inode.c
+++ b/libbcachefs/inode.c
@@ -48,10 +48,10 @@ static int inode_decode_field(const u8 *in, const u8 *end,
u8 *p;
if (in >= end)
- return -1;
+ return -BCH_ERR_inode_unpack_error;
if (!*in)
- return -1;
+ return -BCH_ERR_inode_unpack_error;
/*
* position of highest set bit indicates number of bytes:
@@ -61,7 +61,7 @@ static int inode_decode_field(const u8 *in, const u8 *end,
bytes = byte_table[shift - 1];
if (in + bytes > end)
- return -1;
+ return -BCH_ERR_inode_unpack_error;
p = (u8 *) be + 16 - bytes;
memcpy(p, in, bytes);
@@ -177,7 +177,7 @@ static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode,
return ret; \
\
if (field_bits > sizeof(unpacked->_name) * 8) \
- return -1; \
+ return -BCH_ERR_inode_unpack_error; \
\
unpacked->_name = field[1]; \
in += ret;
@@ -218,7 +218,7 @@ static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked,
\
unpacked->_name = v[0]; \
if (v[1] || v[0] != unpacked->_name) \
- return -1; \
+ return -BCH_ERR_inode_unpack_error; \
fieldnr++;
BCH_INODE_FIELDS_v2()
@@ -269,7 +269,7 @@ static int bch2_inode_unpack_v3(struct bkey_s_c k,
\
unpacked->_name = v[0]; \
if (v[1] || v[0] != unpacked->_name) \
- return -1; \
+ return -BCH_ERR_inode_unpack_error; \
fieldnr++;
BCH_INODE_FIELDS_v3()
@@ -886,7 +886,7 @@ bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *m
{
struct bch_fs *c = trans->c;
- u64 cursor_idx = c->opts.shard_inode_numbers ? cpu : 0;
+ u64 cursor_idx = c->opts.inodes_32bit ? 0 : cpu + 1;
cursor_idx &= ~(~0ULL << c->opts.shard_inode_numbers_bits);
@@ -907,19 +907,16 @@ bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *m
if (ret)
goto err;
- cursor->v.bits = c->opts.shard_inode_numbers_bits;
+ if (c->opts.inodes_32bit) {
+ *min = BLOCKDEV_INODE_MAX;
+ *max = INT_MAX;
+ } else {
+ cursor->v.bits = c->opts.shard_inode_numbers_bits;
- unsigned bits = (c->opts.inodes_32bit ? 31 : 63);
- if (c->opts.shard_inode_numbers) {
- bits -= cursor->v.bits;
+ unsigned bits = 63 - c->opts.shard_inode_numbers_bits;
- *min = (cpu << bits);
+ *min = max(cpu << bits, (u64) INT_MAX + 1);
*max = (cpu << bits) | ~(ULLONG_MAX << bits);
-
- *min = max_t(u64, *min, BLOCKDEV_INODE_MAX);
- } else {
- *min = BLOCKDEV_INODE_MAX;
- *max = ~(ULLONG_MAX << bits);
}
if (le64_to_cpu(cursor->v.idx) < *min)
diff --git a/libbcachefs/inode_format.h b/libbcachefs/inode_format.h
index 1b93e189..b99a5bf1 100644
--- a/libbcachefs/inode_format.h
+++ b/libbcachefs/inode_format.h
@@ -102,7 +102,8 @@ struct bch_inode_generation {
x(bi_subvol, 32) \
x(bi_parent_subvol, 32) \
x(bi_nocow, 8) \
- x(bi_depth, 32)
+ x(bi_depth, 32) \
+ x(bi_inodes_32bit, 8)
/* subset of BCH_INODE_FIELDS */
#define BCH_INODE_OPTS() \
@@ -115,7 +116,8 @@ struct bch_inode_generation {
x(foreground_target, 16) \
x(background_target, 16) \
x(erasure_code, 16) \
- x(nocow, 8)
+ x(nocow, 8) \
+ x(inodes_32bit, 8)
enum inode_opt_id {
#define x(name, ...) \
diff --git a/libbcachefs/io_write.c b/libbcachefs/io_write.c
index 20da357e..3e71860f 100644
--- a/libbcachefs/io_write.c
+++ b/libbcachefs/io_write.c
@@ -1356,6 +1356,9 @@ err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
+ bch2_trans_put(trans);
+ darray_exit(&buckets);
+
if (ret) {
struct printbuf buf = PRINTBUF;
bch2_write_op_error(&buf, op);
@@ -1366,9 +1369,6 @@ err:
op->flags |= BCH_WRITE_SUBMITTED;
}
- bch2_trans_put(trans);
- darray_exit(&buckets);
-
/* fallback to cow write path? */
if (!(op->flags & BCH_WRITE_SUBMITTED)) {
closure_sync(&op->cl);
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index 7f2efe85..e1773ac2 100644
--- a/libbcachefs/journal_io.c
+++ b/libbcachefs/journal_io.c
@@ -1114,8 +1114,10 @@ reread:
(printbuf_reset(&err),
prt_str(&err, "journal "),
bch2_csum_err_msg(&err, csum_type, j->csum, csum),
- err.buf)))
+ err.buf))) {
saw_bad = true;
+ bch2_fatal_error(c);
+ }
ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
j->encrypted_start,
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index 9b066c61..c493ea62 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -414,7 +414,9 @@ static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
continue;
struct bch_inode_unpacked inode;
- BUG_ON(bch2_inode_unpack(k, &inode));
+ _ret3 = bch2_inode_unpack(k, &inode);
+ if (_ret3)
+ break;
struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot };
bch2_inode_opts_get(&e.io_opts, trans->c, &inode);
diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h
index 2df5c8f7..e763d52e 100644
--- a/libbcachefs/opts.h
+++ b/libbcachefs/opts.h
@@ -222,15 +222,10 @@ enum fsck_err_opts {
BCH_SB_ERASURE_CODE, false, \
NULL, "Enable erasure coding (DO NOT USE YET)") \
x(inodes_32bit, u8, \
- OPT_FS|OPT_FORMAT, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
BCH_SB_INODE_32BIT, true, \
NULL, "Constrain inode numbers to 32 bits") \
- x(shard_inode_numbers, u8, \
- OPT_FS|OPT_FORMAT, \
- OPT_BOOL(), \
- BCH_SB_SHARD_INUMS, true, \
- NULL, "Shard new inode numbers by CPU id") \
x(shard_inode_numbers_bits, u8, \
OPT_FS|OPT_FORMAT, \
OPT_UINT(0, 8), \
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index 7849916e..98825437 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -107,6 +107,12 @@ out:
return ret;
}
+static void kill_btree(struct bch_fs *c, enum btree_id btree)
+{
+ bch2_btree_id_root(c, btree)->alive = false;
+ bch2_shoot_down_journal_keys(c, btree, 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
+}
+
/* for -o reconstruct_alloc: */
static void bch2_reconstruct_alloc(struct bch_fs *c)
{
@@ -157,16 +163,9 @@ static void bch2_reconstruct_alloc(struct bch_fs *c)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- bch2_shoot_down_journal_keys(c, BTREE_ID_alloc,
- 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
- bch2_shoot_down_journal_keys(c, BTREE_ID_backpointers,
- 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
- bch2_shoot_down_journal_keys(c, BTREE_ID_need_discard,
- 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
- bch2_shoot_down_journal_keys(c, BTREE_ID_freespace,
- 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
- bch2_shoot_down_journal_keys(c, BTREE_ID_bucket_gens,
- 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
+ for (unsigned i = 0; i < btree_id_nr_alive(c); i++)
+ if (btree_id_is_alloc(i))
+ kill_btree(c, i);
}
/*
@@ -573,9 +572,6 @@ static int read_btree_roots(struct bch_fs *c)
if (!r->alive)
continue;
- if (btree_id_is_alloc(i) && c->opts.reconstruct_alloc)
- continue;
-
printbuf_reset(&buf);
bch2_btree_id_level_to_text(&buf, i, r->level);
@@ -785,6 +781,11 @@ int bch2_fs_recovery(struct bch_fs *c)
c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
+ if (c->sb.version_upgrade_complete < bcachefs_metadata_version_autofix_errors) {
+ SET_BCH_SB_ERROR_ACTION(c->disk_sb.sb, BCH_ON_ERROR_fix_safe);
+ write_sb = true;
+ }
+
if (write_sb)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
@@ -882,15 +883,15 @@ use_clean:
c->journal_replay_seq_start = last_seq;
c->journal_replay_seq_end = blacklist_seq - 1;
- if (c->opts.reconstruct_alloc)
- bch2_reconstruct_alloc(c);
-
zero_out_btree_mem_ptr(&c->journal_keys);
ret = journal_replay_early(c, clean);
if (ret)
goto err;
+ if (c->opts.reconstruct_alloc)
+ bch2_reconstruct_alloc(c);
+
/*
* After an unclean shutdown, skip then next few journal sequence
* numbers as they may have been referenced by btree writes that
diff --git a/libbcachefs/recovery_passes_types.h b/libbcachefs/recovery_passes_types.h
index f967b23d..71baad41 100644
--- a/libbcachefs/recovery_passes_types.h
+++ b/libbcachefs/recovery_passes_types.h
@@ -49,7 +49,7 @@
x(fs_upgrade_for_subvolumes, 22, 0) \
x(check_inodes, 24, PASS_FSCK) \
x(check_extents, 25, PASS_FSCK) \
- x(check_indirect_extents, 26, PASS_FSCK) \
+ x(check_indirect_extents, 26, PASS_ONLINE|PASS_FSCK) \
x(check_dirents, 27, PASS_FSCK) \
x(check_xattrs, 28, PASS_FSCK) \
x(check_root, 29, PASS_ONLINE|PASS_FSCK) \
diff --git a/libbcachefs/sb-counters_format.h b/libbcachefs/sb-counters_format.h
index 62ea4782..fdcf598f 100644
--- a/libbcachefs/sb-counters_format.h
+++ b/libbcachefs/sb-counters_format.h
@@ -2,86 +2,91 @@
#ifndef _BCACHEFS_SB_COUNTERS_FORMAT_H
#define _BCACHEFS_SB_COUNTERS_FORMAT_H
-#define BCH_PERSISTENT_COUNTERS() \
- x(io_read, 0) \
- x(io_write, 1) \
- x(io_move, 2) \
- x(bucket_invalidate, 3) \
- x(bucket_discard, 4) \
- x(bucket_alloc, 5) \
- x(bucket_alloc_fail, 6) \
- x(btree_cache_scan, 7) \
- x(btree_cache_reap, 8) \
- x(btree_cache_cannibalize, 9) \
- x(btree_cache_cannibalize_lock, 10) \
- x(btree_cache_cannibalize_lock_fail, 11) \
- x(btree_cache_cannibalize_unlock, 12) \
- x(btree_node_write, 13) \
- x(btree_node_read, 14) \
- x(btree_node_compact, 15) \
- x(btree_node_merge, 16) \
- x(btree_node_split, 17) \
- x(btree_node_rewrite, 18) \
- x(btree_node_alloc, 19) \
- x(btree_node_free, 20) \
- x(btree_node_set_root, 21) \
- x(btree_path_relock_fail, 22) \
- x(btree_path_upgrade_fail, 23) \
- x(btree_reserve_get_fail, 24) \
- x(journal_entry_full, 25) \
- x(journal_full, 26) \
- x(journal_reclaim_finish, 27) \
- x(journal_reclaim_start, 28) \
- x(journal_write, 29) \
- x(read_promote, 30) \
- x(read_bounce, 31) \
- x(read_split, 33) \
- x(read_retry, 32) \
- x(read_reuse_race, 34) \
- x(move_extent_read, 35) \
- x(move_extent_write, 36) \
- x(move_extent_finish, 37) \
- x(move_extent_fail, 38) \
- x(move_extent_start_fail, 39) \
- x(copygc, 40) \
- x(copygc_wait, 41) \
- x(gc_gens_end, 42) \
- x(gc_gens_start, 43) \
- x(trans_blocked_journal_reclaim, 44) \
- x(trans_restart_btree_node_reused, 45) \
- x(trans_restart_btree_node_split, 46) \
- x(trans_restart_fault_inject, 47) \
- x(trans_restart_iter_upgrade, 48) \
- x(trans_restart_journal_preres_get, 49) \
- x(trans_restart_journal_reclaim, 50) \
- x(trans_restart_journal_res_get, 51) \
- x(trans_restart_key_cache_key_realloced, 52) \
- x(trans_restart_key_cache_raced, 53) \
- x(trans_restart_mark_replicas, 54) \
- x(trans_restart_mem_realloced, 55) \
- x(trans_restart_memory_allocation_failure, 56) \
- x(trans_restart_relock, 57) \
- x(trans_restart_relock_after_fill, 58) \
- x(trans_restart_relock_key_cache_fill, 59) \
- x(trans_restart_relock_next_node, 60) \
- x(trans_restart_relock_parent_for_fill, 61) \
- x(trans_restart_relock_path, 62) \
- x(trans_restart_relock_path_intent, 63) \
- x(trans_restart_too_many_iters, 64) \
- x(trans_restart_traverse, 65) \
- x(trans_restart_upgrade, 66) \
- x(trans_restart_would_deadlock, 67) \
- x(trans_restart_would_deadlock_write, 68) \
- x(trans_restart_injected, 69) \
- x(trans_restart_key_cache_upgrade, 70) \
- x(trans_traverse_all, 71) \
- x(transaction_commit, 72) \
- x(write_super, 73) \
- x(trans_restart_would_deadlock_recursion_limit, 74) \
- x(trans_restart_write_buffer_flush, 75) \
- x(trans_restart_split_race, 76) \
- x(write_buffer_flush_slowpath, 77) \
- x(write_buffer_flush_sync, 78)
+enum counters_flags {
+ TYPE_COUNTER = BIT(0), /* event counters */
+ TYPE_SECTORS = BIT(1), /* amount counters, the unit is sectors */
+};
+
+#define BCH_PERSISTENT_COUNTERS() \
+ x(io_read, 0, TYPE_SECTORS) \
+ x(io_write, 1, TYPE_SECTORS) \
+ x(io_move, 2, TYPE_SECTORS) \
+ x(bucket_invalidate, 3, TYPE_COUNTER) \
+ x(bucket_discard, 4, TYPE_COUNTER) \
+ x(bucket_alloc, 5, TYPE_COUNTER) \
+ x(bucket_alloc_fail, 6, TYPE_COUNTER) \
+ x(btree_cache_scan, 7, TYPE_COUNTER) \
+ x(btree_cache_reap, 8, TYPE_COUNTER) \
+ x(btree_cache_cannibalize, 9, TYPE_COUNTER) \
+ x(btree_cache_cannibalize_lock, 10, TYPE_COUNTER) \
+ x(btree_cache_cannibalize_lock_fail, 11, TYPE_COUNTER) \
+ x(btree_cache_cannibalize_unlock, 12, TYPE_COUNTER) \
+ x(btree_node_write, 13, TYPE_COUNTER) \
+ x(btree_node_read, 14, TYPE_COUNTER) \
+ x(btree_node_compact, 15, TYPE_COUNTER) \
+ x(btree_node_merge, 16, TYPE_COUNTER) \
+ x(btree_node_split, 17, TYPE_COUNTER) \
+ x(btree_node_rewrite, 18, TYPE_COUNTER) \
+ x(btree_node_alloc, 19, TYPE_COUNTER) \
+ x(btree_node_free, 20, TYPE_COUNTER) \
+ x(btree_node_set_root, 21, TYPE_COUNTER) \
+ x(btree_path_relock_fail, 22, TYPE_COUNTER) \
+ x(btree_path_upgrade_fail, 23, TYPE_COUNTER) \
+ x(btree_reserve_get_fail, 24, TYPE_COUNTER) \
+ x(journal_entry_full, 25, TYPE_COUNTER) \
+ x(journal_full, 26, TYPE_COUNTER) \
+ x(journal_reclaim_finish, 27, TYPE_COUNTER) \
+ x(journal_reclaim_start, 28, TYPE_COUNTER) \
+ x(journal_write, 29, TYPE_COUNTER) \
+ x(read_promote, 30, TYPE_COUNTER) \
+ x(read_bounce, 31, TYPE_COUNTER) \
+ x(read_split, 33, TYPE_COUNTER) \
+ x(read_retry, 32, TYPE_COUNTER) \
+ x(read_reuse_race, 34, TYPE_COUNTER) \
+ x(move_extent_read, 35, TYPE_SECTORS) \
+ x(move_extent_write, 36, TYPE_SECTORS) \
+ x(move_extent_finish, 37, TYPE_SECTORS) \
+ x(move_extent_fail, 38, TYPE_COUNTER) \
+ x(move_extent_start_fail, 39, TYPE_COUNTER) \
+ x(copygc, 40, TYPE_COUNTER) \
+ x(copygc_wait, 41, TYPE_COUNTER) \
+ x(gc_gens_end, 42, TYPE_COUNTER) \
+ x(gc_gens_start, 43, TYPE_COUNTER) \
+ x(trans_blocked_journal_reclaim, 44, TYPE_COUNTER) \
+ x(trans_restart_btree_node_reused, 45, TYPE_COUNTER) \
+ x(trans_restart_btree_node_split, 46, TYPE_COUNTER) \
+ x(trans_restart_fault_inject, 47, TYPE_COUNTER) \
+ x(trans_restart_iter_upgrade, 48, TYPE_COUNTER) \
+ x(trans_restart_journal_preres_get, 49, TYPE_COUNTER) \
+ x(trans_restart_journal_reclaim, 50, TYPE_COUNTER) \
+ x(trans_restart_journal_res_get, 51, TYPE_COUNTER) \
+ x(trans_restart_key_cache_key_realloced, 52, TYPE_COUNTER) \
+ x(trans_restart_key_cache_raced, 53, TYPE_COUNTER) \
+ x(trans_restart_mark_replicas, 54, TYPE_COUNTER) \
+ x(trans_restart_mem_realloced, 55, TYPE_COUNTER) \
+ x(trans_restart_memory_allocation_failure, 56, TYPE_COUNTER) \
+ x(trans_restart_relock, 57, TYPE_COUNTER) \
+ x(trans_restart_relock_after_fill, 58, TYPE_COUNTER) \
+ x(trans_restart_relock_key_cache_fill, 59, TYPE_COUNTER) \
+ x(trans_restart_relock_next_node, 60, TYPE_COUNTER) \
+ x(trans_restart_relock_parent_for_fill, 61, TYPE_COUNTER) \
+ x(trans_restart_relock_path, 62, TYPE_COUNTER) \
+ x(trans_restart_relock_path_intent, 63, TYPE_COUNTER) \
+ x(trans_restart_too_many_iters, 64, TYPE_COUNTER) \
+ x(trans_restart_traverse, 65, TYPE_COUNTER) \
+ x(trans_restart_upgrade, 66, TYPE_COUNTER) \
+ x(trans_restart_would_deadlock, 67, TYPE_COUNTER) \
+ x(trans_restart_would_deadlock_write, 68, TYPE_COUNTER) \
+ x(trans_restart_injected, 69, TYPE_COUNTER) \
+ x(trans_restart_key_cache_upgrade, 70, TYPE_COUNTER) \
+ x(trans_traverse_all, 71, TYPE_COUNTER) \
+ x(transaction_commit, 72, TYPE_COUNTER) \
+ x(write_super, 73, TYPE_COUNTER) \
+ x(trans_restart_would_deadlock_recursion_limit, 74, TYPE_COUNTER) \
+ x(trans_restart_write_buffer_flush, 75, TYPE_COUNTER) \
+ x(trans_restart_split_race, 76, TYPE_COUNTER) \
+ x(write_buffer_flush_slowpath, 77, TYPE_COUNTER) \
+ x(write_buffer_flush_sync, 78, TYPE_COUNTER)
enum bch_persistent_counters {
#define x(t, n, ...) BCH_COUNTER_##t,
diff --git a/libbcachefs/sb-downgrade.c b/libbcachefs/sb-downgrade.c
index fe453e17..051214fd 100644
--- a/libbcachefs/sb-downgrade.c
+++ b/libbcachefs/sb-downgrade.c
@@ -83,7 +83,6 @@
BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \
BCH_FSCK_ERR_inode_has_child_snapshots_wrong) \
x(backpointer_bucket_gen, \
- BIT_ULL(BCH_RECOVERY_PASS_check_backpointers_to_extents)|\
BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\
BCH_FSCK_ERR_backpointer_to_missing_ptr, \
BCH_FSCK_ERR_ptr_to_missing_backpointer) \
diff --git a/libbcachefs/six.c b/libbcachefs/six.c
index 617d07e5..537bf049 100644
--- a/libbcachefs/six.c
+++ b/libbcachefs/six.c
@@ -616,8 +616,6 @@ void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long
if (type != SIX_LOCK_write)
six_release(&lock->dep_map, ip);
- else
- lock->seq++;
if (type == SIX_LOCK_intent &&
lock->intent_lock_recurse) {
@@ -625,6 +623,15 @@ void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long
return;
}
+ if (type == SIX_LOCK_write &&
+ lock->write_lock_recurse) {
+ --lock->write_lock_recurse;
+ return;
+ }
+
+ if (type == SIX_LOCK_write)
+ lock->seq++;
+
do_six_unlock_type(lock, type);
}
EXPORT_SYMBOL_GPL(six_unlock_ip);
@@ -735,13 +742,13 @@ void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
atomic_add(l[type].lock_val, &lock->state);
}
break;
+ case SIX_LOCK_write:
+ lock->write_lock_recurse++;
+ fallthrough;
case SIX_LOCK_intent:
EBUG_ON(!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent));
lock->intent_lock_recurse++;
break;
- case SIX_LOCK_write:
- BUG();
- break;
}
}
EXPORT_SYMBOL_GPL(six_lock_increment);
diff --git a/libbcachefs/six.h b/libbcachefs/six.h
index 68d46fd7..c142e06b 100644
--- a/libbcachefs/six.h
+++ b/libbcachefs/six.h
@@ -137,6 +137,7 @@ struct six_lock {
atomic_t state;
u32 seq;
unsigned intent_lock_recurse;
+ unsigned write_lock_recurse;
struct task_struct *owner;
unsigned __percpu *readers;
raw_spinlock_t wait_lock;
diff --git a/libbcachefs/snapshot.c b/libbcachefs/snapshot.c
index 99f04551..cf6b3256 100644
--- a/libbcachefs/snapshot.c
+++ b/libbcachefs/snapshot.c
@@ -2,6 +2,7 @@
#include "bcachefs.h"
#include "bkey_buf.h"
+#include "btree_cache.h"
#include "btree_key_cache.h"
#include "btree_update.h"
#include "buckets.h"
@@ -279,23 +280,6 @@ fsck_err:
return ret;
}
-static void __set_is_ancestor_bitmap(struct bch_fs *c, u32 id)
-{
- struct snapshot_t *t = snapshot_t_mut(c, id);
- u32 parent = id;
-
- while ((parent = bch2_snapshot_parent_early(c, parent)) &&
- parent - id - 1 < IS_ANCESTOR_BITMAP)
- __set_bit(parent - id - 1, t->is_ancestor);
-}
-
-static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id)
-{
- mutex_lock(&c->snapshot_table_lock);
- __set_is_ancestor_bitmap(c, id);
- mutex_unlock(&c->snapshot_table_lock);
-}
-
static int __bch2_mark_snapshot(struct btree_trans *trans,
enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_s_c new,
@@ -317,6 +301,7 @@ static int __bch2_mark_snapshot(struct btree_trans *trans,
if (new.k->type == KEY_TYPE_snapshot) {
struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);
+ t->live = true;
t->parent = le32_to_cpu(s.v->parent);
t->children[0] = le32_to_cpu(s.v->children[0]);
t->children[1] = le32_to_cpu(s.v->children[1]);
@@ -335,7 +320,11 @@ static int __bch2_mark_snapshot(struct btree_trans *trans,
t->skip[2] = 0;
}
- __set_is_ancestor_bitmap(c, id);
+ u32 parent = id;
+
+ while ((parent = bch2_snapshot_parent_early(c, parent)) &&
+ parent - id - 1 < IS_ANCESTOR_BITMAP)
+ __set_bit(parent - id - 1, t->is_ancestor);
if (BCH_SNAPSHOT_DELETED(s.v)) {
set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
@@ -365,70 +354,6 @@ int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
BTREE_ITER_with_updates, snapshot, s);
}
-static int bch2_snapshot_live(struct btree_trans *trans, u32 id)
-{
- struct bch_snapshot v;
- int ret;
-
- if (!id)
- return 0;
-
- ret = bch2_snapshot_lookup(trans, id, &v);
- if (bch2_err_matches(ret, ENOENT))
- bch_err(trans->c, "snapshot node %u not found", id);
- if (ret)
- return ret;
-
- return !BCH_SNAPSHOT_DELETED(&v);
-}
-
-/*
- * If @k is a snapshot with just one live child, it's part of a linear chain,
- * which we consider to be an equivalence class: and then after snapshot
- * deletion cleanup, there should only be a single key at a given position in
- * this equivalence class.
- *
- * This sets the equivalence class of @k to be the child's equivalence class, if
- * it's part of such a linear chain: this correctly sets equivalence classes on
- * startup if we run leaf to root (i.e. in natural key order).
- */
-static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- unsigned i, nr_live = 0, live_idx = 0;
- struct bkey_s_c_snapshot snap;
- u32 id = k.k->p.offset, child[2];
-
- if (k.k->type != KEY_TYPE_snapshot)
- return 0;
-
- snap = bkey_s_c_to_snapshot(k);
-
- child[0] = le32_to_cpu(snap.v->children[0]);
- child[1] = le32_to_cpu(snap.v->children[1]);
-
- for (i = 0; i < 2; i++) {
- int ret = bch2_snapshot_live(trans, child[i]);
-
- if (ret < 0)
- return ret;
-
- if (ret)
- live_idx = i;
- nr_live += ret;
- }
-
- mutex_lock(&c->snapshot_table_lock);
-
- snapshot_t_mut(c, id)->equiv = nr_live == 1
- ? snapshot_t_mut(c, child[live_idx])->equiv
- : id;
-
- mutex_unlock(&c->snapshot_table_lock);
-
- return 0;
-}
-
/* fsck: */
static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child)
@@ -570,6 +495,9 @@ static int check_snapshot_tree(struct btree_trans *trans,
goto err;
}
+ if (!st.v->master_subvol)
+ goto out;
+
ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), false, &subvol);
if (ret && !bch2_err_matches(ret, ENOENT))
goto err;
@@ -613,6 +541,7 @@ static int check_snapshot_tree(struct btree_trans *trans,
u->v.master_subvol = cpu_to_le32(subvol_id);
st = snapshot_tree_i_to_s_c(u);
}
+out:
err:
fsck_err:
bch2_trans_iter_exit(trans, &snapshot_iter);
@@ -913,7 +842,7 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id)
{
struct bch_fs *c = trans->c;
- if (bch2_snapshot_equiv(c, id))
+ if (bch2_snapshot_exists(c, id))
return 0;
/* Do we need to reconstruct the snapshot_tree entry as well? */
@@ -962,8 +891,7 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id)
return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?:
bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
- bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0) ?:
- bch2_snapshot_set_equiv(trans, bkey_i_to_s_c(&snapshot->k_i));
+ bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0);
}
/* Figure out which snapshot nodes belong in the same tree: */
@@ -1061,7 +989,7 @@ int bch2_reconstruct_snapshots(struct bch_fs *c)
snapshot_id_list_to_text(&buf, t);
darray_for_each(*t, id) {
- if (fsck_err_on(!bch2_snapshot_equiv(c, *id),
+ if (fsck_err_on(!bch2_snapshot_exists(c, *id),
trans, snapshot_node_missing,
"snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) {
if (t->nr > 1) {
@@ -1094,10 +1022,12 @@ int bch2_check_key_has_snapshot(struct btree_trans *trans,
struct printbuf buf = PRINTBUF;
int ret = 0;
- if (fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot),
+ if (fsck_err_on(!bch2_snapshot_exists(c, k.k->p.snapshot),
trans, bkey_in_missing_snapshot,
"key in missing snapshot %s, delete?",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ (bch2_btree_id_to_text(&buf, iter->btree_id),
+ prt_char(&buf, ' '),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
ret = bch2_btree_delete_at(trans, iter,
BTREE_UPDATE_internal_snapshot_node) ?: 1;
fsck_err:
@@ -1111,13 +1041,11 @@ fsck_err:
int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
{
struct btree_iter iter;
- struct bkey_i_snapshot *s;
- int ret = 0;
-
- s = bch2_bkey_get_mut_typed(trans, &iter,
+ struct bkey_i_snapshot *s =
+ bch2_bkey_get_mut_typed(trans, &iter,
BTREE_ID_snapshots, POS(0, id),
0, snapshot);
- ret = PTR_ERR_OR_ZERO(s);
+ int ret = PTR_ERR_OR_ZERO(s);
if (unlikely(ret)) {
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
trans->c, "missing snapshot %u", id);
@@ -1305,10 +1233,6 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
goto err;
new_snapids[i] = iter.pos.offset;
-
- mutex_lock(&c->snapshot_table_lock);
- snapshot_t_mut(c, new_snapids[i])->equiv = new_snapids[i];
- mutex_unlock(&c->snapshot_table_lock);
}
err:
bch2_trans_iter_exit(trans, &iter);
@@ -1414,129 +1338,153 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
* that key to snapshot leaf nodes, where we can mutate it
*/
-static int delete_dead_snapshots_process_key(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k,
- snapshot_id_list *deleted,
- snapshot_id_list *equiv_seen,
- struct bpos *last_pos)
+/*
+ * An interior snapshot node scheduled for deletion, paired with the snapshot
+ * ID (@live_child) that keys found in node @id get moved to.
+ */
+struct snapshot_interior_delete {
+	u32	id;
+	u32	live_child;
+};
+typedef DARRAY(struct snapshot_interior_delete) interior_delete_list;
+
+/*
+ * Look up @id in @l: returns the live_child recorded for that entry, or 0 if
+ * @id is not in the list.
+ */
+static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id)
{
-	int ret = bch2_check_key_has_snapshot(trans, iter, k);
-	if (ret)
-		return ret < 0 ? ret : 0;
+	u32 found = 0;
+
+	darray_for_each(*l, d)
+		if (d->id == id) {
+			found = d->live_child;
+			break;
+		}
+
+	return found;
+}
- struct bch_fs *c = trans->c;
- u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
- if (!equiv) /* key for invalid snapshot node, but we chose not to delete */
+/*
+ * Find a descendant of @id that is in neither @delete_leaves nor
+ * @delete_interior. Direct children are checked first; failing that, each
+ * child's subtree is searched recursively.
+ *
+ * Returns the snapshot ID found, or 0 if there is none (or if @id has no
+ * entry in @t).
+ */
+static unsigned __live_child(struct snapshot_table *t, u32 id,
+			     snapshot_id_list *delete_leaves,
+			     interior_delete_list *delete_interior)
+{
+	struct snapshot_t *node = __snapshot_t(t, id);
+	if (!node)
		return 0;
-	if (!bkey_eq(k.k->p, *last_pos))
-		equiv_seen->nr = 0;
+
+	/* First pass: a direct child that isn't being deleted wins */
+	for (unsigned i = 0; i < ARRAY_SIZE(node->children); i++) {
+		u32 child = node->children[i];
+
+		if (!child ||
+		    snapshot_list_has_id(delete_leaves, child) ||
+		    interior_delete_has_id(delete_interior, child))
+			continue;
+
+		return child;
+	}
-	if (snapshot_list_has_id(deleted, k.k->p.snapshot))
-		return bch2_btree_delete_at(trans, iter,
-					    BTREE_UPDATE_internal_snapshot_node);
+
+	/* Second pass: every direct child is being deleted, look further down */
+	for (unsigned i = 0; i < ARRAY_SIZE(node->children); i++) {
+		u32 child = node->children[i];
+		if (!child)
+			continue;
+
+		u32 found = __live_child(t, child, delete_leaves, delete_interior);
+		if (found)
+			return found;
+	}
-	if (!bpos_eq(*last_pos, k.k->p) &&
-	    snapshot_list_has_id(equiv_seen, equiv))
-		return bch2_btree_delete_at(trans, iter,
-					    BTREE_UPDATE_internal_snapshot_node);
+
+	return 0;
+}
- *last_pos = k.k->p;
+/*
+ * Wrapper around __live_child() that looks up c->snapshots under
+ * rcu_read_lock() for the duration of the search.
+ */
+static unsigned live_child(struct bch_fs *c, u32 id,
+			   snapshot_id_list *delete_leaves,
+			   interior_delete_list *delete_interior)
+{
+	u32 child;
+
+	rcu_read_lock();
+	child = __live_child(rcu_dereference(c->snapshots), id,
+			     delete_leaves, delete_interior);
+	rcu_read_unlock();
+
+	return child;
+}
- ret = snapshot_list_add_nodup(c, equiv_seen, equiv);
- if (ret)
- return ret;
+/*
+ * Handle one key while deleting dead snapshots:
+ *  - a key whose snapshot is in @delete_leaves is deleted
+ *  - a key whose snapshot is in @delete_interior is re-keyed to that node's
+ *    live child and the original is deleted; if the destination slot is
+ *    already occupied the key is not copied, only deleted
+ *  - any other key is left untouched
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int delete_dead_snapshots_process_key(struct btree_trans *trans,
+					      struct btree_iter *iter,
+					      struct bkey_s_c k,
+					      snapshot_id_list *delete_leaves,
+					      interior_delete_list *delete_interior)
+{
+	u32 snapshot = k.k->p.snapshot;
+
+	if (snapshot_list_has_id(delete_leaves, snapshot))
+		return bch2_btree_delete_at(trans, iter,
+					    BTREE_UPDATE_internal_snapshot_node);
-	/*
-	 * When we have a linear chain of snapshot nodes, we consider
-	 * those to form an equivalence class: we're going to collapse
-	 * them all down to a single node, and keep the leaf-most node -
-	 * which has the same id as the equivalence class id.
-	 *
-	 * If there are multiple keys in different snapshots at the same
-	 * position, we're only going to keep the one in the newest
-	 * snapshot (we delete the others above) - the rest have been
-	 * overwritten and are redundant, and for the key we're going to keep we
-	 * need to move it to the equivalance class ID if it's not there
-	 * already.
-	 */
-	if (equiv != k.k->p.snapshot) {
+
+	/* Nonzero iff @snapshot is an interior node being deleted */
+	u32 dst_snapshot = interior_delete_has_id(delete_interior, snapshot);
+	if (dst_snapshot) {
		struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
		int ret = PTR_ERR_OR_ZERO(new);
		if (ret)
			return ret;
-		new->k.p.snapshot = equiv;
-
-		struct btree_iter new_iter;
-		bch2_trans_iter_init(trans, &new_iter, iter->btree_id, new->k.p,
-				     BTREE_ITER_all_snapshots|
-				     BTREE_ITER_cached|
-				     BTREE_ITER_intent);
+		new->k.p.snapshot = dst_snapshot;
-		ret = bch2_btree_iter_traverse(&new_iter) ?:
-			bch2_trans_update(trans, &new_iter, new,
-					BTREE_UPDATE_internal_snapshot_node) ?:
-			bch2_btree_delete_at(trans, iter,
-					BTREE_UPDATE_internal_snapshot_node);
-		bch2_trans_iter_exit(trans, &new_iter);
+		struct btree_iter dst_iter;
+		struct bkey_s_c dst_k =
+			bch2_bkey_get_iter(trans, &dst_iter,
+					   iter->btree_id, new->k.p,
+					   BTREE_ITER_all_snapshots|
+					   BTREE_ITER_intent);
+		ret = bkey_err(dst_k);
		if (ret)
			return ret;
+
+		/* Only write the moved key if nothing lives at the destination */
+		if (bkey_deleted(dst_k.k))
+			ret = bch2_trans_update(trans, &dst_iter, new,
+						BTREE_UPDATE_internal_snapshot_node);
+		if (!ret)
+			ret = bch2_btree_delete_at(trans, iter,
+						   BTREE_UPDATE_internal_snapshot_node);
+		bch2_trans_iter_exit(trans, &dst_iter);
+		return ret;
	}
	return 0;
}
-static int bch2_snapshot_needs_delete(struct btree_trans *trans, struct bkey_s_c k)
+/*
+ * Decide whether the snapshot node in @k should be deleted, and record the
+ * decision in one of the two lists:
+ *
+ * A node that has a subvolume pointing to it is always kept. Otherwise a node
+ * with no live children (children not already in @delete_leaves) is added to
+ * @delete_leaves, and a node with exactly one live child is redundant: it is
+ * added to @delete_interior together with the live child found for it by
+ * live_child().
+ *
+ * Returns 0 on success, -EINVAL if no live child could be found for an
+ * interior node, or the error from appending to a list.
+ */
+static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k,
+					 snapshot_id_list *delete_leaves,
+					 interior_delete_list *delete_interior)
{
-	struct bkey_s_c_snapshot snap;
-	u32 children[2];
-	int ret;
-
	if (k.k->type != KEY_TYPE_snapshot)
		return 0;
-	snap = bkey_s_c_to_snapshot(k);
-	if (BCH_SNAPSHOT_DELETED(snap.v) ||
-	    BCH_SNAPSHOT_SUBVOL(snap.v))
+	struct bch_fs *c = trans->c;
+	struct bkey_s_c_snapshot snap = bkey_s_c_to_snapshot(k);
+	u32 id = snap.k->p.offset;
+	unsigned nr_live = 0;
+
+	if (BCH_SNAPSHOT_SUBVOL(snap.v))
		return 0;
-	children[0] = le32_to_cpu(snap.v->children[0]);
-	children[1] = le32_to_cpu(snap.v->children[1]);
+	for (unsigned i = 0; i < 2; i++) {
+		u32 child = le32_to_cpu(snap.v->children[i]);
-	ret = bch2_snapshot_live(trans, children[0]) ?:
-		bch2_snapshot_live(trans, children[1]);
-	if (ret < 0)
-		return ret;
-	return !ret;
-}
+		if (child && !snapshot_list_has_id(delete_leaves, child))
+			nr_live++;
+	}
-/*
- * For a given snapshot, if it doesn't have a subvolume that points to it, and
- * it doesn't have child snapshot nodes - it's now redundant and we can mark it
- * as deleted.
- */
-static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct bkey_s_c k)
-{
-	int ret = bch2_snapshot_needs_delete(trans, k);
+	switch (nr_live) {
+	case 0:
+		return snapshot_list_add(c, delete_leaves, id);
+	case 1: {
+		struct snapshot_interior_delete d = {
+			.id		= id,
+			.live_child	= live_child(c, id, delete_leaves, delete_interior),
+		};
+
+		if (!d.live_child) {
+			bch_err(c, "error finding live child of snapshot %u", d.id);
+			return -EINVAL;
+		}
-	return ret <= 0
-		? ret
-		: bch2_snapshot_node_set_deleted(trans, k.k->p.offset);
+		return darray_push(delete_interior, d);
+	}
+	default:
+		return 0;
+	}
}
static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
- snapshot_id_list *skip)
+ interior_delete_list *skip)
{
rcu_read_lock();
- while (snapshot_list_has_id(skip, id))
+ while (interior_delete_has_id(skip, id))
id = __bch2_snapshot_parent(c, id);
while (n--) {
do {
id = __bch2_snapshot_parent(c, id);
- } while (snapshot_list_has_id(skip, id));
+ } while (interior_delete_has_id(skip, id));
}
rcu_read_unlock();
@@ -1545,7 +1493,7 @@ static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
struct btree_iter *iter, struct bkey_s_c k,
- snapshot_id_list *deleted)
+ interior_delete_list *deleted)
{
struct bch_fs *c = trans->c;
u32 nr_deleted_ancestors = 0;
@@ -1555,7 +1503,7 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
if (k.k->type != KEY_TYPE_snapshot)
return 0;
- if (snapshot_list_has_id(deleted, k.k->p.offset))
+ if (interior_delete_has_id(deleted, k.k->p.offset))
return 0;
s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot);
@@ -1564,7 +1512,7 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
return ret;
darray_for_each(*deleted, i)
- nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, *i);
+ nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, i->id);
if (!nr_deleted_ancestors)
return 0;
@@ -1582,7 +1530,7 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) {
u32 id = le32_to_cpu(s->v.skip[j]);
- if (snapshot_list_has_id(deleted, id)) {
+ if (interior_delete_has_id(deleted, id)) {
id = bch2_snapshot_nth_parent_skip(c,
parent,
depth > 1
@@ -1601,51 +1549,44 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
int bch2_delete_dead_snapshots(struct bch_fs *c)
{
- struct btree_trans *trans;
- snapshot_id_list deleted = { 0 };
- snapshot_id_list deleted_interior = { 0 };
- int ret = 0;
-
if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags))
return 0;
- trans = bch2_trans_get(c);
+ struct btree_trans *trans = bch2_trans_get(c);
+ snapshot_id_list delete_leaves = {};
+ interior_delete_list delete_interior = {};
+ int ret = 0;
/*
* For every snapshot node: If we have no live children and it's not
* pointed to by a subvolume, delete it:
*/
- ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k,
- NULL, NULL, 0,
- bch2_delete_redundant_snapshot(trans, k));
- bch_err_msg(c, ret, "deleting redundant snapshots");
+ ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k,
+ check_should_delete_snapshot(trans, k, &delete_leaves, &delete_interior));
+ bch_err_msg(c, ret, "walking snapshots");
if (ret)
goto err;
- ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k,
- bch2_snapshot_set_equiv(trans, k));
- bch_err_msg(c, ret, "in bch2_snapshots_set_equiv");
- if (ret)
+ if (!delete_leaves.nr && !delete_interior.nr)
goto err;
- ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k, ({
- if (k.k->type != KEY_TYPE_snapshot)
- continue;
+ {
+ struct printbuf buf = PRINTBUF;
+ prt_printf(&buf, "deleting leaves");
+ darray_for_each(delete_leaves, i)
+ prt_printf(&buf, " %u", *i);
- BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v)
- ? snapshot_list_add(c, &deleted, k.k->p.offset)
- : 0;
- }));
- bch_err_msg(c, ret, "walking snapshots");
- if (ret)
- goto err;
+ prt_printf(&buf, " interior");
+ darray_for_each(delete_interior, i)
+ prt_printf(&buf, " %u->%u", i->id, i->live_child);
+
+ ret = commit_do(trans, NULL, NULL, 0, bch2_trans_log_msg(trans, &buf));
+ printbuf_exit(&buf);
+ if (ret)
+ goto err;
+ }
for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
- struct bpos last_pos = POS_MIN;
- snapshot_id_list equiv_seen = { 0 };
struct disk_reservation res = { 0 };
if (!btree_type_has_snapshots(btree))
@@ -1655,33 +1596,24 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
btree, POS_MIN,
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
&res, NULL, BCH_TRANS_COMMIT_no_enospc,
- delete_dead_snapshots_process_key(trans, &iter, k, &deleted,
- &equiv_seen, &last_pos));
+ delete_dead_snapshots_process_key(trans, &iter, k,
+ &delete_leaves,
+ &delete_interior));
bch2_disk_reservation_put(c, &res);
- darray_exit(&equiv_seen);
bch_err_msg(c, ret, "deleting keys from dying snapshots");
if (ret)
goto err;
}
- bch2_trans_unlock(trans);
- down_write(&c->snapshot_create_lock);
-
- ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k, ({
- u32 snapshot = k.k->p.offset;
- u32 equiv = bch2_snapshot_equiv(c, snapshot);
-
- equiv != snapshot
- ? snapshot_list_add(c, &deleted_interior, snapshot)
- : 0;
- }));
-
- bch_err_msg(c, ret, "walking snapshots");
- if (ret)
- goto err_create_lock;
+ darray_for_each(delete_leaves, i) {
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_snapshot_node_delete(trans, *i));
+ bch_err_msg(c, ret, "deleting snapshot %u", *i);
+ if (ret)
+ goto err;
+ }
/*
* Fixing children of deleted snapshots can't be done completely
@@ -1691,30 +1623,20 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
BTREE_ITER_intent, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior));
+ bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &delete_interior));
if (ret)
- goto err_create_lock;
-
- darray_for_each(deleted, i) {
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_snapshot_node_delete(trans, *i));
- bch_err_msg(c, ret, "deleting snapshot %u", *i);
- if (ret)
- goto err_create_lock;
- }
+ goto err;
- darray_for_each(deleted_interior, i) {
+ darray_for_each(delete_interior, i) {
ret = commit_do(trans, NULL, NULL, 0,
- bch2_snapshot_node_delete(trans, *i));
- bch_err_msg(c, ret, "deleting snapshot %u", *i);
+ bch2_snapshot_node_delete(trans, i->id));
+ bch_err_msg(c, ret, "deleting snapshot %u", i->id);
if (ret)
- goto err_create_lock;
+ goto err;
}
-err_create_lock:
- up_write(&c->snapshot_create_lock);
err:
- darray_exit(&deleted_interior);
- darray_exit(&deleted);
+ darray_exit(&delete_interior);
+ darray_exit(&delete_leaves);
bch2_trans_put(trans);
bch_err_fn(c, ret);
return ret;
@@ -1767,37 +1689,36 @@ int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
return ret;
}
-static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k)
+/* Returns true iff exactly one of the snapshot's two child slots is nonzero. */
+static bool interior_snapshot_needs_delete(struct bkey_s_c_snapshot snap)
{
-	struct bch_fs *c = trans->c;
-	struct bkey_s_c_snapshot snap;
-	int ret = 0;
+	/* If there's one child, it's redundant and keys will be moved to the child */
+	return !snap.v->children[0] != !snap.v->children[1];
+}
+/*
+ * Set BCH_FS_need_delete_dead_snapshots if @k is a snapshot key that is
+ * either flagged deleted or is an interior node with a single child.
+ * Always returns 0.
+ */
+static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k)
+{
	if (k.k->type != KEY_TYPE_snapshot)
		return 0;
-	snap = bkey_s_c_to_snapshot(k);
-	if (BCH_SNAPSHOT_DELETED(snap.v) ||
-	    bch2_snapshot_equiv(c, k.k->p.offset) != k.k->p.offset ||
-	    (ret = bch2_snapshot_needs_delete(trans, k)) > 0) {
-		set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
-		return 0;
-	}
+	struct bkey_s_c_snapshot snap = bkey_s_c_to_snapshot(k);
+	bool needs_delete = BCH_SNAPSHOT_DELETED(snap.v) ||
+		interior_snapshot_needs_delete(snap);
+
+	if (needs_delete)
+		set_bit(BCH_FS_need_delete_dead_snapshots, &trans->c->flags);
-	return ret;
+	return 0;
}
int bch2_snapshots_read(struct bch_fs *c)
{
+ /*
+ * Initializing the is_ancestor bitmaps requires ancestors to already be
+ * initialized - so mark in reverse:
+ */
int ret = bch2_trans_run(c,
- for_each_btree_key(trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k,
+ for_each_btree_key_reverse(trans, iter, BTREE_ID_snapshots,
+ POS_MAX, 0, k,
__bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
- bch2_snapshot_set_equiv(trans, k) ?:
- bch2_check_snapshot_needs_deletion(trans, k)) ?:
- for_each_btree_key(trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k,
- (set_is_ancestor_bitmap(c, k.k->p.offset), 0)));
+ bch2_check_snapshot_needs_deletion(trans, k)));
bch_err_fn(c, ret);
/*
diff --git a/libbcachefs/snapshot.h b/libbcachefs/snapshot.h
index ae23d45f..00373cf3 100644
--- a/libbcachefs/snapshot.h
+++ b/libbcachefs/snapshot.h
@@ -119,19 +119,19 @@ static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id)
return id;
}
-static inline u32 __bch2_snapshot_equiv(struct bch_fs *c, u32 id)
+/*
+ * Returns true if @id has an entry in the snapshot table and that entry is
+ * marked live. Takes no locks itself - bch2_snapshot_exists() calls it under
+ * rcu_read_lock().
+ */
+static inline bool __bch2_snapshot_exists(struct bch_fs *c, u32 id)
{
	const struct snapshot_t *s = snapshot_t(c, id);
-	return s ? s->equiv : 0;
+	return s && s->live;
}
-static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id)
+/* As __bch2_snapshot_exists(), with rcu_read_lock() held around the lookup. */
+static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id)
{
	rcu_read_lock();
-	id = __bch2_snapshot_equiv(c, id);
+	bool exists = __bch2_snapshot_exists(c, id);
	rcu_read_unlock();
-	return id;
+	return exists;
}
static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id)
diff --git a/libbcachefs/str_hash.c b/libbcachefs/str_hash.c
index c3276a7e..f5977c5c 100644
--- a/libbcachefs/str_hash.c
+++ b/libbcachefs/str_hash.c
@@ -101,38 +101,108 @@ static int hash_pick_winner(struct btree_trans *trans,
}
}
-int bch2_str_hash_check_key(struct btree_trans *trans,
- struct snapshots_seen *s,
- const struct bch_hash_desc desc,
- struct bch_hash_info *hash_info,
- struct btree_iter *k_iter, struct bkey_s_c hash_k)
+/*
+ * Walk the other versions of inode @snapshot_root->bi_inum, in reverse key
+ * order starting just below @snapshot_root->bi_snapshot, looking for one whose
+ * hash seed or string hash type differs from @snapshot_root's.
+ *
+ * On the first mismatch that fsck agrees to fix, that inode is rewritten with
+ * the root's hash seed/type and the transaction is committed; the function
+ * then returns -BCH_ERR_transaction_restart_nested (or the write/commit
+ * error). Returns 0 if nothing was repaired.
+ */
+static int repair_inode_hash_info(struct btree_trans *trans,
+				  struct bch_inode_unpacked *snapshot_root)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret = 0;
+
+	for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes,
+			SPOS(0, snapshot_root->bi_inum, snapshot_root->bi_snapshot - 1),
+			BTREE_ITER_all_snapshots, k, ret) {
+		struct bch_inode_unpacked inode;
+
+		/* Walked past the last key belonging to this inode number */
+		if (k.k->p.offset != snapshot_root->bi_inum)
+			break;
+		if (!bkey_is_inode(k.k))
+			continue;
+
+		ret = bch2_inode_unpack(k, &inode);
+		if (ret)
+			break;
+
+		bool mismatch =
+			inode.bi_hash_seed != snapshot_root->bi_hash_seed ||
+			INODE_STR_HASH(&inode) != INODE_STR_HASH(snapshot_root);
+
+		if (!fsck_err_on(mismatch,
+				 trans, inode_snapshot_mismatch,
+				 "inode hash info in different snapshots don't match"))
+			continue;
+
+		inode.bi_hash_seed = snapshot_root->bi_hash_seed;
+		SET_INODE_STR_HASH(&inode, INODE_STR_HASH(snapshot_root));
+
+		ret = __bch2_fsck_write_inode(trans, &inode);
+		if (!ret)
+			ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+		if (!ret)
+			ret = -BCH_ERR_transaction_restart_nested;
+		break;
+	}
+fsck_err:
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+/*
+ * All versions of the same inode in different snapshots must have the same hash
+ * seed/type: verify that the hash info we're using matches the root
+ *
+ * Looks up the version of inode @inum with the highest snapshot ID, builds
+ * hash info from it, and compares against @hash_info. On mismatch,
+ * repair_inode_hash_info() is run.
+ *
+ * Returns 0 if @hash_info matches, otherwise a negative error code (including
+ * a nested transaction restart after a successful repair).
+ */
+static int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum,
+					       struct bch_hash_info *hash_info)
{
	struct bch_fs *c = trans->c;
-	struct btree_iter iter = { NULL };
-	struct printbuf buf = PRINTBUF;
+	struct btree_iter iter;
	struct bkey_s_c k;
-	u64 hash;
	int ret = 0;
-	if (hash_k.k->type != desc.key_type)
-		return 0;
+	for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, U32_MAX),
+					     BTREE_ITER_all_snapshots, k, ret) {
+		if (k.k->p.offset != inum)
+			break;
+		if (bkey_is_inode(k.k))
+			goto found;
+	}
+	/*
+	 * The loop also terminates when the iterator returns an error: don't
+	 * misreport that as a missing inode, propagate it as is.
+	 */
+	if (ret)
+		goto err;
+
+	bch_err(c, "%s(): inum %llu not found", __func__, inum);
+	ret = -BCH_ERR_fsck_repair_unimplemented;
+	goto err;
+found:;
+	struct bch_inode_unpacked inode;
+	ret = bch2_inode_unpack(k, &inode);
+	if (ret)
+		goto err;
-	hash = desc.hash_bkey(hash_info, hash_k);
+	struct bch_hash_info hash2 = bch2_hash_info_init(c, &inode);
+	/*
+	 * Compare field by field: memcmp() over the whole struct would also
+	 * compare padding bytes, which are not guaranteed to be initialized.
+	 */
+	if (hash_info->type != hash2.type ||
+	    memcmp(&hash_info->siphash_key, &hash2.siphash_key, sizeof(hash2.siphash_key))) {
+		ret = repair_inode_hash_info(trans, &inode);
+		if (!ret) {
+			bch_err(c, "inode hash info mismatch with root, but mismatch not found");
+			ret = -BCH_ERR_fsck_repair_unimplemented;
+		}
+	}
+err:
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
- if (likely(hash == hash_k.k->p.offset))
- return 0;
+int __bch2_str_hash_check_key(struct btree_trans *trans,
+ struct snapshots_seen *s,
+ const struct bch_hash_desc *desc,
+ struct bch_hash_info *hash_info,
+ struct btree_iter *k_iter, struct bkey_s_c hash_k)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter = { NULL };
+ struct printbuf buf = PRINTBUF;
+ struct bkey_s_c k;
+ int ret = 0;
+ u64 hash = desc->hash_bkey(hash_info, hash_k);
if (hash_k.k->p.offset < hash)
goto bad_hash;
- for_each_btree_key_norestart(trans, iter, desc.btree_id,
+ for_each_btree_key_norestart(trans, iter, desc->btree_id,
SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
BTREE_ITER_slots, k, ret) {
if (bkey_eq(k.k->p, hash_k.k->p))
break;
- if (k.k->type == desc.key_type &&
- !desc.cmp_bkey(k, hash_k))
+ if (k.k->type == desc->key_type &&
+ !desc->cmp_bkey(k, hash_k))
goto duplicate_entries;
if (bkey_deleted(k.k)) {
@@ -145,16 +215,23 @@ out:
printbuf_exit(&buf);
return ret;
bad_hash:
+ /*
+ * Before doing any repair, check hash_info itself:
+ */
+ ret = check_inode_hash_info_matches_root(trans, hash_k.k->p.inode, hash_info);
+ if (ret)
+ goto out;
+
if (fsck_err(trans, hash_table_key_wrong_offset,
"hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n %s",
- bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash,
+ bch2_btree_id_str(desc->btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, hash_k);
if (IS_ERR(new))
return PTR_ERR(new);
- k = bch2_hash_set_or_get_in_snapshot(trans, &iter, desc, hash_info,
+ k = bch2_hash_set_or_get_in_snapshot(trans, &iter, *desc, hash_info,
(subvol_inum) { 0, hash_k.k->p.inode },
hash_k.k->p.snapshot, new,
STR_HASH_must_create|
@@ -166,9 +243,9 @@ bad_hash:
if (k.k)
goto duplicate_entries;
- ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter,
+ ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter,
BTREE_UPDATE_internal_snapshot_node) ?:
- bch2_fsck_update_backpointers(trans, s, desc, hash_info, new) ?:
+ bch2_fsck_update_backpointers(trans, s, *desc, hash_info, new) ?:
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
-BCH_ERR_transaction_restart_nested;
goto out;
@@ -176,7 +253,7 @@ bad_hash:
fsck_err:
goto out;
duplicate_entries:
- ret = hash_pick_winner(trans, desc, hash_info, hash_k, k);
+ ret = hash_pick_winner(trans, *desc, hash_info, hash_k, k);
if (ret < 0)
goto out;
@@ -192,14 +269,14 @@ duplicate_entries:
switch (ret) {
case 0:
- ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0);
+ ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0);
break;
case 1:
- ret = bch2_hash_delete_at(trans, desc, hash_info, &iter, 0);
+ ret = bch2_hash_delete_at(trans, *desc, hash_info, &iter, 0);
break;
case 2:
- ret = fsck_rename_dirent(trans, s, desc, hash_info, bkey_s_c_to_dirent(hash_k)) ?:
- bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0);
+ ret = fsck_rename_dirent(trans, s, *desc, hash_info, bkey_s_c_to_dirent(hash_k)) ?:
+ bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0);
goto out;
}
diff --git a/libbcachefs/str_hash.h b/libbcachefs/str_hash.h
index 0c20f3af..55a4ac7b 100644
--- a/libbcachefs/str_hash.h
+++ b/libbcachefs/str_hash.h
@@ -394,10 +394,25 @@ int bch2_hash_delete(struct btree_trans *trans,
}
struct snapshots_seen;
-int bch2_str_hash_check_key(struct btree_trans *,
- struct snapshots_seen *,
- const struct bch_hash_desc,
- struct bch_hash_info *,
- struct btree_iter *, struct bkey_s_c);
+int __bch2_str_hash_check_key(struct btree_trans *,
+ struct snapshots_seen *,
+ const struct bch_hash_desc *,
+ struct bch_hash_info *,
+ struct btree_iter *, struct bkey_s_c);
+
+/*
+ * Inline fast path for hash table key checking: keys whose type is not the
+ * hash table's key type, and keys already stored at the offset they hash to,
+ * need no further work and return 0. Everything else is handed to
+ * __bch2_str_hash_check_key().
+ */
+static inline int bch2_str_hash_check_key(struct btree_trans *trans,
+					   struct snapshots_seen *s,
+					   const struct bch_hash_desc *desc,
+					   struct bch_hash_info *hash_info,
+					   struct btree_iter *k_iter, struct bkey_s_c hash_k)
+{
+	/* The hash is only computed for keys of the matching type */
+	bool wrong_offset =
+		hash_k.k->type == desc->key_type &&
+		unlikely(desc->hash_bkey(hash_info, hash_k) != hash_k.k->p.offset);
+
+	return wrong_offset
+		? __bch2_str_hash_check_key(trans, s, desc, hash_info, k_iter, hash_k)
+		: 0;
+}
#endif /* _BCACHEFS_STR_HASH_H */
diff --git a/libbcachefs/subvolume.c b/libbcachefs/subvolume.c
index 0e756e35..e3d04752 100644
--- a/libbcachefs/subvolume.c
+++ b/libbcachefs/subvolume.c
@@ -409,26 +409,60 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d
*/
static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
{
- struct btree_iter iter;
- struct bkey_s_c_subvolume subvol;
- u32 snapid;
- int ret = 0;
+ struct btree_iter subvol_iter = {}, snapshot_iter = {}, snapshot_tree_iter = {};
- subvol = bch2_bkey_get_iter_typed(trans, &iter,
+ struct bkey_s_c_subvolume subvol =
+ bch2_bkey_get_iter_typed(trans, &subvol_iter,
BTREE_ID_subvolumes, POS(0, subvolid),
BTREE_ITER_cached|BTREE_ITER_intent,
subvolume);
- ret = bkey_err(subvol);
+ int ret = bkey_err(subvol);
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
"missing subvolume %u", subvolid);
if (ret)
- return ret;
+ goto err;
- snapid = le32_to_cpu(subvol.v->snapshot);
+ u32 snapid = le32_to_cpu(subvol.v->snapshot);
+
+ struct bkey_s_c_snapshot snapshot =
+ bch2_bkey_get_iter_typed(trans, &snapshot_iter,
+ BTREE_ID_snapshots, POS(0, snapid),
+ 0, snapshot);
+ ret = bkey_err(snapshot);
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
+ "missing snapshot %u", snapid);
+ if (ret)
+ goto err;
+
+ u32 treeid = le32_to_cpu(snapshot.v->tree);
- ret = bch2_btree_delete_at(trans, &iter, 0) ?:
+ struct bkey_s_c_snapshot_tree snapshot_tree =
+ bch2_bkey_get_iter_typed(trans, &snapshot_tree_iter,
+ BTREE_ID_snapshot_trees, POS(0, treeid),
+ 0, snapshot_tree);
+ /* Check the lookup before reading snapshot_tree.v, as for the lookups above: */
+ ret = bkey_err(snapshot_tree);
+ if (ret)
+ goto err;
+
+ if (le32_to_cpu(snapshot_tree.v->master_subvol) == subvolid) {
+ struct bkey_i_snapshot_tree *snapshot_tree_mut =
+ bch2_bkey_make_mut_typed(trans, &snapshot_tree_iter,
+ &snapshot_tree.s_c,
+ 0, snapshot_tree);
+ ret = PTR_ERR_OR_ZERO(snapshot_tree_mut);
+ if (ret)
+ goto err;
+
+ snapshot_tree_mut->v.master_subvol = 0;
+ }
+
+ ret = bch2_btree_delete_at(trans, &subvol_iter, 0) ?:
bch2_snapshot_node_set_deleted(trans, snapid);
- bch2_trans_iter_exit(trans, &iter);
+err:
+ bch2_trans_iter_exit(trans, &snapshot_tree_iter);
+ bch2_trans_iter_exit(trans, &snapshot_iter);
+ bch2_trans_iter_exit(trans, &subvol_iter);
return ret;
}
diff --git a/libbcachefs/subvolume_types.h b/libbcachefs/subvolume_types.h
index f2ec4277..1549d6da 100644
--- a/libbcachefs/subvolume_types.h
+++ b/libbcachefs/subvolume_types.h
@@ -9,13 +9,13 @@ typedef DARRAY(u32) snapshot_id_list;
#define IS_ANCESTOR_BITMAP 128
struct snapshot_t {
+ bool live;
u32 parent;
u32 skip[3];
u32 depth;
u32 children[2];
u32 subvol; /* Nonzero only if a subvolume points to this node: */
u32 tree;
- u32 equiv;
unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)];
};
diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index dbc09e30..8037ccba 100644
--- a/libbcachefs/super-io.c
+++ b/libbcachefs/super-io.c
@@ -1084,9 +1084,16 @@ int bch2_write_super(struct bch_fs *c)
": Superblock write was silently dropped! (seq %llu expected %llu)",
le64_to_cpu(ca->sb_read_scratch->seq),
ca->disk_sb.seq);
- bch2_fs_fatal_error(c, "%s", buf.buf);
+
+ if (c->opts.errors != BCH_ON_ERROR_continue &&
+ c->opts.errors != BCH_ON_ERROR_fix_safe) {
+ ret = -BCH_ERR_erofs_sb_err;
+ bch2_fs_fatal_error(c, "%s", buf.buf);
+ } else {
+ bch_err(c, "%s", buf.buf);
+ }
+
printbuf_exit(&buf);
- ret = -BCH_ERR_erofs_sb_err;
}
if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) {
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index bd6aecea..d97ea7bd 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -563,6 +563,7 @@ static void __bch2_fs_free(struct bch_fs *c)
bch2_io_clock_exit(&c->io_clock[WRITE]);
bch2_io_clock_exit(&c->io_clock[READ]);
bch2_fs_compress_exit(c);
+ bch2_fs_btree_gc_exit(c);
bch2_journal_keys_put_initial(c);
bch2_find_btree_nodes_exit(&c->found_btree_nodes);
BUG_ON(atomic_read(&c->journal_keys.ref));
@@ -770,13 +771,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
spin_lock_init(&c->recovery_pass_lock);
sema_init(&c->online_fsck_mutex, 1);
- init_rwsem(&c->gc_lock);
- mutex_init(&c->gc_gens_lock);
-
for (i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_init(&c->times[i]);
- bch2_fs_gc_init(c);
bch2_fs_copygc_init(c);
bch2_fs_btree_key_cache_init_early(&c->btree_key_cache);
bch2_fs_btree_iter_init_early(c);
@@ -911,6 +908,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_btree_cache_init(c) ?:
bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?:
bch2_fs_btree_interior_update_init(c) ?:
+ bch2_fs_btree_gc_init(c) ?:
bch2_fs_buckets_waiting_for_journal_init(c) ?:
bch2_fs_btree_write_buffer_init(c) ?:
bch2_fs_subvolumes_init(c) ?:
@@ -1306,8 +1304,6 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
init_completion(&ca->ref_completion);
init_completion(&ca->io_ref_completion);
- init_rwsem(&ca->bucket_lock);
-
INIT_WORK(&ca->io_error_work, bch2_io_error_work);
bch2_time_stats_quantiles_init(&ca->io_latency[READ]);
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 97733c76..a7eb1f51 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -203,7 +203,6 @@ read_attribute(disk_groups);
read_attribute(has_data);
read_attribute(alloc_debug);
-read_attribute(accounting);
read_attribute(usage_base);
#define x(t, n, ...) read_attribute(t);
@@ -397,9 +396,6 @@ SHOW(bch2_fs)
if (attr == &sysfs_alloc_debug)
bch2_fs_alloc_debug_to_text(out, c);
- if (attr == &sysfs_accounting)
- bch2_fs_accounting_to_text(out, c);
-
if (attr == &sysfs_usage_base)
bch2_fs_usage_base_to_text(out, c);
@@ -509,15 +505,22 @@ SHOW(bch2_fs_counters)
printbuf_tabstop_push(out, 32);
- #define x(t, ...) \
+ #define x(t, n, f, ...) \
if (attr == &sysfs_##t) { \
counter = percpu_u64_get(&c->counters[BCH_COUNTER_##t]);\
counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_##t];\
+ if (f & TYPE_SECTORS) { \
+ counter <<= 9; \
+ counter_since_mount <<= 9; \
+ } \
+ \
prt_printf(out, "since mount:\t"); \
+ (f & TYPE_COUNTER) ? prt_u64(out, counter_since_mount) :\
prt_human_readable_u64(out, counter_since_mount); \
prt_newline(out); \
\
prt_printf(out, "since filesystem creation:\t"); \
+ (f & TYPE_COUNTER) ? prt_u64(out, counter) : \
prt_human_readable_u64(out, counter); \
prt_newline(out); \
}
@@ -595,7 +598,6 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_disk_groups,
&sysfs_alloc_debug,
- &sysfs_accounting,
&sysfs_usage_base,
NULL
};
diff --git a/libbcachefs/trace.h b/libbcachefs/trace.h
index 7baf66be..9d40b7d4 100644
--- a/libbcachefs/trace.h
+++ b/libbcachefs/trace.h
@@ -1338,6 +1338,12 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced,
__entry->new_u64s)
);
+DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
+);
+
TRACE_EVENT(path_downgrade,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
@@ -1374,10 +1380,21 @@ TRACE_EVENT(path_downgrade,
__entry->pos_snapshot)
);
-DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip),
- TP_ARGS(trans, caller_ip)
+TRACE_EVENT(key_cache_fill,
+ TP_PROTO(struct btree_trans *trans, const char *key),
+ TP_ARGS(trans, key),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __string(key, key )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __assign_str(key);
+ ),
+
+ TP_printk("%s %s", __entry->trans_fn, __get_str(key))
);
TRACE_EVENT(write_buffer_flush,
@@ -1436,6 +1453,26 @@ TRACE_EVENT(write_buffer_flush_slowpath,
TP_printk("%zu/%zu", __entry->slowpath, __entry->total)
);
+/* Records the transaction's function name, the caller address and a key string. */
+TRACE_EVENT(write_buffer_maybe_flush,
+ TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *key),
+ TP_ARGS(trans, caller_ip, key),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __string(key, key )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __assign_str(key);
+ ),
+
+ TP_printk("%s %pS %s", __entry->trans_fn, (void *) __entry->caller_ip, __get_str(key))
+);
+
DEFINE_EVENT(fs_str, rebalance_extent,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
diff --git a/libbcachefs/util.h b/libbcachefs/util.h
index c292b9ce..1a172011 100644
--- a/libbcachefs/util.h
+++ b/libbcachefs/util.h
@@ -55,6 +55,16 @@ static inline size_t buf_pages(void *p, size_t len)
PAGE_SIZE);
}
+static inline void *bch2_kvmalloc(size_t n, gfp_t flags)
+{
+ void *p = unlikely(n >= INT_MAX)
+ ? vmalloc(n)
+ : kvmalloc(n, flags & ~__GFP_ZERO);
+ if (p && (flags & __GFP_ZERO))
+ memset(p, 0, n);
+ return p;
+}
+
#define init_heap(heap, _size, gfp) \
({ \
(heap)->nr = 0; \
diff --git a/libbcachefs/varint.c b/libbcachefs/varint.c
index 6a78553d..6620ecae 100644
--- a/libbcachefs/varint.c
+++ b/libbcachefs/varint.c
@@ -9,6 +9,7 @@
#include <valgrind/memcheck.h>
#endif
+#include "errcode.h"
#include "varint.h"
/**
@@ -53,7 +54,7 @@ int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out)
u64 v;
if (unlikely(in + bytes > end))
- return -1;
+ return -BCH_ERR_varint_decode_error;
if (likely(bytes < 9)) {
__le64 v_le = 0;
@@ -115,7 +116,7 @@ int bch2_varint_decode_fast(const u8 *in, const u8 *end, u64 *out)
unsigned bytes = ffz(*in) + 1;
if (unlikely(in + bytes > end))
- return -1;
+ return -BCH_ERR_varint_decode_error;
if (likely(bytes < 9)) {
v >>= bytes;