summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Qu Wenruo <wqu@suse.com> 2025-05-01 08:37:54 +0930
committer: David Sterba <dsterba@suse.com> 2025-05-15 14:30:54 +0200
commit: ec1f3a207cdf314eae4d4ae145f1ffdb829f0652 (patch)
tree: a0240bde2fc7b41fd74666b000a4abb1a1e2d82c
parent: 45a59513b4b2d9b822b241c73b57b29b9e92b245 (diff)
btrfs: scrub: update device stats when an error is detected
[BUG]
Since the migration to the new scrub_stripe interface, scrub no longer
updates the device stats when hitting an error, no matter if it's a read
or checksum mismatch error. E.g.:

  BTRFS info (device dm-2): scrub: started on devid 1
  BTRFS error (device dm-2): unable to fixup (regular) error at logical 13631488 on dev /dev/mapper/test-scratch1 physical 13631488
  BTRFS warning (device dm-2): checksum error at logical 13631488 on dev /dev/mapper/test-scratch1, physical 13631488, root 5, inode 257, offset 0, length 4096, links 1 (path: file)
  BTRFS error (device dm-2): unable to fixup (regular) error at logical 13631488 on dev /dev/mapper/test-scratch1 physical 13631488
  BTRFS warning (device dm-2): checksum error at logical 13631488 on dev /dev/mapper/test-scratch1, physical 13631488, root 5, inode 257, offset 0, length 4096, links 1 (path: file)
  BTRFS info (device dm-2): scrub: finished on devid 1 with status: 0

Note there is no line showing the device stats error update.

[CAUSE]
In the migration to the new scrub_stripe interface, we no longer call
btrfs_dev_stat_inc_and_print().

[FIX]
- Introduce a new bitmap for metadata generation errors
  * A new bitmap @meta_gen_error_bitmap is introduced to record which
    blocks have metadata generation mismatch errors.

  * A new counter for that bitmap, @init_nr_meta_gen_errors, is also
    introduced to store the number of generation mismatch errors
    that are found during the initial read.

    This is for the error reporting at scrub_stripe_report_errors().

  * New dedicated error message for unrepaired generation mismatches

  * Update @meta_gen_error_bitmap if a transid mismatch is hit

- Add btrfs_dev_stat_inc_and_print() calls to the following call sites
  * scrub_stripe_report_errors()
  * scrub_write_endio()
    This is only for the write errors.
This means there is a minor behavior change:

- The timing of device stats error messages
  Since we concentrate the error messages at scrub_stripe_report_errors(),
  the device stats error messages will all show up in one go, after the
  detailed scrub error messages:

  BTRFS error (device dm-2): unable to fixup (regular) error at logical 13631488 on dev /dev/mapper/test-scratch1 physical 13631488
  BTRFS warning (device dm-2): checksum error at logical 13631488 on dev /dev/mapper/test-scratch1, physical 13631488, root 5, inode 257, offset 0, length 4096, links 1 (path: file)
  BTRFS error (device dm-2): unable to fixup (regular) error at logical 13631488 on dev /dev/mapper/test-scratch1 physical 13631488
  BTRFS warning (device dm-2): checksum error at logical 13631488 on dev /dev/mapper/test-scratch1, physical 13631488, root 5, inode 257, offset 0, length 4096, links 1 (path: file)
  BTRFS error (device dm-2): bdev /dev/mapper/test-scratch1 errs: wr 0, rd 0, flush 0, corrupt 1, gen 0
  BTRFS error (device dm-2): bdev /dev/mapper/test-scratch1 errs: wr 0, rd 0, flush 0, corrupt 2, gen 0

Fixes: e02ee89baa66 ("btrfs: scrub: switch scrub_simple_mirror() to scrub_stripe infrastructure")
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
-rw-r--r--fs/btrfs/scrub.c32
1 files changed, 29 insertions, 3 deletions
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 5d6166fd917e..bf503419e0e9 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -153,12 +153,14 @@ struct scrub_stripe {
unsigned int init_nr_io_errors;
unsigned int init_nr_csum_errors;
unsigned int init_nr_meta_errors;
+ unsigned int init_nr_meta_gen_errors;
/*
* The following error bitmaps are all for the current status.
* Every time we submit a new read, these bitmaps may be updated.
*
- * error_bitmap = io_error_bitmap | csum_error_bitmap | meta_error_bitmap;
+ * error_bitmap = io_error_bitmap | csum_error_bitmap |
+ * meta_error_bitmap | meta_gen_error_bitmap;
*
* IO and csum errors can happen for both metadata and data.
*/
@@ -166,6 +168,7 @@ struct scrub_stripe {
unsigned long io_error_bitmap;
unsigned long csum_error_bitmap;
unsigned long meta_error_bitmap;
+ unsigned long meta_gen_error_bitmap;
/* For writeback (repair or replace) error reporting. */
unsigned long write_error_bitmap;
@@ -662,7 +665,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr
}
if (stripe->sectors[sector_nr].generation !=
btrfs_stack_header_generation(header)) {
- bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
+ bitmap_set(&stripe->meta_gen_error_bitmap, sector_nr, sectors_per_tree);
bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
btrfs_warn_rl(fs_info,
"tree block %llu mirror %u has bad generation, has %llu want %llu",
@@ -674,6 +677,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr
bitmap_clear(&stripe->error_bitmap, sector_nr, sectors_per_tree);
bitmap_clear(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree);
bitmap_clear(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
+ bitmap_clear(&stripe->meta_gen_error_bitmap, sector_nr, sectors_per_tree);
}
static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
@@ -971,8 +975,22 @@ skip:
if (__ratelimit(&rs) && dev)
scrub_print_common_warning("header error", dev, false,
stripe->logical, physical);
+ if (test_bit(sector_nr, &stripe->meta_gen_error_bitmap))
+ if (__ratelimit(&rs) && dev)
+ scrub_print_common_warning("generation error", dev, false,
+ stripe->logical, physical);
}
+ /* Update the device stats. */
+ for (int i = 0; i < stripe->init_nr_io_errors; i++)
+ btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_READ_ERRS);
+ for (int i = 0; i < stripe->init_nr_csum_errors; i++)
+ btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
+ /* Generation mismatch error is based on each metadata, not each block. */
+ for (int i = 0; i < stripe->init_nr_meta_gen_errors;
+ i += (fs_info->nodesize >> fs_info->sectorsize_bits))
+ btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_GENERATION_ERRS);
+
spin_lock(&sctx->stat_lock);
sctx->stat.data_extents_scrubbed += stripe->nr_data_extents;
sctx->stat.tree_extents_scrubbed += stripe->nr_meta_extents;
@@ -981,7 +999,8 @@ skip:
sctx->stat.no_csum += nr_nodatacsum_sectors;
sctx->stat.read_errors += stripe->init_nr_io_errors;
sctx->stat.csum_errors += stripe->init_nr_csum_errors;
- sctx->stat.verify_errors += stripe->init_nr_meta_errors;
+ sctx->stat.verify_errors += stripe->init_nr_meta_errors +
+ stripe->init_nr_meta_gen_errors;
sctx->stat.uncorrectable_errors +=
bitmap_weight(&stripe->error_bitmap, stripe->nr_sectors);
sctx->stat.corrected_errors += nr_repaired_sectors;
@@ -1027,6 +1046,8 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work)
stripe->nr_sectors);
stripe->init_nr_meta_errors = bitmap_weight(&stripe->meta_error_bitmap,
stripe->nr_sectors);
+ stripe->init_nr_meta_gen_errors = bitmap_weight(&stripe->meta_gen_error_bitmap,
+ stripe->nr_sectors);
if (bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors))
goto out;
@@ -1141,6 +1162,9 @@ static void scrub_write_endio(struct btrfs_bio *bbio)
bitmap_set(&stripe->write_error_bitmap, sector_nr,
bio_size >> fs_info->sectorsize_bits);
spin_unlock_irqrestore(&stripe->write_error_lock, flags);
+ for (int i = 0; i < (bio_size >> fs_info->sectorsize_bits); i++)
+ btrfs_dev_stat_inc_and_print(stripe->dev,
+ BTRFS_DEV_STAT_WRITE_ERRS);
}
bio_put(&bbio->bio);
@@ -1502,10 +1526,12 @@ static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe)
stripe->init_nr_io_errors = 0;
stripe->init_nr_csum_errors = 0;
stripe->init_nr_meta_errors = 0;
+ stripe->init_nr_meta_gen_errors = 0;
stripe->error_bitmap = 0;
stripe->io_error_bitmap = 0;
stripe->csum_error_bitmap = 0;
stripe->meta_error_bitmap = 0;
+ stripe->meta_gen_error_bitmap = 0;
}
/*