diff options
Diffstat (limited to 'tests/fs/bcachefs/replication.ktest')
-rwxr-xr-x | tests/fs/bcachefs/replication.ktest | 266 |
1 files changed, 258 insertions, 8 deletions
diff --git a/tests/fs/bcachefs/replication.ktest b/tests/fs/bcachefs/replication.ktest index 14c74f7..cadeaa1 100755 --- a/tests/fs/bcachefs/replication.ktest +++ b/tests/fs/bcachefs/replication.ktest @@ -62,7 +62,7 @@ test_twodevices() test_largebuckets() { - set_watchdog 30 + set_watchdog 60 run_quiet "" bcachefs format -f \ --bucket_size=8M \ ${ktest_scratch_dev[0]} \ @@ -425,10 +425,10 @@ test_device_set_state_offline() mount -t bcachefs ${ktest_scratch_dev[0]}:${ktest_scratch_dev[1]}:${ktest_scratch_dev[2]} /mnt local fioout="$ktest_out/fio-out" - run_fio_randrw >"$fioout" 2>&1 & + run_fio_randrw --runtime=60 >"$fioout" 2>&1 & local fiopid=$! - sleep 1 + sleep 10 bcachefs device set-state --force ro ${ktest_scratch_dev[1]} @@ -460,10 +460,10 @@ test_device_readd() mount -t bcachefs ${ktest_scratch_dev[0]}:${ktest_scratch_dev[1]} /mnt local fioout="$ktest_out/fio-out" - run_fio_randrw >"$fioout" 2>&1 & + run_fio_randrw --runtime=60 >"$fioout" 2>&1 & local fiopid=$! - sleep 1 + sleep 10 echo -n "offlining ${ktest_scratch_dev[0]}... " bcachefs device offline --force ${ktest_scratch_dev[0]} @@ -500,11 +500,11 @@ test_device_repeated_add_remove() for ii in {1..10}; do echo "add-remove run #$ii ----------------------------------------------------" - echo "bcachefs device add" + echo "bcachefs device add ${ktest_scratch_dev[1]}" bcachefs device add -f /mnt ${ktest_scratch_dev[1]} - echo "bcachefs device evacuate" + echo "bcachefs device evacuate ${ktest_scratch_dev[1]}" bcachefs device evacuate ${ktest_scratch_dev[1]} - echo "bcachefs device remove" + echo "bcachefs device remove ${ktest_scratch_dev[1]}" bcachefs device remove ${ktest_scratch_dev[1]} done @@ -586,6 +586,193 @@ test_replicas_read_errors() do_replicas_errors_test error_reads } +test_read_corrupt() +{ + setup_tracing + set_watchdog 180 + + echo 8 > /sys/module/bcachefs/parameters/read_corrupt_ratio + + run_quiet "" bcachefs format -f \ + ${ktest_scratch_dev[0]} ${ktest_scratch_dev[1]} + + mount -t bcachefs -o degraded ${ktest_scratch_dev[0]}:${ktest_scratch_dev[1]} /mnt + + run_fio_randrw --continue_on_error=io + echo 0 > /sys/module/bcachefs/parameters/read_corrupt_ratio + + # Check the read retry path for indirect extents: + #cp --reflink /mnt/fiotest /mnt/fiotest.reflinked + #dd if=/mnt/fiotest of=/dev/null bs=4k iflag=direct + + umount /mnt + + bcachefs fsck -ny ${ktest_scratch_dev[0]} ${ktest_scratch_dev[1]} + bcachefs_test_end_checks ${ktest_scratch_dev[0]} +} + +test_btree_read_corrupt() +{ + setup_tracing + set_watchdog 180 + + + run_quiet "" bcachefs format -f --replicas=2 \ + ${ktest_scratch_dev[0]} ${ktest_scratch_dev[1]} + + mount -t bcachefs ${ktest_scratch_dev[0]}:${ktest_scratch_dev[1]} /mnt + run_fio_randrw --continue_on_error=io + umount /mnt + + echo 8 > /sys/module/bcachefs/parameters/btree_read_corrupt_ratio + mount -t bcachefs ${ktest_scratch_dev[0]}:${ktest_scratch_dev[1]} /mnt + echo 0 > /sys/module/bcachefs/parameters/btree_read_corrupt_ratio + + dd if=/mnt/fiotest of=/dev/null bs=1M + umount /mnt + + bcachefs fsck -ny ${ktest_scratch_dev[0]} ${ktest_scratch_dev[1]} + bcachefs_test_end_checks ${ktest_scratch_dev[0]} +} + +test_kill_btree_node() +{ + set_watchdog 240 + run_quiet "" bcachefs format -f --replicas=2 ${ktest_scratch_dev[0]} ${ktest_scratch_dev[1]} + + mount -t bcachefs ${ktest_scratch_dev[0]}:${ktest_scratch_dev[1]} /mnt + cp -rL /usr/bin /mnt + umount /mnt + + # Doesn't yet work with the alloc btree: + for btree in extents ; do + echo "Killing a btree node in btree $btree " + local index=1 + + [[ $btree = freespace ]] && index=0 + + bcachefs kill_btree_node -d0 -n$btree:0:$index ${ktest_scratch_dev[0]} ${ktest_scratch_dev[1]} + + echo "Running fsck" + # How to assert exit status equals something specific with -o errexit? + mount -t bcachefs -o fsck ${ktest_scratch_dev[0]}:${ktest_scratch_dev[1]} /mnt + echo "Checking contents" + diff -rq /usr/bin /mnt/bin + umount /mnt + #bcachefs fsck -y ${ktest_scratch_dev[0]} || true + + echo + echo "Running fsck again; should be clean" + + bcachefs fsck -ny ${ktest_scratch_dev[0]} ${ktest_scratch_dev[1]} + done + + bcachefs_test_end_checks ${ktest_scratch_dev[0]} +} + +test_read_corrupt_replicas() +{ + setup_tracing + set_watchdog 180 + + echo 64 > /sys/module/bcachefs/parameters/read_corrupt_ratio + + run_quiet "" bcachefs format -f \ + --replicas=2 \ + ${ktest_scratch_dev[0]} ${ktest_scratch_dev[1]} + + mount -t bcachefs -o degraded ${ktest_scratch_dev[0]}:${ktest_scratch_dev[1]} /mnt + + #gc_torture_workload + run_fio_randrw --continue_on_error=io + + # Check the read retry path for indirect extents: + #cp --reflink /mnt/fiotest /mnt/fiotest.reflinked + #dd if=/mnt/fiotest of=/dev/null bs=4k iflag=direct + + umount /mnt + + bcachefs fsck -ny ${ktest_scratch_dev[0]} ${ktest_scratch_dev[1]} + bcachefs_test_end_checks ${ktest_scratch_dev[0]} + echo 0 > /sys/module/bcachefs/parameters/read_corrupt_ratio + true +} + +test_write_corrupt() +{ + setup_tracing + set_watchdog 180 + + run_quiet "" bcachefs format -f ${ktest_scratch_dev[0]} + + mount -t bcachefs ${ktest_scratch_dev[0]} /mnt + + echo 1 > /sys/module/bcachefs/parameters/write_corrupt_ratio + #gc_torture_workload + #run_fio_randrw --continue_on_error=io + dd if=/dev/zero of=/mnt/foo bs=1M count=1024 oflag=direct + echo 0 > /sys/module/bcachefs/parameters/write_corrupt_ratio + + bcachefs device add -f --label=background /mnt ${ktest_scratch_dev[1]} + + echo background > /sys/fs/bcachefs/*/options/background_target + + umount /mnt + + bcachefs fsck -ny ${ktest_scratch_dev[0]} ${ktest_scratch_dev[1]} + bcachefs_test_end_checks ${ktest_scratch_dev[0]} + + echo 0 > /sys/module/bcachefs/parameters/write_corrupt_ratio + true +} + +test_evacuate_corrupted() +{ + setup_tracing + set_watchdog 180 + + run_quiet "" bcachefs format -f --replicas=2 \ + ${ktest_scratch_dev[0]} ${ktest_scratch_dev[1]} + + mount -t bcachefs ${ktest_scratch_dev[0]}:${ktest_scratch_dev[1]} /mnt + + echo 32 > /sys/module/bcachefs/parameters/write_corrupt_ratio + run_fio_randrw --continue_on_error=io + echo 0 > /sys/module/bcachefs/parameters/write_corrupt_ratio + + bcachefs device evacuate --force ${ktest_scratch_dev[1]} + bcachefs device remove --force ${ktest_scratch_dev[1]} + + run_fio_randrw --continue_on_error=io --verify_only=1 + + umount /mnt + + bcachefs fsck -ny ${ktest_scratch_dev[0]} ${ktest_scratch_dev[1]} + + # we expect lots of io_move_start_fail when we're degraded: + #bcachefs_test_end_checks ${ktest_scratch_dev[0]} +} + +test_evacuate_errors() +{ + setup_tracing + set_watchdog 180 + + run_quiet "" bcachefs format -f --metadata_replicas=2 \ + ${ktest_scratch_dev[0]} ${ktest_scratch_dev[1]} ${ktest_scratch_dev[2]} + + mount -t bcachefs ${ktest_scratch_dev[0]} /mnt + + run_fio_randrw --continue_on_error=io + + #bcachefs device evacuate /dev/mapper/flakey + bcachefs device remove --force ${ktest_scratch_dev[1]} + + run_fio_randrw --continue_on_error=io --verify_only=1 + umount /mnt + true +} + test_cmd_fs_usage() { set_watchdog 240 @@ -650,6 +837,69 @@ test_rereplicate() bcachefs_test_end_checks ${ktest_scratch_dev[0]} } +test_rereplicate2() +{ + echo ":: format with replicas=1 (default)" + run_quiet "" bcachefs format -f \ + ${ktest_scratch_dev[0]} \ + ${ktest_scratch_dev[1]} + + mount -t bcachefs ${ktest_scratch_dev[0]}:${ktest_scratch_dev[1]} /mnt + + echo ":: write to fs, while replicas=1" + touch /mnt/empty-file + + echo ":: we should have some durability=1 data now" + bcachefs fs usage -h /mnt + + echo ":: set replicas=2 and run rereplicate" + echo 2 > /sys/fs/bcachefs/*/options/data_replicas + echo 2 > /sys/fs/bcachefs/*/options/metadata_replicas + bcachefs data rereplicate /mnt + + # echo ":: running rereplicate a second time seems to guarantee all data has durability=2" + # bcachefs data rereplicate /mnt + + echo ":: all data should be replicated to both devices now, verifying..." + local fs_usage_out=$(bcachefs fs usage -h /mnt) + echo "$fs_usage_out" + local residual_durability_1_data=$(grep -E '^(btree|user):' <<<"$fs_usage_out" | awk '$3 == "1"') + + umount /mnt + + local dev_remove= + if ! [[ -n "$residual_durability_1_data" ]]; then + echo ":: no residual durability=1 data found" + dev_remove="vdb" + echo ":: we will simulate loss of device '$dev_remove' to verify proper replication" + else + echo ":: found residual durability=1 data:" + echo "$residual_durability_1_data" + + local first_spof_dev=$(head -n1 <<<"$residual_durability_1_data" | grep -oP '\[\K[^\]]+' | awk '{print $1}') + dev_remove="$first_spof_dev" + echo ":: we will simulate loss of device '$dev_remove', which we suspect of being a single-point-of-failure" + fi + + # we want to keep the other device + local dev_keep= + if [[ "$dev_remove" == "vdb" ]]; then + dev_keep="vdc" + elif [[ "$dev_remove" == "vdc" ]]; then + dev_keep="vdb" + else + exit 1 + fi + + echo ":: wipe the super-block on device '$dev_remove' to prevent auto-discovery durring mount" + dd if=/dev/zero of=/dev/$dev_remove bs=1M count=1 oflag=direct + + echo ":: attempt degraded mount with only device '$dev_keep'" + mount -t bcachefs -o degraded,fsck,fix_errors /dev/$dev_keep /mnt + + umount /mnt +} + disabled_test_device_add_faults() { setup_tracing 'bcachefs:*' |