#! /bin/bash
# SPDX-License-Identifier: GPL-2.0
# Copyright (C) 2021 SUSE Linux Products GmbH. All Rights Reserved.
#
# FSQA Test No. 240
#
# Test a scenario where we do several partial writes into multiple preallocated
# extents across two transactions and with several fsyncs in between. The goal
# is to check that the fsyncs succeed. This scenario used to trigger an -EIO
# failure on the last fsync and turn the filesystem to RO mode because of a
# transaction abort.
#
. ./common/preamble
# _begin_fstest is not defined in this file (it is expected to come from the
# preamble sourced above). NOTE(review): its arguments look like the test
# groups this test belongs to (auto, quick, prealloc, log) - confirm.
_begin_fstest auto quick prealloc log

# Override the default cleanup function.
#
# Calls _cleanup_flakey (defined outside this file; presumably it tears down
# the dm-flakey device), changes directory to / and removes the temporary
# files whose names start with "$tmp.".
_cleanup()
{
	_cleanup_flakey
	cd /
	# Quote the prefix so that a path containing whitespace or glob
	# characters stays a single word, and skip the removal when $tmp is
	# unset or empty so the pattern can never degrade to ".*" inside /.
	if [ -n "$tmp" ]; then
		rm -f -- "$tmp".*
	fi
}

# Helper libraries. NOTE(review): _filter_xfs_io presumably comes from
# common/filter and the *_flakey helpers from common/dmflakey - confirm.
. ./common/filter
. ./common/dmflakey

# Requirements for running this test: btrfs only, plus the facilities used
# below (a scratch device, the dm "flakey" target and xfs_io's falloc command).
_supported_fs btrfs
_require_scratch
_require_dm_target flakey
_require_xfs_io_command "falloc"

# Make the scratch filesystem, appending both stdout and stderr of mkfs to
# $seqres.full, then initialize and mount via the flakey helpers.
# NOTE(review): presumably this routes the mount through the dm-flakey device
# so that _flakey_drop_and_remount can simulate a power failure later - confirm.
_scratch_mkfs >>$seqres.full 2>&1
_require_metadata_journaling $SCRATCH_DEV
_init_flakey
_mount_flakey

# Create our test file with 2 preallocated extents, one for the file range
# [0, 1M[ and another for the file range [3M, 6M[. Leave a hole between them
# (the file range [1M, 3M[) to ensure that we get two file extent items that
# will never be merged into a single one. The extents are contiguous on disk,
# which will later result in the checksums for their data being merged into a
# single checksum item in the csums btree.
#
$XFS_IO_PROG -f \
	     -c "falloc 0 1M" \
	     -c "falloc 3M 3M" \
	     $SCRATCH_MNT/foobar

# Now write to the second extent and leave only 1M of it as unwritten, which
# corresponds to the file range [4M, 5M[.
#
# Then fsync the file to flush delalloc and to clear the full sync flag from
# the inode, so that a future fsync will use the fast code path.
#
# After the writeback triggered by the fsync we have 3 file extent items that
# point to the second extent we previously allocated with fallocate():
#
# 1) One file extent item of type BTRFS_FILE_EXTENT_REG that covers the file
#    range [3M, 4M[
#
# 2) One file extent item of type BTRFS_FILE_EXTENT_PREALLOC that covers the
#    file range [4M, 5M[
#
# 3) One file extent item of type BTRFS_FILE_EXTENT_REG that covers the file
#    range [5M, 6M[
#
# All these file extent items have a generation of 6, which is the ID of the
# transaction where they were created. The split of the original file extent
# item is done at btrfs_mark_extent_written() when ordered extents complete for
# the file ranges [3M, 4M[ and [5M, 6M[.
#
# The xfs_io output is passed through _filter_xfs_io before it reaches stdout.
#
$XFS_IO_PROG -c "pwrite -S 0xab 3M 1M" \
	     -c "pwrite -S 0xef 5M 1M" \
	     -c "fsync" \
	     $SCRATCH_MNT/foobar | _filter_xfs_io

# Commit the current transaction. This wipes out the log tree created by the
# previous fsync.
sync

# Now write to the unwritten range of the second extent we allocated,
# corresponding to the file range [4M, 5M[, and fsync the file, which triggers
# the fast fsync code path.
#
# The fast fsync code path sees that there is a new extent map covering the file
# range [4M, 5M[ and therefore it will log a checksum item covering the range
# [1M, 2M[ of the second extent we allocated.
#
# Also, after the fsync finishes we no longer have the 3 file extent items that
# pointed to 3 sections of the second extent we allocated. Instead we end up
# with a single file extent item pointing to the whole extent, with a type of
# BTRFS_FILE_EXTENT_REG and a generation of 7 (the current transaction ID). This
# is due to the file extent item merging we do when completing ordered extents
# into ranges that point to unwritten (preallocated) extents. This merging is
# done at btrfs_mark_extent_written().
#
# The xfs_io output is passed through _filter_xfs_io before it reaches stdout.
#
$XFS_IO_PROG -c "pwrite -S 0xcd 4M 1M" \
	     -c "fsync" \
	     $SCRATCH_MNT/foobar | _filter_xfs_io

# Now do a write to our file outside the range of the second extent that we
# allocated with fallocate() and truncate the file size from 6M down to 5M.
#
# The truncate operation sets the full sync runtime flag on the inode, forcing
# the next fsync to use the slow code path. It also changes the length of the
# second file extent item so that it represents the file range [3M, 5M[ and not
# the range [3M, 6M[ anymore.
#
# Finally fsync the file. Since this is a fsync that triggers the slow code path,
# it will remove all items associated to the inode from the log tree and then it
# will scan for file extent items in the fs/subvolume tree that have a generation
# matching the current transaction ID, which is 7. This means it will log 2 file
# extent items:
#
# 1) One for the first extent we allocated, covering the file range [0, 1M[
#
# 2) Another for the first 2M of the second extent we allocated, covering the
#    file range [3M, 5M[
#
# When logging the first file extent item we log a single checksum item that has
# all the checksums for the entire extent.
#
# When logging the second file extent item, we also look up the checksums that
# are associated with the range [0, 2M[ of the second extent we allocated (file
# range [3M, 5M[), and then we log them with btrfs_csum_file_blocks(). However
# that results in ending up with a log that has two checksum items with ranges
# that overlap:
#
# 1) One for the range [1M, 2M[ of the second extent we allocated, corresponding
#    to the file range [4M, 5M[, which we logged in the previous fsync that used
#    the fast code path;
#
# 2) One for the ranges [0, 1M[ and [0, 2M[ of the first and second extents,
#    respectively, corresponding to the file ranges [0, 1M[ and [3M, 5M[.
#    This one was added during this last fsync that uses the slow code path
#    and overlaps with the previous one logged by the previous fast fsync.
#
# This happens because when logging the checksums for the second extent, we
# notice they start at an offset that matches the end of the checksums item that
# we logged for the first extent, and because both extents are contiguous on
# disk, btrfs_csum_file_blocks() decides to extend that existing checksums item
# and append the checksums for the second extent to this item. The end result is
# we end up with two checksum items in the log tree that have overlapping ranges,
# as listed before, causing the fsync to fail with -EIO and the transaction to
# be aborted, turning the filesystem into RO mode.
#
# The xfs_io output is passed through _filter_xfs_io before it reaches stdout.
#
$XFS_IO_PROG -c "pwrite -S 0xff 0 1M" \
	     -c "truncate 5M" \
	     -c "fsync" \
	     $SCRATCH_MNT/foobar | _filter_xfs_io

# Dump the file content (decimal offsets, one-byte hexadecimal units) so that
# it can be compared with the dump taken after the simulated power failure.
echo "File content before power failure:"
od -A d -t x1 $SCRATCH_MNT/foobar

# Simulate a power failure and mount the filesystem again. The file content
# must be the same as what we had before.
_flakey_drop_and_remount

# NOTE(review): "before after failure" reads like a typo for "after power
# failure". This string is written to stdout, so presumably it has to match
# this test's expected-output file - confirm and update both together.
echo "File content before after failure:"
od -A d -t x1 $SCRATCH_MNT/foobar

_unmount_flakey

# NOTE(review): status is presumably read by exit handling installed outside
# this file (0 meaning success) - confirm against common/preamble.
status=0
exit