summaryrefslogtreecommitdiff
path: root/tests/btrfs/236
blob: ed3bddb880d3ab0f6eb797c23f420985b3e29084 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
#! /bin/bash
# SPDX-License-Identifier: GPL-2.0
# Copyright (C) 2021 SUSE Linux Products GmbH. All Rights Reserved.
#
# FSQA Test No. 236
#
# Test for fsync data loss after renaming a file or adding a hard link, with a
# previous fsync of another file, and also check that mtime and ctime are
# correct. Test both with COW and NOCOW writes.
#
# Load the common test preamble, then start the test with the "auto", "quick"
# and "log" arguments (presumably the test groups this test belongs to).
. ./common/preamble
_begin_fstest auto quick log

# Override the default cleanup function.
#
# Calls _cleanup_flakey, changes to the root directory and then removes the
# temporary files of this test (every path matching "$tmp.*").
#
# Globals: tmp (read) - path prefix of the test's temporary files.
_cleanup()
{
	_cleanup_flakey
	cd /
	# Quote $tmp so that a prefix containing whitespace or glob characters is
	# not word-split or expanded on its own. Skip the removal when $tmp is
	# empty: after the "cd /" above the pattern would degrade to ".*" and
	# match the dot files of the root directory.
	if [ -n "$tmp" ]; then
		rm -f -- "$tmp".*
	fi
}

# Helper libraries sourced by this test.
. ./common/filter
. ./common/dmflakey

# Requirements checked up front: btrfs only, a scratch device, and the
# device mapper "flakey" target (used below to simulate a power failure).
_supported_fs btrfs
_require_scratch
_require_dm_target flakey

# Run one fsync-after-rename/link scenario and check that the data, mtime and
# ctime of the file are the same before and after a simulated power failure.
#
# Arguments:
#   $1 - suffix appended to the names of the files created under $SCRATCH_MNT,
#        so that every invocation works on its own set of files
#   $2 - operation done between the two writes, either 'rename' or 'link'
#
# Globals read: SCRATCH_MNT, XFS_IO_PROG, seqres
#
# Output: one diagnostic line on stdout for each mismatch found after the
# remount (data digest, mtime, ctime). Calls _fail if $2 is invalid.
#
# The comments inside the function mentioning specific inode numbers and IDs
# (transactions, log commits, etc) are for the case where the function is run
# on a freshly created filesystem, but the logic and reasoning still applies
# for future invocations.
test_fsync()
{
	local suffix=$1
	local op_type=$2
	local foo="$SCRATCH_MNT/foo_$suffix"
	local bar="$SCRATCH_MNT/bar_$suffix"
	local baz="$SCRATCH_MNT/baz_$suffix"
	local digest_before
	local digest_after
	local mtime_before
	local ctime_before
	local mtime_after
	local ctime_after

	if [ "$op_type" != "rename" ] && [ "$op_type" != "link" ]; then
		_fail "Operation type, 2nd argument, must be 'rename' or 'link'"
	fi

	# Create an extra empty file to be used later for syncing the log and
	# bumping the committed log transaction ID by +1.
	touch "$bar"

	# This is the file we are interested in testing for data loss after an
	# fsync. Create it and fill it with initial data, then fsync it.
	#
	# Before the fsync the inode has the full sync flag set, the current
	# log transaction has an ID of 0 (first log transaction), the inode's
	# ->logged_trans is 0, ->last_sub_trans is 0 and ->last_log_commit is -1.
	#
	# After the fsync the full sync flag is not set anymore, ->logged_trans
	# is 6 (generation of the current transaction), ->last_log_commit is 0
	# (which is the value of ->last_sub_trans) and ->last_sub_trans remains
	# as 0.
	#
	# Also, after the fsync, the log transaction ID is bumped from 0 to 1.
	#
	# $XFS_IO_PROG is intentionally left unquoted, it may carry extra
	# arguments besides the program name.
	$XFS_IO_PROG -f -c "pwrite -S 0xab 0 1M" -c "fsync" "$baz" >>"$seqres.full"
	# File bar is inode 257.
	# File baz is inode 258.

	# During the rename or link operation below the following happens:
	#
	# We update inode 258 and that causes its ->last_sub_trans to be set to
	# 1 (the current log transaction ID), and its ->last_log_commit remains
	# with a value of 0 (meaning it was committed in the previous log
	# transaction).
	#
	# After updating inode 258, because we have previously fsynced it, we
	# log again the inode because it has a new name/dentry now. This results
	# in updating its ->last_log_commit from 0 to 1 (the current value of
	# its ->last_sub_trans).
	#
	if [ "$op_type" = "rename" ]; then
		mv "$baz" "$foo"
	else
		ln "$baz" "$foo"
	fi
	# File bar is inode 257.
	# File foo is inode 258.

	# Before the next write, sleep for 1 second so that we can test if mtime
	# and ctime are preserved after the power failure.
	sleep 1

	# This buffered write leaves ->last_sub_trans of inode 258 as 1, the ID
	# of the current log transaction (inode->root->log_transid).
	$XFS_IO_PROG -c "pwrite -S 0xcd 0 1M" "$foo" >>"$seqres.full"

	# The fsync against inode 257, file bar, results in committing the log
	# transaction with ID 1, updating inode->root->last_log_commit to 1, and
	# bumping root->log_transid from 1 to 2.
	$XFS_IO_PROG -c "fsync" "$bar"

	# Now on fsync of inode 258, file foo, delalloc is flushed and, because
	# the inode does not have the full sync flag set anymore, it only waits
	# for the writeback to finish, it does not wait for the ordered extent
	# to complete.
	#
	# If the ordered extent does not complete before btrfs_sync_file() calls
	# btrfs_inode_in_log(), then we would not update the inode in the log and
	# sync the log, resulting in a guaranteed data loss after a power failure
	# for COW writes and with slim chances for data loss as well for NOCOW
	# writes, because the fsync would return success to user space without
	# issuing barriers (REQ_PREFLUSH not sent to the block layer).
	# That happened because btrfs_inode_in_log() would return true before the
	# ordered extent completes, as in that case the inode has ->last_sub_trans
	# set to 1, ->last_log_commit as 1 and root->last_log_commit is 1 as well.
	#
	# Also, for both COW and NOCOW writes, when the race happens we ended up
	# not logging the inode, resulting in an outdated mtime and ctime after
	# replaying the log.
	#
	$XFS_IO_PROG -c "fsync" "$foo"

	# Record the state of the file as it is right after the fsync.
	digest_before=$(_md5_checksum "$foo")
	mtime_before=$(stat -c %Y "$foo")
	ctime_before=$(stat -c %Z "$foo")

	# Simulate a power failure and then mount again the filesystem to replay
	# the log tree.
	# After the power failure we expect $foo, inode 258, to have 1M of data
	# full of bytes with a value of 0xcd, and not 0xab.
	_flakey_drop_and_remount

	digest_after=$(_md5_checksum "$foo")
	mtime_after=$(stat -c %Y "$foo")
	ctime_after=$(stat -c %Z "$foo")

	# The operands are quoted on purpose: if the file is gone after the log
	# replay the "after" values are empty, and an unquoted empty operand
	# makes '[' fail with a syntax error instead of reporting the mismatch.
	if [ "$digest_after" != "$digest_before" ]; then
		echo -n "Incorrect data after log replay, "
		echo "expected digest: $digest_before got: $digest_after"
	fi

	if [ "$mtime_after" != "$mtime_before" ]; then
		echo "mtime not preserved, expected: $mtime_before got: $mtime_after"
	fi

	if [ "$ctime_after" != "$ctime_before" ]; then
		echo "ctime not preserved, expected: $ctime_before got: $ctime_after"
	fi
}

# First pass: data COW. It is the default, but the mount option is spelled out
# so that the intent is obvious.
_scratch_mkfs >> $seqres.full 2>&1
_require_metadata_journaling $SCRATCH_DEV
_init_flakey
MOUNT_OPTIONS='-o datacow'
_mount_flakey

# This test was motivated by a race condition, so each scenario is repeated
# a few times.
echo "Testing fsync after rename with COW writes"
i=1
while (( i <= 3 )); do
	test_fsync "rename_cow_${i}" rename
	i=$((i + 1))
done
echo "Testing fsync after link with COW writes"
i=1
while (( i <= 3 )); do
	test_fsync "link_cow_${i}" link
	i=$((i + 1))
done

_unmount_flakey

# Second pass: the same scenarios with nodatacow.
if _scratch_btrfs_is_zoned; then
	# Zoned btrfs does not support NOCOW, so only print the headers that the
	# real run would have printed (fake result).
	echo "Testing fsync after rename with NOCOW writes"
	echo "Testing fsync after link with NOCOW writes"
else
	MOUNT_OPTIONS='-o nodatacow'
	_mount_flakey

	echo "Testing fsync after rename with NOCOW writes"
	i=1
	while (( i <= 3 )); do
		test_fsync "rename_nocow_${i}" rename
		i=$((i + 1))
	done
	echo "Testing fsync after link with NOCOW writes"
	i=1
	while (( i <= 3 )); do
		test_fsync "link_nocow_${i}" link
		i=$((i + 1))
	done

	_unmount_flakey
fi

status=0
exit