#! /bin/bash
# SPDX-License-Identifier: GPL-2.0
# Copyright (C) 2021 SUSE Linux Products GmbH. All Rights Reserved.
#
# FSQA Test No. 240
#
# Test a scenario where we do several partial writes into multiple preallocated
# extents across two transactions and with several fsyncs in between. The goal
# is to check that the fsyncs succeed. This scenario used to trigger an -EIO
# failure on the last fsync and turn the filesystem to RO mode because of a
# transaction abort.
#
. ./common/preamble
_begin_fstest auto quick prealloc log

# Override the default cleanup function.
_cleanup()
{
	_cleanup_flakey
	cd /
	rm -f $tmp.*
}

# Import common functions.
. ./common/filter
. ./common/dmflakey

# real QA test starts here
_supported_fs btrfs
_require_scratch
_require_dm_target flakey
_require_xfs_io_command "falloc"

_scratch_mkfs >>$seqres.full 2>&1
_require_metadata_journaling $SCRATCH_DEV
_init_flakey
_mount_flakey

# Create our test file with 2 preallocated extents. Leave a 1M hole between them
# to ensure that we get two file extent items that will never be merged into a
# single one. The extents are contiguous on disk, which will later result in the
# checksums for their data to be merged into a single checksum item in the csums
# btree.
#
$XFS_IO_PROG -f \
	     -c "falloc 0 1M" \
	     -c "falloc 3M 3M" \
	     $SCRATCH_MNT/foobar

# Now write to the second extent and leave only 1M of it as unwritten, which
# corresponds to the file range [4M, 5M[.
#
# Then fsync the file to flush delalloc and to clear full sync flag from the
# inode, so that a future fsync will use the fast code path.
#
# After the writeback triggered by the fsync we have 3 file extent items that
# point to the second extent we previously allocated with fallocate():
#
# 1) One file extent item of type BTRFS_FILE_EXTENT_REG that covers the file
#    range [3M, 4M[
#
# 2) One file extent item of type BTRFS_FILE_EXTENT_PREALLOC that covers the
#    file range [4M, 5M[
#
# 3) One file extent item of type BTRFS_FILE_EXTENT_REG that covers the file
#    range [5M, 6M[
#
# All these file extent items have a generation of 6, which is the ID of the
# transaction where they were created. The split of the original file extent
# item is done at btrfs_mark_extent_written() when ordered extents complete for
# the file ranges [3M, 4M[ and [5M, 6M[.
#
$XFS_IO_PROG -c "pwrite -S 0xab 3M 1M" \
	     -c "pwrite -S 0xef 5M 1M" \
	     -c "fsync" \
	     $SCRATCH_MNT/foobar | _filter_xfs_io

# Commit the current transaction. This wipes out the log tree created by the
# previous fsync.
sync

# Now write to the unwritten range of the second extent we allocated,
# corresponding to the file range [4M, 5M[, and fsync the file, which triggers
# the fast fsync code path.
#
# The fast fsync code path sees that there is a new extent map covering the file
# range [4M, 5M[ and therefore it will log a checksum item covering the range
# [1M, 2M[ of the second extent we allocated.
#
# Also, after the fsync finishes we no longer have the 3 file extent items that
# pointed to 3 sections of the second extent we allocated. Instead we end up
# with a single file extent item pointing to the whole extent, with a type of
# BTRFS_FILE_EXTENT_REG and a generation of 7 (the current transaction ID). This
# is due to the file extent item merging we do when completing ordered extents
# into ranges that point to unwritten (preallocated) extents. This merging is
# done at btrfs_mark_extent_written().
#
$XFS_IO_PROG -c "pwrite -S 0xcd 4M 1M" \
	     -c "fsync" \
	     $SCRATCH_MNT/foobar | _filter_xfs_io

# Now do some write to our file outside the range of the second extent that we
# allocated with fallocate() and truncate the file size from 6M down to 5M.
#
# The truncate operation sets the full sync runtime flag on the inode, forcing
# the next fsync to use the slow code path. It also changes the length of the
# second file extent item so that it represents the file range [3M, 5M[ and not
# the range [3M, 6M[ anymore.
#
# Finally fsync the file. Since this is a fsync that triggers the slow code path,
# it will remove all items associated to the inode from the log tree and then it
# will scan for file extent items in the fs/subvolume tree that have a generation
# matching the current transaction ID, which is 7. This means it will log 2 file
# extent items:
#
# 1) One for the first extent we allocated, covering the file range [0, 1M[
#
# 2) Another for the first 2M of the second extent we allocated, covering the
#    file range [3M, 5M[
#
# When logging the first file extent item we log a single checksum item that has
# all the checksums for the entire extent.
#
# When logging the second file extent item, we also lookup for the checksums that
# are associated with the range [0, 2M[ of the second extent we allocated (file
# range [3M, 5M[), and then we log them with btrfs_csum_file_blocks(). However
# that results in ending up with a log that has two checksum items with ranges
# that overlap:
#
# 1) One for the range [1M, 2M[ of the second extent we allocated, corresponding
#    to the file range [4M, 5M[, which we logged in the previous fsync that used
#    the fast code path;
#
# 2) One for the ranges [0, 1M[ and [0, 2M[ of the first and second extents,
#    respectively, corresponding to the files ranges [0, 1M[ and [3M, 5M[.
#    This one was added during this last fsync that uses the slow code path
#    and overlaps with the previous one logged by the previous fast fsync.
#
# This happens because when logging the checksums for the second extent, we
# notice they start at an offset that matches the end of the checksums item that
# we logged for the first extent, and because both extents are contiguous on
# disk, btrfs_csum_file_blocks() decides to extend that existing checksums item
# and append the checksums for the second extent to this item. The end result is
# we end up with two checksum items in the log tree that have overlapping ranges,
# as listed before, resulting in the fsync to fail with -EIO and aborting the
# transaction, turning the filesystem into RO mode.
#
$XFS_IO_PROG -c "pwrite -S 0xff 0 1M" \
	     -c "truncate 5M" \
	     -c "fsync" \
	     $SCRATCH_MNT/foobar | _filter_xfs_io

echo "File content before power failure:"
od -A d -t x1 $SCRATCH_MNT/foobar

# Simulate a power failure and mount again the filesystem. The file content
# must be the same that we had before.
_flakey_drop_and_remount

echo "File content before after failure:"
od -A d -t x1 $SCRATCH_MNT/foobar

_unmount_flakey

status=0
exit
