xfs: remove old swap extents implementationatomic-file-updates_2020-06-01

Migrate the old XFS_IOC_SWAPEXT implementation to use our shiny new one. Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
author: Darrick J. Wong <darrick.wong@oracle.com> 2020-03-30 16:00:59 -0700
committer: Darrick J. Wong <darrick.wong@oracle.com> 2020-06-01 21:16:49 -0700
commit: b616b582b3ffc42130ce4b0de0d5fbf0aedfd855 (patch)
tree: 502f0544c01ba9303a05a616d1b2d4710a30a0d0
parent: 558a2e12d27a76a4b6d8501f3af9415dd6178581 (diff)
5 files changed, 314 insertions, 653 deletions
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index a0ce5e5dd5e4..024cb13fb9f0 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1235,570 +1235,3 @@ out_trans_cancel:
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	return error;
 }
-
-/*
- * We need to check that the format of the data fork in the temporary inode is
- * valid for the target inode before doing the swap. This is not a problem with
- * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
- * data fork depending on the space the attribute fork is taking so we can get
- * invalid formats on the target inode.
- *
- * E.g. target has space for 7 extents in extent format, temp inode only has
- * space for 6.  If we defragment down to 7 extents, then the tmp format is a
- * btree, but when swapped it needs to be in extent format. Hence we can't just
- * blindly swap data forks on attr2 filesystems.
- *
- * Note that we check the swap in both directions so that we don't end up with
- * a corrupt temporary inode, either.
- *
- * Note that fixing the way xfs_fsr sets up the attribute fork in the source
- * inode will prevent this situation from occurring, so all we do here is
- * reject and log the attempt. basically we are putting the responsibility on
- * userspace to get this right.
- */
-int
-xfs_swap_extents_check_format(
-	struct xfs_inode	*ip,	/* target inode */
-	struct xfs_inode	*tip)	/* tmp inode */
-{
-
-	/* Should never get a local format */
-	if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
-	    tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
-		return -EINVAL;
-
-	/*
-	 * if the target inode has less extents that then temporary inode then
-	 * why did userspace call us?
-	 */
-	if (ip->i_d.di_nextents < tip->i_d.di_nextents)
-		return -EINVAL;
-
-	/*
-	 * If we have to use the (expensive) rmap swap method, we can
-	 * handle any number of extents and any format.
-	 */
-	if (xfs_sb_version_hasrmapbt(&ip->i_mount->m_sb))
-		return 0;
-
-	/*
-	 * if the target inode is in extent form and the temp inode is in btree
-	 * form then we will end up with the target inode in the wrong format
-	 * as we already know there are less extents in the temp inode.
-	 */
-	if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
-	    tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
-		return -EINVAL;
-
-	/* Check temp in extent form to max in target */
-	if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
-	    XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) >
-			XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
-		return -EINVAL;
-
-	/* Check target in extent form to max in temp */
-	if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
-	    XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) >
-			XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
-		return -EINVAL;
-
-	/*
-	 * If we are in a btree format, check that the temp root block will fit
-	 * in the target and that it has enough extents to be in btree format
-	 * in the target.
-	 *
-	 * Note that we have to be careful to allow btree->extent conversions
-	 * (a common defrag case) which will occur when the temp inode is in
-	 * extent format...
-	 */
-	if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
-		if (XFS_IFORK_Q(ip) &&
-		    XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip))
-			return -EINVAL;
-		if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
-		    XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
-			return -EINVAL;
-	}
-
-	/* Reciprocal target->temp btree format checks */
-	if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
-		if (XFS_IFORK_Q(tip) &&
-		    XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
-			return -EINVAL;
-		if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
-		    XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
-			return -EINVAL;
-	}
-
-	return 0;
-}
-
-static int
-xfs_swap_extent_flush(
-	struct xfs_inode	*ip)
-{
-	int	error;
-
-	error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
-	if (error)
-		return error;
-	truncate_pagecache_range(VFS_I(ip), 0, -1);
-
-	/* Verify O_DIRECT for ftmp */
-	if (VFS_I(ip)->i_mapping->nrpages)
-		return -EINVAL;
-	return 0;
-}
-
-/*
- * Fix up the owners of the bmbt blocks to refer to the current inode. The
- * change owner scan attempts to order all modified buffers in the current
- * transaction. In the event of ordered buffer failure, the offending buffer is
- * physically logged as a fallback and the scan returns -EAGAIN. We must roll
- * the transaction in this case to replenish the fallback log reservation and
- * restart the scan. This process repeats until the scan completes.
- */
-static int
-xfs_swap_change_owner(
-	struct xfs_trans	**tpp,
-	struct xfs_inode	*ip,
-	struct xfs_inode	*tmpip)
-{
-	int			error;
-	struct xfs_trans	*tp = *tpp;
-
-	do {
-		error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, ip->i_ino,
-					      NULL);
-		/* success or fatal error */
-		if (error != -EAGAIN)
-			break;
-
-		error = xfs_trans_roll(tpp);
-		if (error)
-			break;
-		tp = *tpp;
-
-		/*
-		 * Redirty both inodes so they can relog and keep the log tail
-		 * moving forward.
-		 */
-		xfs_trans_ijoin(tp, ip, 0);
-		xfs_trans_ijoin(tp, tmpip, 0);
-		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-		xfs_trans_log_inode(tp, tmpip, XFS_ILOG_CORE);
-	} while (true);
-
-	return error;
-}
-
-/* Swap the extents of two files by swapping data forks. */
-int
-xfs_swap_extent_forks(
-	struct xfs_trans	**tpp,
-	struct xfs_swapext_req	*req)
-{
-	struct xfs_inode	*ip = req->ip1;
-	struct xfs_inode	*tip = req->ip2;
-	xfs_filblks_t		aforkblks = 0;
-	xfs_filblks_t		taforkblks = 0;
-	int64_t			temp_blks;
-	xfs_extnum_t		junk;
-	uint64_t		tmp;
-	unsigned int		reflink_state;
-	int			src_log_flags = XFS_ILOG_CORE;
-	int			target_log_flags = XFS_ILOG_CORE;
-	int			error;
-
-	reflink_state = xfs_swapext_reflink_prep(req);
-
-	/*
-	 * Count the number of extended attribute blocks
-	 */
-	if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
-	     (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
-		error = xfs_bmap_count_blocks(*tpp, ip, XFS_ATTR_FORK, &junk,
-				&aforkblks);
-		if (error)
-			return error;
-	}
-	if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
-	     (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
-		error = xfs_bmap_count_blocks(*tpp, tip, XFS_ATTR_FORK, &junk,
-				&taforkblks);
-		if (error)
-			return error;
-	}
-
-	/*
-	 * Btree format (v3) inodes have the inode number stamped in the bmbt
-	 * block headers. We can't start changing the bmbt blocks until the
-	 * inode owner change is logged so recovery does the right thing in the
-	 * event of a crash. Set the owner change log flags now and leave the
-	 * bmbt scan as the last step.
-	 */
-	if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) {
-		if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE)
-			target_log_flags |= XFS_ILOG_DOWNER;
-		if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
-			src_log_flags |= XFS_ILOG_DOWNER;
-	}
-
-	/*
-	 * Swap the data forks of the inodes
-	 */
-	swap(ip->i_df, tip->i_df);
-
-	/* Update quota accounting. */
-	temp_blks = tip->i_d.di_nblocks - taforkblks + aforkblks;
-	xfs_trans_mod_dquot_byino(*tpp, ip, XFS_TRANS_DQ_BCOUNT,
-			temp_blks - ip->i_d.di_nblocks);
-
-	temp_blks = ip->i_d.di_nblocks + taforkblks - aforkblks;
-	xfs_trans_mod_dquot_byino(*tpp, tip, XFS_TRANS_DQ_BCOUNT,
-			temp_blks - tip->i_d.di_nblocks);
-
-	/*
-	 * Fix the on-disk inode values
-	 */
-	tmp = (uint64_t)ip->i_d.di_nblocks;
-	ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks;
-	tip->i_d.di_nblocks = tmp + taforkblks - aforkblks;
-
-	swap(ip->i_d.di_nextents, tip->i_d.di_nextents);
-	swap(ip->i_d.di_format, tip->i_d.di_format);
-
-	/*
-	 * The extents in the source inode could still contain speculative
-	 * preallocation beyond EOF (e.g. the file is open but not modified
-	 * while defrag is in progress). In that case, we need to copy over the
-	 * number of delalloc blocks the data fork in the source inode is
-	 * tracking beyond EOF so that when the fork is truncated away when the
-	 * temporary inode is unlinked we don't underrun the i_delayed_blks
-	 * counter on that inode.
-	 */
-	ASSERT(tip->i_delayed_blks == 0);
-	tip->i_delayed_blks = ip->i_delayed_blks;
-	ip->i_delayed_blks = 0;
-
-	switch (ip->i_d.di_format) {
-	case XFS_DINODE_FMT_EXTENTS:
-		src_log_flags |= XFS_ILOG_DEXT;
-		break;
-	case XFS_DINODE_FMT_BTREE:
-		ASSERT(!xfs_sb_version_has_v3inode(&ip->i_mount->m_sb) ||
-		       (src_log_flags & XFS_ILOG_DOWNER));
-		src_log_flags |= XFS_ILOG_DBROOT;
-		break;
-	}
-
-	switch (tip->i_d.di_format) {
-	case XFS_DINODE_FMT_EXTENTS:
-		target_log_flags |= XFS_ILOG_DEXT;
-		break;
-	case XFS_DINODE_FMT_BTREE:
-		target_log_flags |= XFS_ILOG_DBROOT;
-		ASSERT(!xfs_sb_version_has_v3inode(&ip->i_mount->m_sb) ||
-		       (target_log_flags & XFS_ILOG_DOWNER));
-		break;
-	}
-
-	xfs_swapext_reflink_finish(*tpp, req, reflink_state);
-
-	xfs_trans_log_inode(*tpp, ip,  src_log_flags);
-	xfs_trans_log_inode(*tpp, tip, target_log_flags);
-
-	/*
-	 * The extent forks have been swapped, but crc=1,rmapbt=0 filesystems
-	 * have inode number owner values in the bmbt blocks that still refer to
-	 * the old inode. Scan each bmbt to fix up the owner values with the
-	 * inode number of the current inode.
-	 */
-	if (src_log_flags & XFS_ILOG_DOWNER) {
-		error = xfs_swap_change_owner(tpp, ip, tip);
-		if (error)
-			return error;
-	}
-	if (target_log_flags & XFS_ILOG_DOWNER) {
-		error = xfs_swap_change_owner(tpp, tip, ip);
-		if (error)
-			return error;
-	}
-
-	return 0;
-}
-
-/*
- * Obtain a quota reservation to make sure we don't hit EDQUOT.  We can skip
- * this if quota enforcement is disabled or if both inodes' dquots are the
- * same.
- */
-STATIC int
-xfs_swap_extents_prep_quota(
-	struct xfs_trans	*tp,
-	struct xfs_inode	*ip,
-	struct xfs_inode	*tip)
-{
-	struct xfs_mount	*mp = ip->i_mount;
-	xfs_filblks_t		aforkblks = 0;
-	xfs_filblks_t		taforkblks = 0;
-	xfs_filblks_t		ip_mapped, tip_mapped;
-	xfs_extnum_t		junk;
-	int			error;
-
-	/*
-	 * Don't bother with a quota reservation if we're not enforcing them
-	 * or the two inodes have the same dquots.
-	 */
-	if (!(mp->m_qflags & XFS_ALL_QUOTA_ENFD) ||
-	    (ip->i_udquot == tip->i_udquot &&
-	     ip->i_gdquot == tip->i_gdquot &&
-	     ip->i_pdquot == tip->i_pdquot))
-		return 0;
-
-	/*
-	 * Count the number of extended attribute blocks
-	 */
-	if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
-	     (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
-		error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &junk,
-				&aforkblks);
-		if (error)
-			return error;
-	}
-	if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
-	     (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
-		error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, &junk,
-				&taforkblks);
-		if (error)
-			return error;
-	}
-
-	/* Figure out how many blocks we'll move out of each file. */
-	ip_mapped = ip->i_d.di_nblocks - aforkblks;
-	tip_mapped = tip->i_d.di_nblocks - taforkblks;
-
-	/*
-	 * For each file, compute the net gain in the number of blocks that
-	 * will be mapped into that file and reserve that much quota.  The
-	 * quota counts must be able to absorb at least that much space.
-	 */
-	if (tip_mapped > ip_mapped) {
-		error = xfs_trans_reserve_quota_nblks(tp, ip,
-				tip_mapped - ip_mapped, 0,
-				XFS_QMOPT_RES_REGBLKS);
-		if (error)
-			return error;
-	}
-
-	if (ip_mapped > tip_mapped) {
-		error = xfs_trans_reserve_quota_nblks(tp, tip,
-				ip_mapped - tip_mapped, 0,
-				XFS_QMOPT_RES_REGBLKS);
-		if (error)
-			return error;
-	}
-
-	/*
-	 * For each file, forcibly reserve the gross gain in mapped blocks so
-	 * that we don't trip over any quota block reservation assertions.
-	 * We must reserve the gross gain because the quota code subtracts from
-	 * bcount the number of blocks that we unmap; it does not add that
-	 * quantity back to the quota block reservation.
-	 */
-	error = xfs_trans_reserve_quota_nblks(tp, ip, ip_mapped, 0,
-			XFS_QMOPT_FORCE_RES | XFS_QMOPT_RES_REGBLKS);
-	if (error)
-		return error;
-
-	return xfs_trans_reserve_quota_nblks(tp, tip, tip_mapped, 0,
-			XFS_QMOPT_FORCE_RES | XFS_QMOPT_RES_REGBLKS);
-}
-
-int
-xfs_swap_extents(
-	struct xfs_inode	*ip,	/* target inode */
-	struct xfs_inode	*tip,	/* tmp inode */
-	struct xfs_swapext	*sxp)
-{
-	struct xfs_swapext_req	req = {
-		.ip1		= ip,
-		.ip2		= tip,
-		.whichfork	= XFS_DATA_FORK,
-	};
-	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_trans	*tp;
-	struct xfs_bstat	*sbp = &sxp->sx_stat;
-	int			error = 0;
-	int			lock_flags;
-	int			resblks = 0;
-
-	/*
-	 * Lock the inodes against other IO, page faults and truncate to
-	 * begin with.  Then we can ensure the inodes are flushed and have no
-	 * page cache safely. Once we have done this we can take the ilocks and
-	 * do the rest of the checks.
-	 */
-	lock_two_nondirectories(VFS_I(ip), VFS_I(tip));
-	lock_flags = XFS_MMAPLOCK_EXCL;
-	xfs_lock_two_inodes(ip, XFS_MMAPLOCK_EXCL, tip, XFS_MMAPLOCK_EXCL);
-
-	/* Verify that both files have the same format */
-	if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) {
-		error = -EINVAL;
-		goto out_unlock;
-	}
-
-	/* Verify both files are either real-time or non-realtime */
-	if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
-		error = -EINVAL;
-		goto out_unlock;
-	}
-
-	error = xfs_qm_dqattach(ip);
-	if (error)
-		goto out_unlock;
-
-	error = xfs_qm_dqattach(tip);
-	if (error)
-		goto out_unlock;
-
-	error = xfs_swap_extent_flush(ip);
-	if (error)
-		goto out_unlock;
-	error = xfs_swap_extent_flush(tip);
-	if (error)
-		goto out_unlock;
-
-	if (xfs_inode_has_cow_data(tip)) {
-		error = xfs_reflink_cancel_cow_range(tip, 0, NULLFILEOFF, true);
-		if (error)
-			goto out_unlock;
-	}
-
-	/*
-	 * Extent "swapping" with rmap requires a permanent reservation and
-	 * a block reservation because it's really just a remap operation
-	 * performed with log redo items!
-	 */
-	if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
-		int		w	= XFS_DATA_FORK;
-		uint32_t	ipnext	= XFS_IFORK_NEXTENTS(ip, w);
-		uint32_t	tipnext	= XFS_IFORK_NEXTENTS(tip, w);
-
-		/*
-		 * Conceptually this shouldn't affect the shape of either bmbt,
-		 * but since we atomically move extents one by one, we reserve
-		 * enough space to rebuild both trees.
-		 */
-		resblks = XFS_SWAP_RMAP_SPACE_RES(mp, ipnext, w);
-		resblks +=  XFS_SWAP_RMAP_SPACE_RES(mp, tipnext, w);
-
-		/*
-		 * Handle the corner case where either inode might straddle the
-		 * btree format boundary. If so, the inode could bounce between
-		 * btree <-> extent format on unmap -> remap cycles, freeing and
-		 * allocating a bmapbt block each time.
-		 */
-		if (ipnext == (XFS_IFORK_MAXEXT(ip, w) + 1))
-			resblks += XFS_IFORK_MAXEXT(ip, w);
-		if (tipnext == (XFS_IFORK_MAXEXT(tip, w) + 1))
-			resblks += XFS_IFORK_MAXEXT(tip, w);
-	}
-	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
-	if (error)
-		goto out_unlock;
-
-	/*
-	 * Lock and join the inodes to the tansaction so that transaction commit
-	 * or cancel will unlock the inodes from this point onwards.
-	 */
-	xfs_lock_two_inodes(ip, XFS_ILOCK_EXCL, tip, XFS_ILOCK_EXCL);
-	lock_flags |= XFS_ILOCK_EXCL;
-	xfs_trans_ijoin(tp, ip, 0);
-	xfs_trans_ijoin(tp, tip, 0);
-
-
-	/* Verify all data are being swapped */
-	if (sxp->sx_offset != 0 ||
-	    sxp->sx_length != ip->i_d.di_size ||
-	    sxp->sx_length != tip->i_d.di_size) {
-		error = -EFAULT;
-		goto out_trans_cancel;
-	}
-
-	trace_xfs_swap_extent_before(ip, 0);
-	trace_xfs_swap_extent_before(tip, 1);
-
-	/* check inode formats now that data is flushed */
-	error = xfs_swap_extents_check_format(ip, tip);
-	if (error) {
-		xfs_notice(mp,
-		    "%s: inode 0x%llx format is incompatible for exchanging.",
-				__func__, ip->i_ino);
-		goto out_trans_cancel;
-	}
-
-	/*
-	 * Compare the current change & modify times with that
-	 * passed in.  If they differ, we abort this swap.
-	 * This is the mechanism used to ensure the calling
-	 * process that the file was not changed out from
-	 * under it.
-	 */
-	if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
-	    (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
-	    (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
-	    (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
-		error = -EBUSY;
-		goto out_trans_cancel;
-	}
-
-	/*
-	 * Reserve ourselves some quota if any of them are in enforcing mode.
-	 * In theory we only need enough to satisfy the change in the number
-	 * of blocks between the two ranges being remapped.
-	 */
-	error = xfs_swap_extents_prep_quota(tp, ip, tip);
-	if (error)
-		goto out_trans_cancel;
-
-	/*
-	 * Note the trickiness in setting the log flags - we set the owner log
-	 * flag on the opposite inode (i.e. the inode we are setting the new
-	 * owner to be) because once we swap the forks and log that, log
-	 * recovery is going to see the fork as owned by the swapped inode,
-	 * not the pre-swapped inodes.
-	 */
-	req.blockcount = XFS_B_TO_FSB(ip->i_mount, i_size_read(VFS_I(ip)));
-	if (xfs_sb_version_hasrmapbt(&mp->m_sb))
-		error = xfs_swapext_deferred_bmap(&tp, &req);
-	else
-		error = xfs_swap_extent_forks(&tp, &req);
-	if (error) {
-		trace_xfs_swap_extent_error(ip, error, _THIS_IP_);
-		goto out_trans_cancel;
-	}
-
-	/*
-	 * If this is a synchronous mount, make sure that the
-	 * transaction goes to disk before returning to the user.
-	 */
-	if (mp->m_flags & XFS_MOUNT_WSYNC)
-		xfs_trans_set_sync(tp);
-
-	error = xfs_trans_commit(tp);
-
-	trace_xfs_swap_extent_after(ip, 0);
-	trace_xfs_swap_extent_after(tip, 1);
-
-out_unlock:
-	xfs_iunlock(ip, lock_flags);
-	xfs_iunlock(tip, lock_flags);
-	unlock_two_nondirectories(VFS_I(ip), VFS_I(tip));
-	return error;
-
-out_trans_cancel:
-	xfs_trans_cancel(tp);
-	goto out_unlock;
-}
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index de3173e64f47..cebdd492fa85 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -66,13 +66,6 @@ int	xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset,
 bool	xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
 int	xfs_free_eofblocks(struct xfs_inode *ip);
 
-int	xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip,
-			 struct xfs_swapext *sx);
-
-struct xfs_swapext_req;
-int xfs_swap_extent_forks(struct xfs_trans **tpp, struct xfs_swapext_req *req);
-int xfs_swap_extents_check_format(struct xfs_inode *ip, struct xfs_inode *tip);
-
 xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb);
 
 xfs_extnum_t xfs_bmap_count_leaves(struct xfs_ifork *ifp, xfs_filblks_t *count);
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 274423ba3bb5..f93de4f7a944 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1864,81 +1864,47 @@ xfs_ioc_scrub_metadata(
 
 int
 xfs_ioc_swapext(
-	xfs_swapext_t	*sxp)
+	struct xfs_swapext	__user *arg)
 {
-	xfs_inode_t     *ip, *tip;
-	struct fd	f, tmp;
-	int		error = 0;
+	struct xfs_swapext	sx;
+	struct file_swap_range	fsr = { 0 };
+	struct fd		fd2, fd1;
+	int			error = 0;
 
-	/* Pull information for the target fd */
-	f = fdget((int)sxp->sx_fdtarget);
-	if (!f.file) {
-		error = -EINVAL;
-		goto out;
-	}
+	if (copy_from_user(&sx, arg, sizeof(struct xfs_swapext)))
+		return -EFAULT;
 
-	if (!(f.file->f_mode & FMODE_WRITE) ||
-	    !(f.file->f_mode & FMODE_READ) ||
-	    (f.file->f_flags & O_APPEND)) {
-		error = -EBADF;
-		goto out_put_file;
-	}
+	fd2 = fdget((int)sx.sx_fdtarget);
+	if (!fd2.file)
+		return -EINVAL;
 
-	tmp = fdget((int)sxp->sx_fdtmp);
-	if (!tmp.file) {
+	fd1 = fdget((int)sx.sx_fdtmp);
+	if (!fd1.file) {
 		error = -EINVAL;
-		goto out_put_file;
+		goto dest_fdput;
 	}
 
-	if (!(tmp.file->f_mode & FMODE_WRITE) ||
-	    !(tmp.file->f_mode & FMODE_READ) ||
-	    (tmp.file->f_flags & O_APPEND)) {
-		error = -EBADF;
-		goto out_put_tmp_file;
-	}
+	fsr.file1_fd = sx.sx_fdtmp;
+	fsr.length = sx.sx_length;
+	fsr.flags = FILE_SWAP_RANGE_NONATOMIC | FILE_SWAP_RANGE_FILE2_FRESH |
+		    FILE_SWAP_RANGE_FULL_FILES;
+	fsr.file2_ino = sx.sx_stat.bs_ino;
+	fsr.file2_mtime = sx.sx_stat.bs_mtime.tv_sec;
+	fsr.file2_ctime = sx.sx_stat.bs_ctime.tv_sec;
+	fsr.file2_mtime_nsec = sx.sx_stat.bs_mtime.tv_nsec;
+	fsr.file2_ctime_nsec = sx.sx_stat.bs_ctime.tv_nsec;
 
-	if (IS_SWAPFILE(file_inode(f.file)) ||
-	    IS_SWAPFILE(file_inode(tmp.file))) {
-		error = -EINVAL;
-		goto out_put_tmp_file;
-	}
+	error = vfs_swap_file_range(fd1.file, fd2.file, &fsr);
 
 	/*
-	 * We need to ensure that the fds passed in point to XFS inodes
-	 * before we cast and access them as XFS structures as we have no
-	 * control over what the user passes us here.
+	 * The old implementation returned EFAULT if the swap range was not
+	 * the entirety of both files.
 	 */
-	if (f.file->f_op != &xfs_file_operations ||
-	    tmp.file->f_op != &xfs_file_operations) {
-		error = -EINVAL;
-		goto out_put_tmp_file;
-	}
-
-	ip = XFS_I(file_inode(f.file));
-	tip = XFS_I(file_inode(tmp.file));
-
-	if (ip->i_mount != tip->i_mount) {
-		error = -EINVAL;
-		goto out_put_tmp_file;
-	}
-
-	if (ip->i_ino == tip->i_ino) {
-		error = -EINVAL;
-		goto out_put_tmp_file;
-	}
-
-	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-		error = -EIO;
-		goto out_put_tmp_file;
-	}
-
-	error = xfs_swap_extents(ip, tip, sxp);
-
- out_put_tmp_file:
-	fdput(tmp);
- out_put_file:
-	fdput(f);
- out:
+	if (error == -EDOM)
+		error = -EFAULT;
+	fdput(fd1);
+dest_fdput:
+	fdput(fd2);
 	return error;
 }
 
@@ -2183,18 +2149,8 @@ xfs_file_ioctl(
 	case XFS_IOC_ATTRMULTI_BY_HANDLE:
 		return xfs_attrmulti_by_handle(filp, arg);
 
-	case XFS_IOC_SWAPEXT: {
-		struct xfs_swapext	sxp;
-
-		if (copy_from_user(&sxp, arg, sizeof(xfs_swapext_t)))
-			return -EFAULT;
-		error = mnt_want_write_file(filp);
-		if (error)
-			return error;
-		error = xfs_ioc_swapext(&sxp);
-		mnt_drop_write_file(filp);
-		return error;
-	}
+	case XFS_IOC_SWAPEXT:
+		return xfs_ioc_swapext(arg);
 
 	case XFS_IOC_FSCOUNTS: {
 		xfs_fsop_counts_t out;
diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h
index bab6a5a92407..b1219d8783d9 100644
--- a/fs/xfs/xfs_ioctl.h
+++ b/fs/xfs/xfs_ioctl.h
@@ -16,9 +16,7 @@ xfs_ioc_space(
 	struct file		*filp,
 	xfs_flock64_t		*bf);
 
-int
-xfs_ioc_swapext(
-	xfs_swapext_t	*sxp);
+int xfs_ioc_swapext(struct xfs_swapext __user *arg);
 
 extern int
 xfs_find_handle(
diff --git a/fs/xfs/xfs_swaprange.c b/fs/xfs/xfs_swaprange.c
index 901440b812ec..17547b6c3902 100644
--- a/fs/xfs/xfs_swaprange.c
+++ b/fs/xfs/xfs_swaprange.c
@@ -2,6 +2,11 @@
 /*
  * Copyright (C) 2020 Oracle.  All Rights Reserved.
  * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * The xfs_swap_extent_* functions are:
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * Copyright (c) 2012 Red Hat, Inc.
+ * All Rights Reserved.
  */
 #include "xfs.h"
 #include "xfs_fs.h"
@@ -15,6 +20,7 @@
 #include "xfs_trans.h"
 #include "xfs_quota.h"
 #include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
 #include "xfs_reflink.h"
 #include "xfs_trace.h"
 #include "xfs_swapext.h"
@@ -44,6 +50,281 @@ xfs_swap_range_estimate(
 	return error;
 }
 
+/*
+ * We need to check that the format of the data fork in the temporary inode is
+ * valid for the target inode before doing the swap. This is not a problem with
+ * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
+ * data fork depending on the space the attribute fork is taking so we can get
+ * invalid formats on the target inode.
+ *
+ * E.g. target has space for 7 extents in extent format, temp inode only has
+ * space for 6.  If we defragment down to 7 extents, then the tmp format is a
+ * btree, but when swapped it needs to be in extent format. Hence we can't just
+ * blindly swap data forks on attr2 filesystems.
+ *
+ * Note that we check the swap in both directions so that we don't end up with
+ * a corrupt temporary inode, either.
+ *
+ * Note that fixing the way xfs_fsr sets up the attribute fork in the source
+ * inode will prevent this situation from occurring, so all we do here is
+ * reject and log the attempt. basically we are putting the responsibility on
+ * userspace to get this right.
+ */
+static int
+xfs_swap_extents_check_format(
+	struct xfs_inode	*ip,	/* target inode */
+	struct xfs_inode	*tip)	/* tmp inode */
+{
+
+	/* Should never get a local format */
+	if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
+	    tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+		return -EINVAL;
+
+	/*
+	 * if the target inode has less extents that then temporary inode then
+	 * why did userspace call us?
+	 */
+	if (ip->i_d.di_nextents < tip->i_d.di_nextents)
+		return -EINVAL;
+
+	/*
+	 * If we have to use the (expensive) rmap swap method, we can
+	 * handle any number of extents and any format.
+	 */
+	if (xfs_sb_version_hasrmapbt(&ip->i_mount->m_sb))
+		return 0;
+
+	/*
+	 * if the target inode is in extent form and the temp inode is in btree
+	 * form then we will end up with the target inode in the wrong format
+	 * as we already know there are less extents in the temp inode.
+	 */
+	if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+	    tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
+		return -EINVAL;
+
+	/* Check temp in extent form to max in target */
+	if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+	    XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) >
+			XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
+		return -EINVAL;
+
+	/* Check target in extent form to max in temp */
+	if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+	    XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) >
+			XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
+		return -EINVAL;
+
+	/*
+	 * If we are in a btree format, check that the temp root block will fit
+	 * in the target and that it has enough extents to be in btree format
+	 * in the target.
+	 *
+	 * Note that we have to be careful to allow btree->extent conversions
+	 * (a common defrag case) which will occur when the temp inode is in
+	 * extent format...
+	 */
+	if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+		if (XFS_IFORK_Q(ip) &&
+		    XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip))
+			return -EINVAL;
+		if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
+		    XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
+			return -EINVAL;
+	}
+
+	/* Reciprocal target->temp btree format checks */
+	if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+		if (XFS_IFORK_Q(tip) &&
+		    XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
+			return -EINVAL;
+		if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
+		    XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Fix up the owners of the bmbt blocks to refer to the current inode. The
+ * change owner scan attempts to order all modified buffers in the current
+ * transaction. In the event of ordered buffer failure, the offending buffer is
+ * physically logged as a fallback and the scan returns -EAGAIN. We must roll
+ * the transaction in this case to replenish the fallback log reservation and
+ * restart the scan. This process repeats until the scan completes.
+ */
+static int
+xfs_swap_change_owner(
+	struct xfs_trans	**tpp,
+	struct xfs_inode	*ip,
+	struct xfs_inode	*tmpip)
+{
+	int			error;
+	struct xfs_trans	*tp = *tpp;
+
+	do {
+		error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, ip->i_ino,
+					      NULL);
+		/* success or fatal error */
+		if (error != -EAGAIN)
+			break;
+
+		error = xfs_trans_roll(tpp);
+		if (error)
+			break;
+		tp = *tpp;
+
+		/*
+		 * Redirty both inodes so they can relog and keep the log tail
+		 * moving forward.
+		 */
+		xfs_trans_ijoin(tp, ip, 0);
+		xfs_trans_ijoin(tp, tmpip, 0);
+		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+		xfs_trans_log_inode(tp, tmpip, XFS_ILOG_CORE);
+	} while (true);
+
+	return error;
+}
+
+/* Swap the extents of two files by swapping data forks. */
+static int
+xfs_swap_extent_forks(
+	struct xfs_trans	**tpp,
+	struct xfs_swapext_req	*req)
+{
+	struct xfs_inode	*ip = req->ip1;
+	struct xfs_inode	*tip = req->ip2;
+	xfs_filblks_t		aforkblks = 0;
+	xfs_filblks_t		taforkblks = 0;
+	int64_t			temp_blks;
+	xfs_extnum_t		junk;
+	uint64_t		tmp;
+	unsigned int		reflink_state;
+	int			src_log_flags = XFS_ILOG_CORE;
+	int			target_log_flags = XFS_ILOG_CORE;
+	int			error;
+
+	reflink_state = xfs_swapext_reflink_prep(req);
+
+	/*
+	 * Count the number of extended attribute blocks
+	 */
+	if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
+	     (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
+		error = xfs_bmap_count_blocks(*tpp, ip, XFS_ATTR_FORK, &junk,
+				&aforkblks);
+		if (error)
+			return error;
+	}
+	if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
+	     (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
+		error = xfs_bmap_count_blocks(*tpp, tip, XFS_ATTR_FORK, &junk,
+				&taforkblks);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * Btree format (v3) inodes have the inode number stamped in the bmbt
+	 * block headers. We can't start changing the bmbt blocks until the
+	 * inode owner change is logged so recovery does the right thing in the
+	 * event of a crash. Set the owner change log flags now and leave the
+	 * bmbt scan as the last step.
+	 */
+	if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) {
+		if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE)
+			target_log_flags |= XFS_ILOG_DOWNER;
+		if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
+			src_log_flags |= XFS_ILOG_DOWNER;
+	}
+
+	/*
+	 * Swap the data forks of the inodes
+	 */
+	swap(ip->i_df, tip->i_df);
+
+	/* Update quota accounting. */
+	temp_blks = tip->i_d.di_nblocks - taforkblks + aforkblks;
+	xfs_trans_mod_dquot_byino(*tpp, ip, XFS_TRANS_DQ_BCOUNT,
+			temp_blks - ip->i_d.di_nblocks);
+
+	temp_blks = ip->i_d.di_nblocks + taforkblks - aforkblks;
+	xfs_trans_mod_dquot_byino(*tpp, tip, XFS_TRANS_DQ_BCOUNT,
+			temp_blks - tip->i_d.di_nblocks);
+
+	/*
+	 * Fix the on-disk inode values
+	 */
+	tmp = (uint64_t)ip->i_d.di_nblocks;
+	ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks;
+	tip->i_d.di_nblocks = tmp + taforkblks - aforkblks;
+
+	swap(ip->i_d.di_nextents, tip->i_d.di_nextents);
+	swap(ip->i_d.di_format, tip->i_d.di_format);
+
+	/*
+	 * The extents in the source inode could still contain speculative
+	 * preallocation beyond EOF (e.g. the file is open but not modified
+	 * while defrag is in progress). In that case, we need to copy over the
+	 * number of delalloc blocks the data fork in the source inode is
+	 * tracking beyond EOF so that when the fork is truncated away when the
+	 * temporary inode is unlinked we don't underrun the i_delayed_blks
+	 * counter on that inode.
+	 */
+	ASSERT(tip->i_delayed_blks == 0);
+	tip->i_delayed_blks = ip->i_delayed_blks;
+	ip->i_delayed_blks = 0;
+
+	switch (ip->i_d.di_format) {
+	case XFS_DINODE_FMT_EXTENTS:
+		src_log_flags |= XFS_ILOG_DEXT;
+		break;
+	case XFS_DINODE_FMT_BTREE:
+		ASSERT(!xfs_sb_version_has_v3inode(&ip->i_mount->m_sb) ||
+		       (src_log_flags & XFS_ILOG_DOWNER));
+		src_log_flags |= XFS_ILOG_DBROOT;
+		break;
+	}
+
+	switch (tip->i_d.di_format) {
+	case XFS_DINODE_FMT_EXTENTS:
+		target_log_flags |= XFS_ILOG_DEXT;
+		break;
+	case XFS_DINODE_FMT_BTREE:
+		target_log_flags |= XFS_ILOG_DBROOT;
+		ASSERT(!xfs_sb_version_has_v3inode(&ip->i_mount->m_sb) ||
+		       (target_log_flags & XFS_ILOG_DOWNER));
+		break;
+	}
+
+	xfs_swapext_reflink_finish(*tpp, req, reflink_state);
+
+	xfs_trans_log_inode(*tpp, ip,  src_log_flags);
+	xfs_trans_log_inode(*tpp, tip, target_log_flags);
+
+	/*
+	 * The extent forks have been swapped, but crc=1,rmapbt=0 filesystems
+	 * have inode number owner values in the bmbt blocks that still refer to
+	 * the old inode. Scan each bmbt to fix up the owner values with the
+	 * inode number of the current inode.
+	 */
+	if (src_log_flags & XFS_ILOG_DOWNER) {
+		error = xfs_swap_change_owner(tpp, ip, tip);
+		if (error)
+			return error;
+	}
+	if (target_log_flags & XFS_ILOG_DOWNER) {
+		error = xfs_swap_change_owner(tpp, tip, ip);
+		if (error)
+			return error;
+	}
+
+	return 0;
+}
+
 /* Prepare two files to have their data swapped. */
 int
 xfs_swap_range_prep(
author	Darrick J. Wong <darrick.wong@oracle.com>	2020-03-30 16:00:59 -0700
committer	Darrick J. Wong <darrick.wong@oracle.com>	2020-06-01 21:16:49 -0700
commit	b616b582b3ffc42130ce4b0de0d5fbf0aedfd855 (patch)
tree	502f0544c01ba9303a05a616d1b2d4710a30a0d0
parent	558a2e12d27a76a4b6d8501f3af9415dd6178581 (diff)