author     Darrick J. Wong <djwong@kernel.org>    2022-07-14 11:05:59 -0700
committer  Darrick J. Wong <djwong@kernel.org>    2022-10-14 14:16:36 -0700
commit     15fc33594bf2f82034cd8818aeeae93ba537bf64 (patch)
tree       50e600145680fe5c0e326fb6cd97a9785119a0ce
parent     b18e03b42b4f30eb8963f75be9bba3b74708c00d (diff)
xfs: log EFIs for all btree blocks being used to stage a btree
We need to log EFIs for every extent that we allocate for the purpose of staging a new btree so that if we fail then the blocks will be freed during log recovery. Add a function to relog the EFIs, so that repair can relog them all every time it creates a new btree block, which will help us to avoid pinning the log tail.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
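For context, a rebuild that stages a new btree would consume the relogging hook roughly as follows. This is a minimal sketch, not part of the patch: the xrep_mybt_claim_block() wrapper and its role are hypothetical, and only xrep_newbt_claim_block() and xrep_newbt_relog_autoreap() are taken from the code below.

/*
 * Hypothetical per-block claim helper for a btree rebuilder.  Every time a
 * staged block is handed to the new btree, relog the EFIs covering the
 * staging extents so that they do not pin the log tail.
 */
static int
xrep_mybt_claim_block(
	struct xfs_btree_cur	*cur,
	struct xrep_newbt	*xnr,
	union xfs_btree_ptr	*ptr)
{
	int			error;

	/* Hand one reserved block to the btree being staged. */
	error = xrep_newbt_claim_block(cur, xnr, ptr);
	if (error)
		return error;

	/* Push any EFIs from older checkpoints forward in the log. */
	return xrep_newbt_relog_autoreap(xnr);
}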
-rw-r--r--    fs/xfs/scrub/newbt.c     185
-rw-r--r--    fs/xfs/scrub/newbt.h       3
-rw-r--r--    fs/xfs/scrub/repair.c     10
-rw-r--r--    fs/xfs/scrub/repair.h      1
4 files changed, 195 insertions, 4 deletions
diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c
index 8adbee26f2f9..c7f23e501095 100644
--- a/fs/xfs/scrub/newbt.c
+++ b/fs/xfs/scrub/newbt.c
@@ -13,12 +13,14 @@
#include "xfs_btree_staging.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
+#include "xfs_log.h"
#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_alloc.h"
#include "xfs_rmap.h"
#include "xfs_ag.h"
#include "xfs_defer.h"
+#include "xfs_extfree_item.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -128,14 +130,150 @@ xrep_newbt_init_bare(
XFS_AG_RESV_NONE);
}
-/* Designate specific blocks to be used to build our new btree. */
+/*
+ * Set up automatic reaping of the blocks reserved for btree reconstruction in
+ * case we crash by logging a deferred free item for each extent we allocate so
+ * that we can get all of the space back if we crash before we can commit the
+ * new btree. This function returns a token that can be used to cancel
+ * automatic reaping if repair is successful.
+ */
+static int
+xrep_newbt_schedule_autoreap(
+ struct xrep_newbt *xnr,
+ struct xrep_newbt_resv *resv)
+{
+ struct xfs_extent_free_item efi_item = {
+ .xefi_startblock = resv->fsbno,
+ .xefi_blockcount = resv->len,
+ .xefi_owner = xnr->oinfo.oi_owner,
+ .xefi_flags = XFS_EFI_SKIP_DISCARD,
+ };
+ struct xfs_log_item *lip;
+ LIST_HEAD(items);
+
+ ASSERT(xnr->oinfo.oi_offset == 0);
+
+ if (xnr->oinfo.oi_flags & XFS_OWNER_INFO_ATTR_FORK)
+ efi_item.xefi_flags |= XFS_EFI_ATTR_FORK;
+ if (xnr->oinfo.oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
+ efi_item.xefi_flags |= XFS_EFI_BMBT_BLOCK;
+
+ INIT_LIST_HEAD(&efi_item.xefi_list);
+ list_add(&efi_item.xefi_list, &items);
+ xfs_fs_bump_intents(xnr->sc->mp, resv->fsbno);
+ lip = xfs_extent_free_defer_type.create_intent(xnr->sc->tp,
+ &items, 1, false);
+ if (!lip) {
+ ASSERT(0);
+ xfs_fs_drop_intents(xnr->sc->mp, resv->fsbno);
+ return -EFSCORRUPTED;
+ }
+ if (IS_ERR(lip)) {
+ xfs_fs_drop_intents(xnr->sc->mp, resv->fsbno);
+ return PTR_ERR(lip);
+ }
+
+ resv->efi = lip;
+ return 0;
+}
+
+/*
+ * Earlier, we logged EFIs for the extents that we allocated to hold the new
+ * btree so that we could automatically roll back those allocations if the
+ * system crashed. Now we log an EFD to cancel the EFI, either because the
+ * repair succeeded and the new blocks are in use; or because the repair was
+ * cancelled and we're about to free the extents directly.
+ */
+static inline void
+xrep_newbt_finish_autoreap(
+ struct xfs_scrub *sc,
+ struct xrep_newbt_resv *resv)
+{
+ struct xfs_efd_log_item *efdp;
+ struct xfs_extent *extp;
+ struct xfs_log_item *efd_lip;
+
+ efd_lip = xfs_extent_free_defer_type.create_done(sc->tp, resv->efi, 1);
+ efdp = container_of(efd_lip, struct xfs_efd_log_item, efd_item);
+ extp = efdp->efd_format.efd_extents;
+ extp->ext_start = resv->fsbno;
+ extp->ext_len = resv->len;
+ efdp->efd_next_extent++;
+ set_bit(XFS_LI_DIRTY, &efd_lip->li_flags);
+}
+
+/* Abort an EFI logged for a new btree block reservation. */
+static inline void
+xrep_newbt_cancel_autoreap(
+ struct xrep_newbt_resv *resv)
+{
+ xfs_extent_free_defer_type.abort_intent(resv->efi);
+}
+
+/*
+ * Relog the EFIs attached to a staging btree so that we don't pin the log
+ * tail. Same logic as xfs_defer_relog.
+ */
int
-xrep_newbt_add_blocks(
+xrep_newbt_relog_autoreap(
+ struct xrep_newbt *xnr)
+{
+ struct xrep_newbt_resv *resv;
+ unsigned int efi_bytes = 0;
+
+ list_for_each_entry(resv, &xnr->resv_list, list) {
+ /*
+ * If the log intent item for this deferred op is in a
+ * different checkpoint, relog it to keep the log tail moving
+ * forward. We're ok with this being racy because an incorrect
+ * decision means we'll be a little slower at pushing the tail.
+ */
+ if (!resv->efi || xfs_log_item_in_current_chkpt(resv->efi))
+ continue;
+
+ resv->efi = xfs_trans_item_relog(resv->efi, xnr->sc->tp);
+
+ /*
+ * If free space is very fragmented, it's possible that the new
+ * btree will be allocated a large number of small extents.
+ * On an active system, it's possible that so many of those
+ * EFIs will need relogging here that doing them all in one
+ * transaction will overflow the reservation.
+ *
+ * Each allocation for the new btree (xrep_newbt_resv) points
+ * to a unique single-mapping EFI, so each relog operation logs
+ * a single-mapping EFD followed by a new EFI. Each single
+ * mapping EF[ID] item consumes about 128 bytes, so we'll
+ * assume 256 bytes per relog. Roll if we consume more than
+ * half of the transaction reservation.
+ */
+ efi_bytes += 256;
+ if (efi_bytes > xnr->sc->tp->t_log_res / 2) {
+ int error;
+
+ error = xrep_roll_trans(xnr->sc);
+ if (error)
+ return error;
+
+ efi_bytes = 0;
+ }
+ }
+
+ if (xnr->sc->tp->t_flags & XFS_TRANS_DIRTY)
+ return xrep_roll_trans(xnr->sc);
+ return 0;
+}
+
+/* Designate specific blocks to be used to build our new btree. */
+static int
+__xrep_newbt_add_blocks(
struct xrep_newbt *xnr,
xfs_fsblock_t fsbno,
- xfs_extlen_t len)
+ xfs_extlen_t len,
+ bool auto_reap)
{
struct xrep_newbt_resv *resv;
+ int error;
resv = kmalloc(sizeof(struct xrep_newbt_resv), XCHK_GFP_FLAGS);
if (!resv)
@@ -145,10 +283,31 @@ xrep_newbt_add_blocks(
resv->fsbno = fsbno;
resv->len = len;
resv->used = 0;
+ if (auto_reap) {
+ error = xrep_newbt_schedule_autoreap(xnr, resv);
+ if (error) {
+ kfree(resv);
+ return error;
+ }
+ }
+
list_add_tail(&resv->list, &xnr->resv_list);
return 0;
}
+/*
+ * Allow certain callers to add disk space directly to the reservation.
+ * Callers are responsible for cleaning up the reservations.
+ */
+int
+xrep_newbt_add_blocks(
+ struct xrep_newbt *xnr,
+ xfs_fsblock_t fsbno,
+ xfs_extlen_t len)
+{
+ return __xrep_newbt_add_blocks(xnr, fsbno, len, false);
+}
+
/* Allocate disk space for our new btree. */
int
xrep_newbt_alloc_blocks(
@@ -190,7 +349,8 @@ xrep_newbt_alloc_blocks(
XFS_FSB_TO_AGBNO(sc->mp, args.fsbno),
args.len, xnr->oinfo.oi_owner);
- error = xrep_newbt_add_blocks(xnr, args.fsbno, args.len);
+ error = __xrep_newbt_add_blocks(xnr, args.fsbno, args.len,
+ true);
if (error)
return error;
@@ -219,6 +379,8 @@ xrep_newbt_free_resv(
* reservations.
*/
list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
+ xrep_newbt_cancel_autoreap(resv);
+ xfs_fs_drop_intents(sc->mp, resv->fsbno);
list_del(&resv->list);
kfree(resv);
}
@@ -242,6 +404,8 @@ xrep_newbt_cancel_resv(
{
struct xfs_scrub *sc = xnr->sc;
+ xrep_newbt_finish_autoreap(sc, resv);
+
trace_xrep_newbt_cancel_blocks(sc->mp,
XFS_FSB_TO_AGNO(sc->mp, resv->fsbno),
XFS_FSB_TO_AGBNO(sc->mp, resv->fsbno),
@@ -249,6 +413,9 @@ xrep_newbt_cancel_resv(
__xfs_free_extent_later(sc->tp, resv->fsbno, resv->len,
&xnr->oinfo, true);
+
+ /* Drop the intent drain after we commit the new item. */
+ xfs_fs_drop_intents(sc->mp, resv->fsbno);
}
/*
@@ -313,6 +480,9 @@ xrep_newbt_destroy_resv(
struct xrep_newbt_resv *resv)
{
struct xfs_scrub *sc = xnr->sc;
+ xfs_fsblock_t fsbno = resv->fsbno;
+
+ xrep_newbt_finish_autoreap(sc, resv);
/*
* Use the deferred freeing mechanism to schedule for deletion any
@@ -336,6 +506,13 @@ xrep_newbt_destroy_resv(
__xfs_free_extent_later(sc->tp, resv->fsbno, resv->len,
&xnr->oinfo, true);
}
+
+ /*
+ * Drop the intent drain after we commit the new item. Use the
+ * original fsbno from the reservation because destroying the
+ * reservation consumes resv->fsbno.
+ */
+ xfs_fs_drop_intents(sc->mp, fsbno);
}
/* Free all the accounting info and disk space we reserved for a new btree. */
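As a worked example of the relog budget in xrep_newbt_relog_autoreap() above: each relogged reservation is assumed to cost about 256 bytes (a single-mapping EFD plus its replacement EFI at roughly 128 bytes each), so a repair transaction with, say, a 512 KiB log reservation would cross the t_log_res / 2 threshold after about 256 KiB / 256 B = 1024 relogs and roll at that point. The 512 KiB figure is illustrative only, not a value taken from this patch.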
diff --git a/fs/xfs/scrub/newbt.h b/fs/xfs/scrub/newbt.h
index 0de4452aac0a..6e4f9987c2a1 100644
--- a/fs/xfs/scrub/newbt.h
+++ b/fs/xfs/scrub/newbt.h
@@ -10,6 +10,8 @@ struct xrep_newbt_resv {
/* Link to list of extents that we've reserved. */
struct list_head list;
+ struct xfs_log_item *efi;
+
/* FSB of the block we reserved. */
xfs_fsblock_t fsbno;
@@ -58,5 +60,6 @@ void xrep_newbt_cancel(struct xrep_newbt *xnr);
int xrep_newbt_destroy(struct xrep_newbt *xnr);
int xrep_newbt_claim_block(struct xfs_btree_cur *cur, struct xrep_newbt *xnr,
union xfs_btree_ptr *ptr);
+int xrep_newbt_relog_autoreap(struct xrep_newbt *xnr);
#endif /* __XFS_SCRUB_NEWBT_H__ */
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 19d36266eb52..650d6e6c1ab1 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -166,6 +166,16 @@ xrep_roll_ag_trans(
return 0;
}
+/* Roll the scrub transaction, holding the primary metadata locked. */
+int
+xrep_roll_trans(
+ struct xfs_scrub *sc)
+{
+ if (!sc->ip)
+ return xrep_roll_ag_trans(sc);
+ return xfs_trans_roll_inode(&sc->tp, sc->ip);
+}
+
/* Finish all deferred work attached to the repair transaction. */
int
xrep_defer_finish(
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index a0df121e6866..3179746a063e 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -20,6 +20,7 @@ static inline int xrep_notsupported(struct xfs_scrub *sc)
int xrep_attempt(struct xfs_scrub *sc);
void xrep_failure(struct xfs_mount *mp);
int xrep_roll_ag_trans(struct xfs_scrub *sc);
+int xrep_roll_trans(struct xfs_scrub *sc);
int xrep_defer_finish(struct xfs_scrub *sc);
bool xrep_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks,
enum xfs_ag_resv_type type);