xfs: log EFIs for all btree blocks being used to stage a btree (repair-prep-for-bulk-loading_2020-10-26)

We need to log EFIs for every extent that we allocate for the purpose of staging a new btree so that if we fail then the blocks will be freed during log recovery. Add a function to relog the EFIs, so that repair can relog them all every time it creates a new btree block, which will help us to avoid pinning the log tail.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
author: Darrick J. Wong <darrick.wong@oracle.com> 2020-10-25 17:14:33 -0700
committer: Darrick J. Wong <darrick.wong@oracle.com> 2020-10-26 18:32:13 -0700
commit: e8c543a453e230452e1b7878a8e8f79fce749f13 (patch)
tree: f7c28a11ae6b3058fb94c8ee9c50bc40d37069e8
parent: 28ba8668d5591ca9ed0e511c4fd8f2e234b27f4b (diff)
3 files changed, 95 insertions, 6 deletions
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 8d6ea2641c05..413d2defbd8c 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -13,6 +13,7 @@
 #include "xfs_btree_staging.h"
 #include "xfs_log_format.h"
 #include "xfs_trans.h"
+#include "xfs_log.h"
 #include "xfs_sb.h"
 #include "xfs_inode.h"
 #include "xfs_alloc.h"
@@ -26,6 +27,8 @@
 #include "xfs_ag_resv.h"
 #include "xfs_quota.h"
 #include "xfs_bmap.h"
+#include "xfs_defer.h"
+#include "xfs_extfree_item.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
@@ -422,12 +425,39 @@ xrep_newbt_init_bare(
 			XFS_AG_RESV_NONE);
 }
 
+/*
+ * Set up automatic reaping of the blocks reserved for btree reconstruction by
+ * logging an extent free intent (EFI) for the extent described by @resv, so
+ * that log recovery can return the space if we crash before committing the
+ * new btree.  The logged item is saved in resv->efi so that the automatic
+ * reaping can be cancelled later by logging an EFD against it.
+ */
+static void
+xrep_newbt_schedule_reap(
+	struct xrep_newbt		*xnr,
+	struct xrep_newbt_resv		*resv)
+{
+	struct xfs_extent_free_item	efi_item = {
+		.xefi_startblock	= resv->fsbno,
+		.xefi_blockcount	= resv->len,
+		.xefi_oinfo		= xnr->oinfo, /* struct copy */
+		.xefi_skip_discard	= true,
+	};
+	LIST_HEAD(items);
+
+	INIT_LIST_HEAD(&efi_item.xefi_list);
+	list_add(&efi_item.xefi_list, &items);
+	resv->efi = xfs_extent_free_defer_type.create_intent(xnr->sc->tp,
+			&items, 1, false);
+}
+
 /* Designate specific blocks to be used to build our new btree. */
-int
-xrep_newbt_add_blocks(
+static int
+__xrep_newbt_add_blocks(
 	struct xrep_newbt		*xnr,
 	xfs_fsblock_t			fsbno,
-	xfs_extlen_t			len)
+	xfs_extlen_t			len,
+	bool				auto_reap)
 {
 	struct xrep_newbt_resv		*resv;
 
@@ -439,10 +469,25 @@ xrep_newbt_add_blocks(
 	resv->fsbno = fsbno;
 	resv->len = len;
 	resv->used = 0;
+	if (auto_reap)
+		xrep_newbt_schedule_reap(xnr, resv);
 	list_add_tail(&resv->list, &xnr->resv_list);
 	return 0;
 }
 
+/*
+ * Allow certain callers to add disk space directly to the reservation.  No
+ * EFI is logged for these blocks, so callers must clean them up themselves.
+ */
+int
+xrep_newbt_add_blocks(
+	struct xrep_newbt		*xnr,
+	xfs_fsblock_t			fsbno,
+	xfs_extlen_t			len)
+{
+	return __xrep_newbt_add_blocks(xnr, fsbno, len, false);
+}
+
 /* Allocate disk space for our new btree. */
 int
 xrep_newbt_alloc_blocks(
@@ -484,7 +529,8 @@ xrep_newbt_alloc_blocks(
 				XFS_FSB_TO_AGBNO(sc->mp, args.fsbno),
 				args.len, xnr->oinfo.oi_owner);
 
-		error = xrep_newbt_add_blocks(xnr, args.fsbno, args.len);
+		error = __xrep_newbt_add_blocks(xnr, args.fsbno, args.len,
+				true);
 		if (error)
 			return error;
 
@@ -500,6 +546,35 @@ xrep_newbt_alloc_blocks(
 }
 
 /*
+ * Relog the EFIs attached to a staging btree so that we don't pin the log
+ * tail.  Same logic as xfs_defer_relog.
+ */
+int
+xrep_newbt_relog_efis(
+	struct xrep_newbt	*xnr)
+{
+	struct xrep_newbt_resv	*resv;
+	struct xfs_trans	*tp = xnr->sc->tp;
+
+	list_for_each_entry(resv, &xnr->resv_list, list) {
+		/*
+		 * If the log intent item for this deferred op is in a
+		 * different checkpoint, relog it to keep the log tail moving
+		 * forward.  We're ok with this being racy because an incorrect
+		 * decision means we'll be a little slower at pushing the tail.
+		 */
+		if (!resv->efi || xfs_log_item_in_current_chkpt(resv->efi))
+			continue;
+
+		resv->efi = xfs_trans_item_relog(resv->efi, tp);
+	}
+
+	if (tp->t_flags & XFS_TRANS_DIRTY)
+		return xrep_roll_trans(xnr->sc);
+	return 0;
+}
+
+/*
  * Release blocks that were reserved for a btree repair.  If the repair
  * succeeded then we log deferred frees for unused blocks.  Otherwise, we try
  * to free the extents immediately to roll the filesystem back to where it was
@@ -512,6 +587,18 @@ xrep_newbt_destroy_reservation(
 	bool			cancel_repair)
 {
 	struct xfs_scrub	*sc = xnr->sc;
+	struct xfs_log_item	*lip;
+
+	/*
+	 * Earlier, we logged EFIs for the extents that we allocated to hold
+	 * the new btree so that we could automatically roll back those
+	 * allocations if the system crashed.  Now we log an EFD to cancel the
+	 * EFI, either because the repair succeeded and the new blocks are in
+	 * use; or because the repair was cancelled and we're about to free
+	 * the extents directly.
+	 */
+	lip = xfs_extent_free_defer_type.create_done(sc->tp, resv->efi, 0);
+	set_bit(XFS_LI_DIRTY, &lip->li_flags);
 
 	if (cancel_repair) {
 		int		error;
@@ -580,6 +667,7 @@ junkit:
 	 * reservations.
 	 */
 	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
+		xfs_extent_free_defer_type.abort_intent(resv->efi);
 		list_del(&resv->list);
 		kmem_free(resv);
 	}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 8ed73a2934cb..9563aef63956 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -70,6 +70,8 @@ struct xrep_newbt_resv {
 	/* Link to list of extents that we've reserved. */
 	struct list_head	list;
 
+	struct xfs_log_item	*efi;
+
 	/* FSB of the block we reserved. */
 	xfs_fsblock_t		fsbno;
 
@@ -116,6 +118,7 @@ int xrep_newbt_claim_block(struct xfs_btree_cur *cur, struct xrep_newbt *xnr,
 		union xfs_btree_ptr *ptr);
 void xrep_bload_estimate_slack(struct xfs_scrub *sc,
 		struct xfs_btree_bload *bload);
+int xrep_newbt_relog_efis(struct xrep_newbt *xnr);
 
 #else
 
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 6c11bfc3d452..429eca33399c 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -326,8 +326,6 @@ xfs_trans_get_efd(
 {
 	struct xfs_efd_log_item		*efdp;
 
-	ASSERT(nextents > 0);
-
 	if (nextents > XFS_EFD_MAX_FAST_EXTENTS) {
 		efdp = kmem_zalloc(sizeof(struct xfs_efd_log_item) +
 				(nextents - 1) * sizeof(struct xfs_extent),
author	Darrick J. Wong <darrick.wong@oracle.com>	2020-10-25 17:14:33 -0700
committer	Darrick J. Wong <darrick.wong@oracle.com>	2020-10-26 18:32:13 -0700
commit	e8c543a453e230452e1b7878a8e8f79fce749f13 (patch)
tree	f7c28a11ae6b3058fb94c8ee9c50bc40d37069e8
parent	28ba8668d5591ca9ed0e511c4fd8f2e234b27f4b (diff)