author     Darrick J. Wong <djwong@kernel.org>   2021-09-01 11:25:37 -0700
committer  Darrick J. Wong <djwong@kernel.org>   2021-09-17 18:55:29 -0700
commit     47a4c4899791dd29837d1149dffb70ad18f986fa (patch)
tree       7c249c521749662ac2ce4da1bcdf408017decd6e
parent     803efaa936528ec6a35e4e521cddeb440236c9d0 (diff)

xfs: allow queued AG intents to drain before scrubbing (scrub-drain-intents_2021-09-17)
Currently, online scrub isn't sufficiently careful about quiescing allocation groups before checking them. While scrub does take the AG header locks, it doesn't serialize against chains of AG update intents that are being processed concurrently. If there's a collision, cross-referencing between data structures (e.g. rmapbt and refcountbt) can yield false corruption events; if repair is running, this results in incorrect repairs.

Fix this by adding a count of active intents to the perag structure and making scrub wait until there aren't any before continuing. This is a little stupid, since transactions can queue intents without taking buffer locks, but we'll also wait for those transactions.

XXX: should we instead have a per-AG rwsem that gets taken as soon as the AG[IF] are locked and stays held until the transaction commits or moves on to the next AG? Would we rather have a SIX lock, so that intents can take an IX lock and not have to upgrade to X until we actually want to make changes to that AG? Is that how those even work??

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
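At its core, the patch pairs an atomic counter with a waitqueue. The sketch below is a minimal illustration of that drain pattern, not the patch itself; the struct and helper names are made up for illustration, though the kernel primitives (atomic_t, wait_event_killable, and friends) are the real ones the patch uses.

/*
 * Minimal sketch of the intent-drain pattern: producers bump the counter
 * when they queue an intent item and drop it when the item is finished
 * or cancelled; scrub sleeps until the counter reaches zero.
 */
struct intent_drain {
	atomic_t		count;	/* live intent items */
	wait_queue_head_t	wq;	/* scrub waits here */
};

static inline void drain_init(struct intent_drain *d)
{
	atomic_set(&d->count, 0);
	init_waitqueue_head(&d->wq);
}

/* Called from ->add_item when an intent item is queued. */
static inline void drain_bump(struct intent_drain *d)
{
	atomic_inc(&d->count);
}

/* Called from ->finish_item and ->cancel_item. */
static inline void drain_drop(struct intent_drain *d)
{
	if (atomic_dec_and_test(&d->count))
		wake_up(&d->wq);
}

/* Sleep until all intents drain; returns -ERESTARTSYS if interrupted. */
static inline int drain_wait(struct intent_drain *d)
{
	return wait_event_killable(d->wq, atomic_read(&d->count) == 0);
}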
-rw-r--r--  fs/xfs/libxfs/xfs_ag.c        |   2
-rw-r--r--  fs/xfs/libxfs/xfs_ag.h        |   9
-rw-r--r--  fs/xfs/libxfs/xfs_defer.c     |   9
-rw-r--r--  fs/xfs/libxfs/xfs_defer.h     |   3
-rw-r--r--  fs/xfs/scrub/bmap_repair.c    |   4
-rw-r--r--  fs/xfs/scrub/common.c         | 143
-rw-r--r--  fs/xfs/scrub/common.h         |   2
-rw-r--r--  fs/xfs/scrub/inode_repair.c   |   4
-rw-r--r--  fs/xfs/scrub/repair.c         |   3
-rw-r--r--  fs/xfs/scrub/trace.h          |  31
-rw-r--r--  fs/xfs/xfs_bmap_item.c        |  49
-rw-r--r--  fs/xfs/xfs_extfree_item.c     |  29
-rw-r--r--  fs/xfs/xfs_mount.c            |  96
-rw-r--r--  fs/xfs/xfs_mount.h            |  33
-rw-r--r--  fs/xfs/xfs_refcount_item.c    |  26
-rw-r--r--  fs/xfs/xfs_rmap_item.c        |  25
-rw-r--r--  fs/xfs/xfs_super.c            |   7
-rw-r--r--  fs/xfs/xfs_swapext_item.c     |  10
-rw-r--r--  fs/xfs/xfs_trace.h            | 108

19 files changed, 574 insertions(+), 19 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index dc9d78fb7bac..5fab36a40cea 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -193,6 +193,7 @@ xfs_free_perag(
spin_unlock(&mp->m_perag_lock);
ASSERT(pag);
ASSERT(atomic_read(&pag->pag_ref) == 0);
+ ASSERT(atomic_read(&pag->pag_intents) == 0);
cancel_delayed_work_sync(&pag->pag_blockgc_work);
xfs_iunlink_destroy(pag);
@@ -254,6 +255,7 @@ xfs_initialize_perag(
spin_lock_init(&pag->pag_state_lock);
INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker);
INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
+ init_waitqueue_head(&pag->pag_intents_wq);
init_waitqueue_head(&pag->pagb_wait);
pag->pagb_count = 0;
pag->pagb_tree = RB_ROOT;
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index c9e198e62b74..a58b0db9ef58 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -103,6 +103,15 @@ struct xfs_perag {
* or have some other means to control concurrency.
*/
struct rhashtable pagi_unlinked_hash;
+
+ /*
+ * Counter of live intents. We track the number of log intent items
+ * that have been queued (but not yet processed) so that scrub can
+ * detect the presence of other threads that are in the middle of
+ * processing a chain of deferred items.
+ */
+ atomic_t pag_intents;
+ wait_queue_head_t pag_intents_wq;
};
int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t agcount,
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index 3045ad184972..108d950b6f41 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -363,7 +363,8 @@ xfs_defer_cancel_list(
list_for_each_safe(pwi, n, &dfp->dfp_work) {
list_del(pwi);
dfp->dfp_count--;
- ops->cancel_item(pwi);
+ trace_xfs_defer_cancel_item(mp, dfp, pwi);
+ ops->cancel_item(mp, pwi);
}
ASSERT(dfp->dfp_count == 0);
kmem_free(dfp);
@@ -442,6 +443,7 @@ xfs_defer_finish_one(
list_for_each_safe(li, n, &dfp->dfp_work) {
list_del(li);
dfp->dfp_count--;
+ trace_xfs_defer_finish_item(tp->t_mountp, dfp, li);
error = ops->finish_item(tp, dfp->dfp_done, li, &state);
if (error == -EAGAIN) {
/*
@@ -585,7 +587,7 @@ xfs_defer_add(
struct list_head *li)
{
struct xfs_defer_pending *dfp = NULL;
- const struct xfs_defer_op_type *ops;
+ const struct xfs_defer_op_type *ops = defer_op_types[type];
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
BUILD_BUG_ON(ARRAY_SIZE(defer_op_types) != XFS_DEFER_OPS_TYPE_MAX);
@@ -598,7 +600,6 @@ xfs_defer_add(
if (!list_empty(&tp->t_dfops)) {
dfp = list_last_entry(&tp->t_dfops,
struct xfs_defer_pending, dfp_list);
- ops = defer_op_types[dfp->dfp_type];
if (dfp->dfp_type != type ||
(ops->max_items && dfp->dfp_count >= ops->max_items))
dfp = NULL;
@@ -616,6 +617,8 @@ xfs_defer_add(
}
list_add_tail(li, &dfp->dfp_work);
+ trace_xfs_defer_add_item(tp->t_mountp, dfp, li);
+ ops->add_item(tp->t_mountp, li);
dfp->dfp_count++;
}
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index 6b25faf1bd2a..3a5411af937f 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -55,7 +55,8 @@ struct xfs_defer_op_type {
struct list_head *item, struct xfs_btree_cur **state);
void (*finish_cleanup)(struct xfs_trans *tp,
struct xfs_btree_cur *state, int error);
- void (*cancel_item)(struct list_head *item);
+ void (*cancel_item)(struct xfs_mount *mp, struct list_head *item);
+ void (*add_item)(struct xfs_mount *mp, const struct list_head *item);
unsigned int max_items;
};
diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c
index 5b75804ad0e6..542799830530 100644
--- a/fs/xfs/scrub/bmap_repair.c
+++ b/fs/xfs/scrub/bmap_repair.c
@@ -333,7 +333,9 @@ xrep_bmap_scan_rt(
if (xrep_is_rtmeta_ino(sc, sc->ip->i_ino))
return 0;
- xchk_rt_lock(sc, &sc->sr);
+ error = xchk_rt_lock(sc, &sc->sr);
+ if (error)
+ return error;
xrep_rt_btcur_init(sc, &sc->sr);
error = xfs_rmap_query_all(sc->sr.rmap_cur, xrep_bmap_walk_rtrmap, rb);
xchk_rt_btcur_free(&sc->sr);
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 443fe6b88dd2..5b33166e633f 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -486,8 +486,8 @@ want_ag_read_header_failure(
* all the buffers we grab to the scrub transaction so they'll all be freed
* when we cancel it. Returns ENOENT if we can't grab the perag structure.
*/
-int
-xchk_ag_read_headers(
+static inline int
+__xchk_ag_read_headers(
struct xfs_scrub *sc,
xfs_agnumber_t agno,
struct xchk_ag *sa)
@@ -495,11 +495,6 @@ xchk_ag_read_headers(
struct xfs_mount *mp = sc->mp;
int error;
- ASSERT(!sa->pag);
- sa->pag = xfs_perag_get(mp, agno);
- if (!sa->pag)
- return -ENOENT;
-
error = xfs_ialloc_read_agi(mp, sc->tp, agno, &sa->agi_bp);
if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI))
return error;
@@ -515,6 +510,87 @@ xchk_ag_read_headers(
return 0;
}
+static inline bool
+xchk_ag_intents_pending(
+ struct xfs_perag *pag)
+{
+ int intents = atomic_read(&pag->pag_intents);
+
+ trace_xchk_ag_read_headers(pag->pag_mount, pag->pag_agno, intents,
+ _RET_IP_);
+
+ return intents > 0;
+}
+
+/*
+ * Grab all the headers for an AG, and wait until there aren't any pending
+ * intents.
+ */
+int
+xchk_ag_read_headers(
+ struct xfs_scrub *sc,
+ xfs_agnumber_t agno,
+ struct xchk_ag *sa)
+{
+ struct xfs_mount *mp = sc->mp;
+ int error;
+
+ ASSERT(!sa->pag);
+ sa->pag = xfs_perag_get(mp, agno);
+ if (!sa->pag)
+ return -ENOENT;
+
+ do {
+ error = __xchk_ag_read_headers(sc, agno, sa);
+ if (error)
+ break;
+
+ /*
+ * Decide if this AG is quiet enough for all metadata to be
+ * consistent with each other. XFS allows the AG header buffer
+ * locks to cycle across transaction rolls while processing
+ * chains of deferred ops, which means that there could be
+ * other threads in the middle of processing a chain of
+ * deferred ops. For regular operations we are careful about
+ * ordering operations to prevent collisions between threads
+ * (which is why we don't need a per-AG lock), but scrub and
+ * repair have to serialize against chained operations.
+ *
+ * We just locked all the AG header buffers; now take a look
+ * to see if there are any intents in progress. If there are,
+ * drop the AG headers and wait for the intents to drain.
+ * Since we hold all the AG header locks for the duration of
+ * the scrub, this is the only time we have to sample the
+ * intents counter; any threads increasing it after this point
+ * can't possibly be in the middle of a chain of AG metadata
+ * updates.
+ */
+ if (!xchk_ag_intents_pending(sa->pag)) {
+ error = 0;
+ break;
+ }
+
+ if (sa->agfl_bp) {
+ xfs_trans_brelse(sc->tp, sa->agfl_bp);
+ sa->agfl_bp = NULL;
+ }
+
+ if (sa->agf_bp) {
+ xfs_trans_brelse(sc->tp, sa->agf_bp);
+ sa->agf_bp = NULL;
+ }
+
+ if (sa->agi_bp) {
+ xfs_trans_brelse(sc->tp, sa->agi_bp);
+ sa->agi_bp = NULL;
+ }
+
+ error = xfs_perag_wait_intents(sa->pag);
+ } while (!error);
+
+ return error;
+}
+
/* Release all the AG btree cursors. */
void
xchk_ag_btcur_free(
@@ -641,14 +717,59 @@ xchk_ag_init(
return 0;
}
-/* Lock everything we need to work on realtime metadata. */
-void
+#if IS_ENABLED(CONFIG_XFS_RT)
+static inline bool
+xchk_rt_intents_pending(
+ struct xfs_mount *mp)
+{
+ int intents = atomic_read(&mp->m_rt_intents);
+
+ trace_xchk_rt_lock(mp, -1U, intents, _RET_IP_);
+
+ return intents > 0;
+}
+#else
+# define xchk_rt_intents_pending(mp) (false)
+#endif
+
+/* Lock everything we need to work on realtime metadata and wait for intents. */
+int
xchk_rt_lock(
struct xfs_scrub *sc,
struct xchk_rt *sr)
{
- xfs_rtlock(NULL, sc->mp, XFS_RTLOCK_ALL);
- sr->locked = true;
+ int error;
+
+ do {
+ xfs_rtlock(NULL, sc->mp, XFS_RTLOCK_ALL);
+
+ /*
+ * Decide if the RT volume is quiet enough for all metadata to
+ * be consistent with each other. Regular file IO doesn't get
+ * to lock all the rt inodes at the same time, which means that
+ * there could be other threads in the middle of processing a
+ * chain of deferred ops.
+ *
+ * We just locked all the rt inodes; now take a look to see if
+ * there are any rt intents in progress. If there are, drop
+ * the rt inode locks and wait for the intents to drain. Since
+ * we hold the rt inode locks for the duration of the scrub,
+ * this is the only time we have to sample the intents counter;
+ * any threads increasing it after this point can't possibly be
+ * in the middle of a chain of rt metadata updates.
+ */
+ if (!xchk_rt_intents_pending(sc->mp)) {
+ sr->locked = true;
+ error = 0;
+ break;
+ }
+
+ xfs_rtunlock(sc->mp, XFS_RTLOCK_ALL);
+
+ error = xfs_rt_wait_intents(sc->mp);
+ } while (!error);
+
+ return error;
}
/*
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index dbd4145690b2..70a42897cd22 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -153,7 +153,7 @@ xchk_ag_init_existing(
void xchk_rt_init(struct xfs_scrub *sc, struct xchk_rt *sr);
void xchk_rt_btcur_free(struct xchk_rt *sr);
-void xchk_rt_lock(struct xfs_scrub *sc, struct xchk_rt *sr);
+int xchk_rt_lock(struct xfs_scrub *sc, struct xchk_rt *sr);
void xchk_rt_unlock(struct xfs_scrub *sc, struct xchk_rt *sr);
int xchk_ag_read_headers(struct xfs_scrub *sc, xfs_agnumber_t agno,
struct xchk_ag *sa);
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
index 144481f94ce8..c99fe223e3d6 100644
--- a/fs/xfs/scrub/inode_repair.c
+++ b/fs/xfs/scrub/inode_repair.c
@@ -597,7 +597,9 @@ xrep_dinode_count_rt_rmaps(
xrep_is_rtmeta_ino(sc, sc->sm->sm_ino))
return 0;
- xchk_rt_lock(sc, &sc->sr);
+ error = xchk_rt_lock(sc, &sc->sr);
+ if (error)
+ return error;
xrep_rt_btcur_init(sc, &sc->sr);
error = xfs_rmap_query_all(sc->sr.rmap_cur, xrep_dinode_walk_rtrmap,
dis);
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index b50304ee1305..4c4680a8b41b 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -482,6 +482,7 @@ xrep_newbt_schedule_reap(
INIT_LIST_HEAD(&efi_item.xefi_list);
list_add(&efi_item.xefi_list, &items);
+ xfs_fs_bump_intents(xnr->sc->mp, false, resv->fsbno);
resv->efi = xfs_extent_free_defer_type.create_intent(xnr->sc->tp,
&items, 1, false);
}
@@ -712,6 +713,7 @@ xrep_newbt_destroy(
goto junkit;
list_del(&resv->list);
+ xfs_fs_drop_intents(sc->mp, false, resv->fsbno);
kmem_free(resv);
}
@@ -724,6 +726,7 @@ junkit:
list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
xfs_extent_free_defer_type.abort_intent(resv->efi);
list_del(&resv->list);
+ xfs_fs_drop_intents(sc->mp, false, resv->fsbno);
kmem_free(resv);
}
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 732a7681d691..174381167893 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -710,6 +710,37 @@ TRACE_EVENT(xchk_iallocbt_check_cluster,
__entry->cluster_ino)
)
+DECLARE_EVENT_CLASS(xchk_ag_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount,
+ unsigned long caller_ip),
+ TP_ARGS(mp, agno, refcount, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(int, refcount)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->refcount = refcount;
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d agno %u refcount %d caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->refcount,
+ (char *)__entry->caller_ip)
+);
+
+#define DEFINE_XCHK_AG_EVENT(name) \
+DEFINE_EVENT(xchk_ag_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \
+ unsigned long caller_ip), \
+ TP_ARGS(mp, agno, refcount, caller_ip))
+DEFINE_XCHK_AG_EVENT(xchk_ag_read_headers);
+DEFINE_XCHK_AG_EVENT(xchk_rt_lock);
+
TRACE_EVENT(xchk_fscounters_calc,
TP_PROTO(struct xfs_mount *mp, uint64_t icount, uint64_t ifree,
uint64_t fdblocks, uint64_t delalloc),
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 2b2d09a95ed9..4eead6f75509 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -351,14 +351,30 @@ xfs_bmap_update_finish_item(
struct xfs_btree_cur **state)
{
struct xfs_bmap_intent *bi;
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_fsblock_t orig_startblock;
int error;
bi = container_of(item, struct xfs_bmap_intent, bi_list);
+ orig_startblock = bi->bi_bmap.br_startblock;
error = xfs_trans_log_finish_bmap_update(tp, BUD_ITEM(done), bi);
if (!error && bi->bi_bmap.br_blockcount > 0) {
ASSERT(bi->bi_type == XFS_BMAP_UNMAP);
return -EAGAIN;
}
+
+ /*
+ * Drop our intent counter reference now that we've either queued a
+ * deferred rmap intent or failed. Be careful to use the original
+ * startblock since the finishing functions can update the intent
+ * state.
+ */
+ if (xfs_has_rmapbt(mp)) {
+ bool rt = xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork);
+
+ xfs_fs_drop_intents(mp, rt, orig_startblock);
+ }
+
kmem_free(bi);
return error;
}
@@ -371,17 +387,47 @@ xfs_bmap_update_abort_intent(
xfs_bui_release(BUI_ITEM(intent));
}
-/* Cancel a deferred rmap update. */
+/* Cancel a deferred bmap update. */
STATIC void
xfs_bmap_update_cancel_item(
+ struct xfs_mount *mp,
struct list_head *item)
{
struct xfs_bmap_intent *bi;
bi = container_of(item, struct xfs_bmap_intent, bi_list);
+
+ /* Drop our intent counter reference since we're going away. */
+ if (xfs_has_rmapbt(mp)) {
+ bool rt = xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork);
+
+ xfs_fs_drop_intents(mp, rt, bi->bi_bmap.br_startblock);
+ }
+
kmem_free(bi);
}
+/* Add a deferred bmap update. */
+STATIC void
+xfs_bmap_update_add_item(
+ struct xfs_mount *mp,
+ const struct list_head *item)
+{
+ const struct xfs_bmap_intent *bi;
+
+ bi = container_of(item, struct xfs_bmap_intent, bi_list);
+
+ /*
+ * Grab an intent counter reference on behalf of the deferred rmap
+ * intent item that we will queue when we finish this bmap work.
+ */
+ if (xfs_has_rmapbt(mp)) {
+ bool rt = xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork);
+
+ xfs_fs_bump_intents(mp, rt, bi->bi_bmap.br_startblock);
+ }
+}
+
const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
.max_items = XFS_BUI_MAX_FAST_EXTENTS,
.create_intent = xfs_bmap_update_create_intent,
@@ -389,6 +435,7 @@ const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
.create_done = xfs_bmap_update_create_done,
.finish_item = xfs_bmap_update_finish_item,
.cancel_item = xfs_bmap_update_cancel_item,
+ .add_item = xfs_bmap_update_add_item,
};
/* Is this recovered BUI ok? */
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 81e7a7e1533d..9f7f25a17e47 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -487,6 +487,7 @@ xfs_extent_free_finish_item(
struct list_head *item,
struct xfs_btree_cur **state)
{
+ struct xfs_mount *mp = tp->t_mountp;
struct xfs_extent_free_item *xefi;
int error;
@@ -502,6 +503,14 @@ xfs_extent_free_finish_item(
}
error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi);
+
+ /*
+ * Drop our intent counter reference now that we've finished all the
+ * work or failed. The finishing function doesn't update the intent
+ * state, so we need not preserve the original startblock.
+ */
+ xfs_fs_drop_intents(mp, xefi->xefi_realtime, xefi->xefi_startblock);
+
kmem_free(xefi);
return error;
}
@@ -517,14 +526,30 @@ xfs_extent_free_abort_intent(
/* Cancel a free extent. */
STATIC void
xfs_extent_free_cancel_item(
+ struct xfs_mount *mp,
struct list_head *item)
{
struct xfs_extent_free_item *xefi;
xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
+ xfs_fs_drop_intents(mp, xefi->xefi_realtime, xefi->xefi_startblock);
kmem_free(xefi);
}
+/* Add a deferred free extent. */
+STATIC void
+xfs_extent_free_add_item(
+ struct xfs_mount *mp,
+ const struct list_head *item)
+{
+ const struct xfs_extent_free_item *xefi;
+
+ xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
+
+ /* Grab an intent counter reference for this intent item. */
+ xfs_fs_bump_intents(mp, xefi->xefi_realtime, xefi->xefi_startblock);
+}
+
const struct xfs_defer_op_type xfs_extent_free_defer_type = {
.max_items = XFS_EFI_MAX_FAST_EXTENTS,
.create_intent = xfs_extent_free_create_intent,
@@ -532,6 +557,7 @@ const struct xfs_defer_op_type xfs_extent_free_defer_type = {
.create_done = xfs_extent_free_create_done,
.finish_item = xfs_extent_free_finish_item,
.cancel_item = xfs_extent_free_cancel_item,
+ .add_item = xfs_extent_free_add_item,
};
/*
@@ -585,6 +611,8 @@ xfs_agfl_free_finish_item(
extp->ext_len = xefi->xefi_blockcount;
efdp->efd_next_extent++;
+ xfs_fs_drop_intents(mp, xefi->xefi_realtime, xefi->xefi_startblock);
+
kmem_free(xefi);
return error;
}
@@ -597,6 +625,7 @@ const struct xfs_defer_op_type xfs_agfl_free_defer_type = {
.create_done = xfs_extent_free_create_done,
.finish_item = xfs_agfl_free_finish_item,
.cancel_item = xfs_extent_free_cancel_item,
+ .add_item = xfs_extent_free_add_item,
};
/* Is this recovered EFI ok? */
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 92cd5087958e..fdda9fbd638e 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1465,3 +1465,99 @@ xfs_hook_call(
{
return srcu_notifier_call_chain(&chain->head, val, priv);
}
+
+#if IS_ENABLED(CONFIG_XFS_ONLINE_SCRUB)
+
+#if IS_ENABLED(CONFIG_XFS_RT)
+static inline void
+xfs_rt_bump_intents(
+ struct xfs_mount *mp)
+{
+ trace_xfs_rt_bump_intents(mp, __return_address);
+
+ atomic_inc(&mp->m_rt_intents);
+}
+
+static inline void
+xfs_rt_drop_intents(
+ struct xfs_mount *mp)
+{
+ trace_xfs_rt_drop_intents(mp, __return_address);
+
+ ASSERT(atomic_read(&mp->m_rt_intents) > 0);
+
+ if (atomic_dec_and_test(&mp->m_rt_intents))
+ wake_up(&mp->m_rt_intents_wq);
+}
+
+int
+xfs_rt_wait_intents(
+ struct xfs_mount *mp)
+{
+ trace_xfs_rt_wait_intents(mp, __return_address);
+
+ return wait_event_killable(mp->m_rt_intents_wq,
+ atomic_read(&mp->m_rt_intents) == 0);
+}
+#else
+static inline void xfs_rt_bump_intents(struct xfs_mount *mp) { }
+static inline void xfs_rt_drop_intents(struct xfs_mount *mp) { }
+#endif /* CONFIG_XFS_RT */
+
+static inline void
+xfs_ag_bump_intents(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno)
+{
+ struct xfs_perag *pag = xfs_perag_get(mp, agno);
+
+ trace_xfs_perag_bump_intents(pag, __return_address);
+
+ atomic_inc(&pag->pag_intents);
+ xfs_perag_put(pag);
+}
+
+static inline void
+xfs_ag_drop_intents(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno)
+{
+ struct xfs_perag *pag = xfs_perag_get(mp, agno);
+
+ trace_xfs_perag_drop_intents(pag, __return_address);
+
+ ASSERT(atomic_read(&pag->pag_intents) > 0);
+
+ if (atomic_dec_and_test(&pag->pag_intents))
+ wake_up(&pag->pag_intents_wq);
+ xfs_perag_put(pag);
+}
+
+void
+xfs_fs_bump_intents(struct xfs_mount *mp, bool isrt, xfs_fsblock_t fsb)
+{
+ if (isrt)
+ xfs_rt_bump_intents(mp);
+ else
+ xfs_ag_bump_intents(mp, XFS_FSB_TO_AGNO(mp, fsb));
+}
+
+void
+xfs_fs_drop_intents(struct xfs_mount *mp, bool isrt, xfs_fsblock_t fsb)
+{
+ if (isrt)
+ xfs_rt_drop_intents(mp);
+ else
+ xfs_ag_drop_intents(mp, XFS_FSB_TO_AGNO(mp, fsb));
+}
+
+int
+xfs_perag_wait_intents(
+ struct xfs_perag *pag)
+{
+ trace_xfs_perag_wait_intents(pag, __return_address);
+
+ return wait_event_killable(pag->pag_intents_wq,
+ atomic_read(&pag->pag_intents) == 0);
+}
+#endif /* CONFIG_XFS_ONLINE_SCRUB */
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index a161aabd8438..62b8fe194219 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -263,6 +263,17 @@ typedef struct xfs_mount {
/* online nlink check stuff */
struct xfs_hook_chain m_nlink_mod_hooks;
#endif
+
+#if IS_ENABLED(CONFIG_XFS_ONLINE_SCRUB) && IS_ENABLED(CONFIG_XFS_RT)
+ /*
+ * Counter of live intents. We track the number of log intent items
+ * that have been queued (but not yet processed) so that scrub can
+ * detect the presence of other threads that are in the middle of
+ * processing a chain of deferred items.
+ */
+ atomic_t m_rt_intents;
+ wait_queue_head_t m_rt_intents_wq;
+#endif
} xfs_mount_t;
/* Parameters for xfs_bumplink/droplink hook. */
@@ -580,4 +591,26 @@ int xfs_hook_add(struct xfs_hook_chain *chain, struct notifier_block *hook,
void xfs_hook_del(struct xfs_hook_chain *chain, struct notifier_block *hook);
int xfs_hook_call(struct xfs_hook_chain *chain, unsigned long val, void *priv);
+#if IS_ENABLED(CONFIG_XFS_ONLINE_SCRUB)
+# if IS_ENABLED(CONFIG_XFS_RT)
+int xfs_rt_wait_intents(struct xfs_mount *mp);
+# else
+# define xfs_rt_wait_intents(mp) (-ENOSYS)
+# endif /* CONFIG_XFS_RT */
+
+void xfs_fs_bump_intents(struct xfs_mount *mp, bool isrt, xfs_fsblock_t fsb);
+void xfs_fs_drop_intents(struct xfs_mount *mp, bool isrt, xfs_fsblock_t fsb);
+int xfs_perag_wait_intents(struct xfs_perag *pag);
+
+#else
+static inline void
+xfs_fs_bump_intents(struct xfs_mount *mp, bool isrt, xfs_fsblock_t fsb) { }
+static inline void
+xfs_fs_drop_intents(struct xfs_mount *mp, bool isrt, xfs_fsblock_t fsb) { }
+
+int xfs_perag_wait_intents(struct xfs_perag *pag);
+# define xfs_perag_wait_intents(pag) (-ENOSYS)
+# define xfs_rt_wait_intents(mp) (-ENOSYS)
+#endif /* CONFIG_XFS_ONLINE_SCRUB */
+
#endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 006bd1086e50..f9d67cdf9c5d 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -364,9 +364,12 @@ xfs_refcount_update_finish_item(
struct xfs_btree_cur **state)
{
struct xfs_refcount_intent *ri;
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_fsblock_t orig_startblock;
int error;
ri = container_of(item, struct xfs_refcount_intent, ri_list);
+ orig_startblock = ri->ri_startblock;
error = xfs_trans_log_finish_refcount_update(tp, CUD_ITEM(done), ri,
state);
@@ -376,6 +379,13 @@ xfs_refcount_update_finish_item(
ri->ri_type == XFS_REFCOUNT_DECREASE);
return -EAGAIN;
}
+
+ /*
+ * Drop our intent counter reference now that we've finished all the
+ * work or failed. Be careful to use the original startblock because
+ * the finishing functions can update the intent state.
+ */
+ xfs_fs_drop_intents(mp, ri->ri_realtime, orig_startblock);
kmem_free(ri);
return error;
}
@@ -391,14 +401,29 @@ xfs_refcount_update_abort_intent(
/* Cancel a deferred refcount update. */
STATIC void
xfs_refcount_update_cancel_item(
+ struct xfs_mount *mp,
struct list_head *item)
{
struct xfs_refcount_intent *ri;
ri = container_of(item, struct xfs_refcount_intent, ri_list);
+ xfs_fs_drop_intents(mp, ri->ri_realtime, ri->ri_startblock);
kmem_free(ri);
}
+/* Add a deferred refcount update. */
+STATIC void
+xfs_refcount_update_add_item(
+ struct xfs_mount *mp,
+ const struct list_head *item)
+{
+ const struct xfs_refcount_intent *ri;
+
+ /* Grab an intent counter reference for this intent item. */
+ ri = container_of(item, struct xfs_refcount_intent, ri_list);
+ xfs_fs_bump_intents(mp, ri->ri_realtime, ri->ri_startblock);
+}
+
const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
.max_items = XFS_CUI_MAX_FAST_EXTENTS,
.create_intent = xfs_refcount_update_create_intent,
@@ -407,6 +432,7 @@ const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
.finish_item = xfs_refcount_update_finish_item,
.finish_cleanup = xfs_refcount_finish_one_cleanup,
.cancel_item = xfs_refcount_update_cancel_item,
+ .add_item = xfs_refcount_update_add_item,
};
/* Is this recovered CUI ok? */
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 6f115ca55746..5a3953088b5d 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -411,11 +411,19 @@ xfs_rmap_update_finish_item(
struct xfs_btree_cur **state)
{
struct xfs_rmap_intent *ri;
+ struct xfs_mount *mp = tp->t_mountp;
int error;
ri = container_of(item, struct xfs_rmap_intent, ri_list);
error = xfs_trans_log_finish_rmap_update(tp, RUD_ITEM(done), ri,
state);
+
+ /*
+ * Drop our intent counter reference now that we've finished all the
+ * work or failed. The finishing function doesn't update the intent
+ * state, so we need not preserve the original startblock.
+ */
+ xfs_fs_drop_intents(mp, ri->ri_realtime, ri->ri_bmap.br_startblock);
kmem_free(ri);
return error;
}
@@ -431,14 +439,30 @@ xfs_rmap_update_abort_intent(
/* Cancel a deferred rmap update. */
STATIC void
xfs_rmap_update_cancel_item(
+ struct xfs_mount *mp,
struct list_head *item)
{
struct xfs_rmap_intent *ri;
ri = container_of(item, struct xfs_rmap_intent, ri_list);
+ xfs_fs_drop_intents(mp, ri->ri_realtime, ri->ri_bmap.br_startblock);
kmem_free(ri);
}
+/* Add a deferred rmap update. */
+STATIC void
+xfs_rmap_update_add_item(
+ struct xfs_mount *mp,
+ const struct list_head *item)
+{
+ const struct xfs_rmap_intent *ri;
+
+ ri = container_of(item, struct xfs_rmap_intent, ri_list);
+
+ /* Grab an intent counter reference for this intent item. */
+ xfs_fs_bump_intents(mp, ri->ri_realtime, ri->ri_bmap.br_startblock);
+}
+
const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
.max_items = XFS_RUI_MAX_FAST_EXTENTS,
.create_intent = xfs_rmap_update_create_intent,
@@ -447,6 +471,7 @@ const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
.finish_item = xfs_rmap_update_finish_item,
.finish_cleanup = xfs_rmap_finish_one_cleanup,
.cancel_item = xfs_rmap_update_cancel_item,
+ .add_item = xfs_rmap_update_add_item,
};
/* Is this recovered RUI ok? */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index d8c03d84b78a..a0e47df75444 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -725,6 +725,9 @@ xfs_mount_free(
ASSERT(!mutex_is_locked(&mp->m_scrub_freeze));
mutex_destroy(&mp->m_scrub_freeze);
+#if IS_ENABLED(CONFIG_XFS_ONLINE_SCRUB) && IS_ENABLED(CONFIG_XFS_RT)
+ ASSERT(atomic_read(&mp->m_rt_intents) == 0);
+#endif
kmem_free(mp);
}
@@ -1977,6 +1980,10 @@ static int xfs_init_fs_context(
INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker);
INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
mp->m_kobj.kobject.kset = xfs_kset;
+#if IS_ENABLED(CONFIG_XFS_ONLINE_SCRUB) && IS_ENABLED(CONFIG_XFS_RT)
+ init_waitqueue_head(&mp->m_rt_intents_wq);
+ atomic_set(&mp->m_rt_intents, 0);
+#endif
/*
* We don't create the finobt per-ag space reservation until after log
* recovery, so we must set this to true so that an ifree transaction
diff --git a/fs/xfs/xfs_swapext_item.c b/fs/xfs/xfs_swapext_item.c
index 7c29e4c9737a..08fb0d514d69 100644
--- a/fs/xfs/xfs_swapext_item.c
+++ b/fs/xfs/xfs_swapext_item.c
@@ -346,6 +346,7 @@ xfs_swapext_abort_intent(
/* Cancel a deferred swapext update. */
STATIC void
xfs_swapext_cancel_item(
+ struct xfs_mount *mp,
struct list_head *item)
{
struct xfs_swapext_intent *sxi;
@@ -354,6 +355,14 @@ xfs_swapext_cancel_item(
kmem_free(sxi);
}
+/* Add a deferred swapext update. */
+STATIC void
+xfs_swapext_add_item(
+ struct xfs_mount *mp,
+ const struct list_head *item)
+{
+}
+
const struct xfs_defer_op_type xfs_swapext_defer_type = {
.max_items = XFS_SXI_MAX_FAST_EXTENTS,
.create_intent = xfs_swapext_create_intent,
@@ -361,6 +370,7 @@ const struct xfs_defer_op_type xfs_swapext_defer_type = {
.create_done = xfs_swapext_create_done,
.finish_item = xfs_swapext_finish_item,
.cancel_item = xfs_swapext_cancel_item,
+ .add_item = xfs_swapext_add_item,
};
/* Is this recovered SXI ok? */
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 7879e11115b8..c3b3c18dc3fb 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -271,6 +271,8 @@ DEFINE_FS_EVENT(xfs_blockgc_start);
DEFINE_FS_EVENT(xfs_blockgc_stop);
DEFINE_FS_EVENT(xfs_blockgc_worker);
DEFINE_FS_EVENT(xfs_blockgc_flush_all);
+DEFINE_FS_EVENT(xfs_force_shutdown1);
+DEFINE_FS_EVENT(xfs_force_shutdown2);
TRACE_EVENT(xfs_inodegc_shrinker_scan,
TP_PROTO(struct xfs_mount *mp, struct shrink_control *sc,
@@ -2748,6 +2750,44 @@ DEFINE_EVENT(xfs_free_extent_deferred_class, name, \
DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_extent_free_defer);
DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_extent_free_deferred);
+DECLARE_EVENT_CLASS(xfs_defer_pending_item_class,
+ TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp,
+ void *item),
+ TP_ARGS(mp, dfp, item),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(int, type)
+ __field(void *, intent)
+ __field(void *, item)
+ __field(char, committed)
+ __field(int, nr)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp ? mp->m_super->s_dev : 0;
+ __entry->type = dfp->dfp_type;
+ __entry->intent = dfp->dfp_intent;
+ __entry->item = item;
+ __entry->committed = dfp->dfp_done != NULL;
+ __entry->nr = dfp->dfp_count;
+ ),
+ TP_printk("dev %d:%d optype %d intent %p item %p committed %d nr %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->type,
+ __entry->intent,
+ __entry->item,
+ __entry->committed,
+ __entry->nr)
+)
+#define DEFINE_DEFER_PENDING_ITEM_EVENT(name) \
+DEFINE_EVENT(xfs_defer_pending_item_class, name, \
+ TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp, \
+ void *item), \
+ TP_ARGS(mp, dfp, item))
+
+DEFINE_DEFER_PENDING_ITEM_EVENT(xfs_defer_add_item);
+DEFINE_DEFER_PENDING_ITEM_EVENT(xfs_defer_cancel_item);
+DEFINE_DEFER_PENDING_ITEM_EVENT(xfs_defer_finish_item);
+
/* rmap tracepoints */
DECLARE_EVENT_CLASS(xfs_rmap_class,
TP_PROTO(struct xfs_btree_cur *cur,
@@ -4893,6 +4933,74 @@ DEFINE_IMETA_RESV_EVENT(xfs_imeta_resv_free_extent);
DEFINE_IMETA_RESV_EVENT(xfs_imeta_resv_critical);
DEFINE_INODE_ERROR_EVENT(xfs_imeta_resv_init_error);
+DECLARE_EVENT_CLASS(xfs_perag_intents_class,
+ TP_PROTO(struct xfs_perag *pag, void *caller_ip),
+ TP_ARGS(pag, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(long, nr_intents)
+ __field(void *, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = pag->pag_mount->m_super->s_dev;
+ __entry->agno = pag->pag_agno;
+#if IS_ENABLED(CONFIG_XFS_ONLINE_SCRUB)
+ __entry->nr_intents = atomic_read(&pag->pag_intents);
+#else
+ __entry->nr_intents = -1;
+#endif
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d agno 0x%x intents %ld caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->nr_intents,
+ __entry->caller_ip)
+);
+
+#define DEFINE_PERAG_INTENTS_EVENT(name) \
+DEFINE_EVENT(xfs_perag_intents_class, name, \
+ TP_PROTO(struct xfs_perag *pag, void *caller_ip), \
+ TP_ARGS(pag, caller_ip))
+DEFINE_PERAG_INTENTS_EVENT(xfs_perag_bump_intents);
+DEFINE_PERAG_INTENTS_EVENT(xfs_perag_drop_intents);
+DEFINE_PERAG_INTENTS_EVENT(xfs_perag_wait_intents);
+
+DECLARE_EVENT_CLASS(xfs_rt_intents_class,
+ TP_PROTO(struct xfs_mount *mp, void *caller_ip),
+ TP_ARGS(mp, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(dev_t, rtdev)
+ __field(long, nr_intents)
+ __field(void *, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->rtdev = mp->m_rtdev_targp->bt_dev;
+#if IS_ENABLED(CONFIG_XFS_ONLINE_SCRUB) && IS_ENABLED(CONFIG_XFS_RT)
+ __entry->nr_intents = atomic_read(&mp->m_rt_intents);
+#else
+ __entry->nr_intents = -1;
+#endif
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d rtdev %d:%d intents %ld caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ MAJOR(__entry->rtdev), MINOR(__entry->rtdev),
+ __entry->nr_intents,
+ __entry->caller_ip)
+);
+
+#define DEFINE_RT_INTENTS_EVENT(name) \
+DEFINE_EVENT(xfs_rt_intents_class, name, \
+ TP_PROTO(struct xfs_mount *mp, void *caller_ip), \
+ TP_ARGS(mp, caller_ip))
+DEFINE_RT_INTENTS_EVENT(xfs_rt_bump_intents);
+DEFINE_RT_INTENTS_EVENT(xfs_rt_drop_intents);
+DEFINE_RT_INTENTS_EVENT(xfs_rt_wait_intents);
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
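Both xchk_ag_read_headers() and xchk_rt_lock() in the patch above follow the same retry discipline: take the locks, sample the intent counter once, and if intents are pending, drop the locks before sleeping so the threads finishing those intents can take the locks and make progress. A hedged sketch of that loop follows, with hypothetical lock_ag_headers()/unlock_ag_headers() helpers standing in for the AGI/AGF/AGFL buffer reads:

/*
 * Sketch of the lock/sample/unlock/wait loop; lock_ag_headers() and
 * unlock_ag_headers() are hypothetical stand-ins, not patch functions.
 * Sampling once while all the header locks are held is enough: a thread
 * that bumps the counter after the sample cannot already be mid-chain,
 * since advancing a chain requires the header locks that scrub holds.
 */
static int quiesce_ag(struct xfs_perag *pag)
{
	int error;

	do {
		error = lock_ag_headers(pag);
		if (error)
			return error;
		if (atomic_read(&pag->pag_intents) == 0)
			return 0;		/* quiet; keep the locks */
		unlock_ag_headers(pag);		/* never sleep holding them */
		error = xfs_perag_wait_intents(pag);
	} while (!error);

	return error;
}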