author     Darrick J. Wong <djwong@kernel.org>  2021-10-22 15:31:05 -0700
committer  Darrick J. Wong <djwong@kernel.org>  2021-10-22 16:41:15 -0700
commit     b9872a3e43dcb62d05ee10f93ce45940e0674487 (patch)
tree       d944f29a906b713c507b29d47cd9137858ab9df8
parent     c5355cbaca02979360a5f1227ae3c4971222dc3d (diff)

xfs: allow queued AG intents to drain before scrubbing (scrub-drain-intents_2021-10-22)
Currently, online scrub isn't sufficiently careful about quiescing allocation groups before checking them. While scrub does take the AG header locks, it doesn't serialize against chains of AG update intents that are being processed concurrently. If there's a collision, cross-referencing between data structures (e.g. rmapbt and refcountbt) can yield false corruption reports; if repair is running, this results in incorrect repairs.

Fix this by adding a count of active intents to the perag structure and making scrub wait until there aren't any before continuing. This is a little stupid, since transactions can queue intents without taking buffer locks, but we'll wait for those transactions too.

XXX: Should we instead have a per-AG rwsem that gets taken as soon as the AG[IF] headers are locked and stays held until the transaction commits or moves on to the next AG? Would we rather have a SIX lock, so that intents can take an IX lock and not have to upgrade to X until we actually want to make changes to that AG? Is that even how those work?

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
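The mechanism is just an atomic counter paired with a waitqueue: every deferred-op hook that queues an intent bumps the count, every finish or cancel drops it, and scrub sleeps until the count reaches zero. Here is a minimal userspace model of that drain pattern, with a pthread condvar standing in for the kernel waitqueue; all names are illustrative, not the kernel API.

/* Userspace sketch of the intent-drain pattern this patch introduces. */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

struct intent_drain {
	pthread_mutex_t lock;
	pthread_cond_t  idle;
	unsigned int    nr_intents;
};

static void drain_bump(struct intent_drain *dr)
{
	pthread_mutex_lock(&dr->lock);
	dr->nr_intents++;			/* models xfs_fs_bump_intents() */
	pthread_mutex_unlock(&dr->lock);
}

static void drain_drop(struct intent_drain *dr)
{
	pthread_mutex_lock(&dr->lock);
	if (--dr->nr_intents == 0)
		pthread_cond_broadcast(&dr->idle);	/* models wake_up() */
	pthread_mutex_unlock(&dr->lock);
}

static void drain_wait(struct intent_drain *dr)
{
	pthread_mutex_lock(&dr->lock);
	while (dr->nr_intents > 0)		/* models wait_event_killable() */
		pthread_cond_wait(&dr->idle, &dr->lock);
	pthread_mutex_unlock(&dr->lock);
}

static void *chain_worker(void *arg)
{
	struct intent_drain *dr = arg;

	sleep(1);		/* pretend to finish a chain of deferred ops */
	drain_drop(dr);		/* ->finish_item or ->cancel_item retires it */
	return NULL;
}

int main(void)
{
	struct intent_drain dr = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.idle = PTHREAD_COND_INITIALIZER,
	};
	pthread_t t;

	drain_bump(&dr);	/* ->add_item queued an intent */
	pthread_create(&t, NULL, chain_worker, &dr);
	drain_wait(&dr);	/* scrub blocks until the AG goes quiet */
	printf("intents drained, AG is quiet\n");
	pthread_join(&t, NULL);
	return 0;
}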
-rw-r--r--  fs/xfs/libxfs/xfs_ag.c       |   6
-rw-r--r--  fs/xfs/libxfs/xfs_ag.h       |  11
-rw-r--r--  fs/xfs/libxfs/xfs_defer.c    |   9
-rw-r--r--  fs/xfs/libxfs/xfs_defer.h    |   3
-rw-r--r--  fs/xfs/scrub/bmap_repair.c   |   4
-rw-r--r--  fs/xfs/scrub/common.c        | 168
-rw-r--r--  fs/xfs/scrub/common.h        |   2
-rw-r--r--  fs/xfs/scrub/inode_repair.c  |   4
-rw-r--r--  fs/xfs/scrub/repair.c        |   3
-rw-r--r--  fs/xfs/scrub/rtrmap_repair.c |   4
-rw-r--r--  fs/xfs/scrub/trace.h         |  31
-rw-r--r--  fs/xfs/xfs_bmap_item.c       |  49
-rw-r--r--  fs/xfs/xfs_extfree_item.c    |  29
-rw-r--r--  fs/xfs/xfs_mount.c           |  95
-rw-r--r--  fs/xfs/xfs_mount.h           |  33
-rw-r--r--  fs/xfs/xfs_refcount_item.c   |  26
-rw-r--r--  fs/xfs/xfs_rmap_item.c       |  25
-rw-r--r--  fs/xfs/xfs_super.c           |   7
-rw-r--r--  fs/xfs/xfs_swapext_item.c    |  10
-rw-r--r--  fs/xfs/xfs_trace.h           | 106
20 files changed, 597 insertions, 28 deletions
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index dc9d78fb7bac..b6a9f6dde55a 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -193,6 +193,9 @@ xfs_free_perag(
spin_unlock(&mp->m_perag_lock);
ASSERT(pag);
ASSERT(atomic_read(&pag->pag_ref) == 0);
+#ifdef CONFIG_XFS_ONLINE_SCRUB
+ ASSERT(atomic_read(&pag->pag_intents) == 0);
+#endif
cancel_delayed_work_sync(&pag->pag_blockgc_work);
xfs_iunlink_destroy(pag);
@@ -254,6 +257,9 @@ xfs_initialize_perag(
spin_lock_init(&pag->pag_state_lock);
INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker);
INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
+#ifdef CONFIG_XFS_ONLINE_SCRUB
+ init_waitqueue_head(&pag->pag_intents_wq);
+#endif
init_waitqueue_head(&pag->pagb_wait);
pag->pagb_count = 0;
pag->pagb_tree = RB_ROOT;
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index c9e198e62b74..cfc51a5af74a 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -103,6 +103,17 @@ struct xfs_perag {
* or have some other means to control concurrency.
*/
struct rhashtable pagi_unlinked_hash;
+
+#ifdef CONFIG_XFS_ONLINE_SCRUB
+ /*
+ * Counter of live intents. We track the number of log intent items
+ * that have been queued (but not yet processed) so that scrub can
+ * detect the presence of other threads that are in the middle of
+ * processing a chain of deferred items.
+ */
+ atomic_t pag_intents;
+ wait_queue_head_t pag_intents_wq;
+#endif
};
int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t agcount,
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index 3045ad184972..108d950b6f41 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -363,7 +363,8 @@ xfs_defer_cancel_list(
list_for_each_safe(pwi, n, &dfp->dfp_work) {
list_del(pwi);
dfp->dfp_count--;
- ops->cancel_item(pwi);
+ trace_xfs_defer_cancel_item(mp, dfp, pwi);
+ ops->cancel_item(mp, pwi);
}
ASSERT(dfp->dfp_count == 0);
kmem_free(dfp);
@@ -442,6 +443,7 @@ xfs_defer_finish_one(
list_for_each_safe(li, n, &dfp->dfp_work) {
list_del(li);
dfp->dfp_count--;
+ trace_xfs_defer_finish_item(tp->t_mountp, dfp, li);
error = ops->finish_item(tp, dfp->dfp_done, li, &state);
if (error == -EAGAIN) {
/*
@@ -585,7 +587,7 @@ xfs_defer_add(
struct list_head *li)
{
struct xfs_defer_pending *dfp = NULL;
- const struct xfs_defer_op_type *ops;
+ const struct xfs_defer_op_type *ops = defer_op_types[type];
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
BUILD_BUG_ON(ARRAY_SIZE(defer_op_types) != XFS_DEFER_OPS_TYPE_MAX);
@@ -598,7 +600,6 @@ xfs_defer_add(
if (!list_empty(&tp->t_dfops)) {
dfp = list_last_entry(&tp->t_dfops,
struct xfs_defer_pending, dfp_list);
- ops = defer_op_types[dfp->dfp_type];
if (dfp->dfp_type != type ||
(ops->max_items && dfp->dfp_count >= ops->max_items))
dfp = NULL;
@@ -616,6 +617,8 @@ xfs_defer_add(
}
list_add_tail(li, &dfp->dfp_work);
+ trace_xfs_defer_add_item(tp->t_mountp, dfp, li);
+ ops->add_item(tp->t_mountp, li);
dfp->dfp_count++;
}
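To see how the new ->add_item hook pairs with ->cancel_item, here is a compact userspace model of the vtable plumbing above; the demo_* names and the simplified structs are invented for illustration and are not the kernel API.

#include <stdio.h>

struct mount { int nr_intents; };
struct work_item { int payload; };

/* Trimmed-down stand-in for struct xfs_defer_op_type. */
struct defer_op_type {
	void (*add_item)(struct mount *mp, const struct work_item *wi);
	void (*cancel_item)(struct mount *mp, struct work_item *wi);
};

static void demo_add_item(struct mount *mp, const struct work_item *wi)
{
	(void)wi;
	mp->nr_intents++;	/* stands in for xfs_fs_bump_intents() */
}

static void demo_cancel_item(struct mount *mp, struct work_item *wi)
{
	(void)wi;
	mp->nr_intents--;	/* stands in for xfs_fs_drop_intents() */
}

static const struct defer_op_type demo_defer_type = {
	.add_item    = demo_add_item,
	.cancel_item = demo_cancel_item,
};

/* Caller side mirrors the xfs_defer_add()/xfs_defer_cancel_list() pairing. */
int main(void)
{
	struct mount mp = { 0 };
	struct work_item wi = { 42 };

	demo_defer_type.add_item(&mp, &wi);
	printf("live intents after add: %d\n", mp.nr_intents);
	demo_defer_type.cancel_item(&mp, &wi);
	printf("live intents after cancel: %d\n", mp.nr_intents);
	return 0;
}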
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index b4d23235931d..51e7c992d95e 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -55,7 +55,8 @@ struct xfs_defer_op_type {
struct list_head *item, struct xfs_btree_cur **state);
void (*finish_cleanup)(struct xfs_trans *tp,
struct xfs_btree_cur *state, int error);
- void (*cancel_item)(struct list_head *item);
+ void (*cancel_item)(struct xfs_mount *mp, struct list_head *item);
+ void (*add_item)(struct xfs_mount *mp, const struct list_head *item);
unsigned int max_items;
};
diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c
index 334d970b1314..0f52356d6ed3 100644
--- a/fs/xfs/scrub/bmap_repair.c
+++ b/fs/xfs/scrub/bmap_repair.c
@@ -333,7 +333,9 @@ xrep_bmap_scan_rt(
if (xrep_is_rtmeta_ino(sc, sc->ip->i_ino))
return 0;
- xchk_rt_lock(sc, &sc->sr);
+ error = xchk_rt_lock(sc, &sc->sr);
+ if (error)
+ return error;
xrep_rt_btcur_init(sc, &sc->sr);
error = xfs_rmap_query_all(sc->sr.rmap_cur, xrep_bmap_walk_rtrmap, rb);
xchk_rt_btcur_free(&sc->sr);
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 0ea9b6b299ae..323179b3d17a 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -484,7 +484,35 @@ want_ag_read_header_failure(
*
* The headers should be released by xchk_ag_free, but as a fail safe we attach
* all the buffers we grab to the scrub transaction so they'll all be freed
- * when we cancel it. Returns ENOENT if we can't grab the perag structure.
+ * when we cancel it.
+ */
+static inline int
+__xchk_ag_read_headers(
+ struct xfs_scrub *sc,
+ xfs_agnumber_t agno,
+ struct xchk_ag *sa)
+{
+ struct xfs_mount *mp = sc->mp;
+ int error;
+
+ error = xfs_ialloc_read_agi(mp, sc->tp, agno, &sa->agi_bp);
+ if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI))
+ return error;
+
+ error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &sa->agf_bp);
+ if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF))
+ return error;
+
+ error = xfs_alloc_read_agfl(mp, sc->tp, agno, &sa->agfl_bp);
+ if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGFL))
+ return error;
+
+ return 0;
+}
+
+/*
+ * Grab all the headers for an AG, and wait until there aren't any pending
+ * intents. Returns -ENOENT if we can't grab the perag structure.
*/
int
xchk_ag_read_headers(
@@ -502,29 +530,83 @@ xchk_ag_read_headers(
return xchk_ag_lock(sc);
}
-/* Lock the AG headers. */
+static inline bool
+xchk_ag_intents_pending(
+ struct xfs_perag *pag)
+{
+ int intents = atomic_read(&pag->pag_intents);
+
+ trace_xchk_ag_read_headers(pag->pag_mount, pag->pag_agno, intents,
+ _RET_IP_);
+
+ return intents > 0;
+}
+
+/* Lock the AG headers, waiting for pending intents to drain. */
int
xchk_ag_lock(
struct xfs_scrub *sc)
{
- struct xfs_mount *mp = sc->mp;
struct xchk_ag *sa = &sc->sa;
- xfs_agnumber_t agno = sa->pag->pag_agno;
- int error;
+ int error = 0;
- error = xfs_ialloc_read_agi(mp, sc->tp, agno, &sa->agi_bp);
- if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI))
- return error;
+ ASSERT(sa->pag != NULL);
+ ASSERT(sa->agi_bp == NULL);
+ ASSERT(sa->agf_bp == NULL);
+ ASSERT(sa->agfl_bp == NULL);
- error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &sa->agf_bp);
- if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF))
- return error;
+ do {
+ if (xchk_should_terminate(sc, &error))
+ break;
- error = xfs_alloc_read_agfl(mp, sc->tp, agno, &sa->agfl_bp);
- if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGFL))
- return error;
+ error = __xchk_ag_read_headers(sc, sa->pag->pag_agno, sa);
+ if (error)
+ break;
- return 0;
+ /*
+ * Decide if this AG is quiet enough for all metadata to be
+ * consistent with each other. XFS allows the AG header buffer
+ * locks to cycle across transaction rolls while processing
+ * chains of deferred ops, which means that there could be
+ * other threads in the middle of processing a chain of
+ * deferred ops. For regular operations we are careful about
+ * ordering operations to prevent collisions between threads
+ * (which is why we don't need a per-AG lock), but scrub and
+ * repair have to serialize against chained operations.
+ *
+ * We just locked all the AG headers buffers; now take a look
+ * to see if there are any intents in progress. If there are,
+ * drop the AG headers and wait for the intents to drain.
+ * Since we hold all the AG header locks for the duration of
+ * the scrub, this is the only time we have to sample the
+ * intents counter; any threads increasing it after this point
+ * can't possibly be in the middle of a chain of AG metadata
+ * updates.
+ */
+ if (!xchk_ag_intents_pending(sa->pag)) {
+ error = 0;
+ break;
+ }
+
+ if (sa->agfl_bp) {
+ xfs_trans_brelse(sc->tp, sa->agfl_bp);
+ sa->agfl_bp = NULL;
+ }
+
+ if (sa->agf_bp) {
+ xfs_trans_brelse(sc->tp, sa->agf_bp);
+ sa->agf_bp = NULL;
+ }
+
+ if (sa->agi_bp) {
+ xfs_trans_brelse(sc->tp, sa->agi_bp);
+ sa->agi_bp = NULL;
+ }
+
+ error = xfs_perag_wait_intents(sa->pag);
+ } while (!error);
+
+ return error;
}
/* Release all the AG btree cursors. */
@@ -653,14 +735,62 @@ xchk_ag_init(
return 0;
}
-/* Lock everything we need to work on realtime metadata. */
-void
+#if IS_ENABLED(CONFIG_XFS_RT)
+static inline bool
+xchk_rt_intents_pending(
+ struct xfs_mount *mp)
+{
+ int intents = atomic_read(&mp->m_rt_intents);
+
+ trace_xchk_rt_lock(mp, -1U, intents, _RET_IP_);
+
+ return intents > 0;
+}
+#else
+# define xchk_rt_intents_pending(mp) (false)
+#endif
+
+/* Lock everything we need to work on realtime metadata and wait for intents. */
+int
xchk_rt_lock(
struct xfs_scrub *sc,
struct xchk_rt *sr)
{
- xfs_rtlock(NULL, sc->mp, XFS_RTLOCK_ALL);
- sr->locked = true;
+ int error = 0;
+
+ do {
+ if (xchk_should_terminate(sc, &error))
+ break;
+
+ xfs_rtlock(NULL, sc->mp, XFS_RTLOCK_ALL);
+
+ /*
+ * Decide if the RT volume is quiet enough for all metadata to
+ * be consistent with each other. Regular file IO doesn't get
+ * to lock all the rt inodes at the same time, which means that
+ * there could be other threads in the middle of processing a
+ * chain of deferred ops.
+ *
+ * We just locked all the rt inodes; now take a look to see if
+ * there are any rt intents in progress. If there are, drop
+ * the rt inode locks and wait for the intents to drain. Since
+ * we hold the rt inode locks for the duration of the scrub,
+ * this is the only time we have to sample the intents counter;
+ * any threads increasing it after this point can't possibly be
+ * in the middle of a chain of rt metadata updates.
+ */
+ if (!xchk_rt_intents_pending(sc->mp)) {
+ sr->locked = true;
+ error = 0;
+ break;
+ }
+
+ xfs_rtunlock(sc->mp, XFS_RTLOCK_ALL);
+
+ error = xfs_rt_wait_intents(sc->mp);
+ } while (!error);
+
+ return error;
}
/*
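The ordering in xchk_ag_lock() above is the crux: take every AG header lock first, then sample the intent counter once; if it's nonzero, release the locks so running chains can finish, wait, and retry. A sketch of that quiesce loop follows, using C11 atomics with a busy-wait standing in for xfs_perag_wait_intents(), which sleeps on a waitqueue in the kernel; all names are illustrative.

#include <stdatomic.h>
#include <stdio.h>

static atomic_int nr_intents;			/* models pag->pag_intents */

static void lock_ag_headers(void)   { /* agi_bp, agf_bp, agfl_bp */ }
static void unlock_ag_headers(void) { }

static void wait_for_intents(void)
{
	while (atomic_load(&nr_intents) > 0)
		;	/* the kernel sleeps here instead of spinning */
}

/*
 * Sample the counter only after taking every AG header lock; per the
 * comment in xchk_ag_lock(), threads that bump it after this point
 * can't be in the middle of a chain of AG metadata updates.
 */
static void quiesce_ag(void)
{
	for (;;) {
		lock_ag_headers();
		if (atomic_load(&nr_intents) == 0)
			return;		/* AG is quiet; keep the locks */
		unlock_ag_headers();	/* let the running chains finish */
		wait_for_intents();
	}
}

int main(void)
{
	quiesce_ag();
	printf("AG header locks held with no intents in flight\n");
	unlock_ag_headers();
	return 0;
}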
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index 74f0606174df..819bb7e2007a 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -157,7 +157,7 @@ xchk_ag_init_existing(
void xchk_rt_init(struct xfs_scrub *sc, struct xchk_rt *sr);
void xchk_rt_btcur_free(struct xchk_rt *sr);
-void xchk_rt_lock(struct xfs_scrub *sc, struct xchk_rt *sr);
+int xchk_rt_lock(struct xfs_scrub *sc, struct xchk_rt *sr);
void xchk_rt_unlock(struct xfs_scrub *sc, struct xchk_rt *sr);
int xchk_ag_read_headers(struct xfs_scrub *sc, xfs_agnumber_t agno,
struct xchk_ag *sa);
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
index 97605313f097..8eec28aa1a95 100644
--- a/fs/xfs/scrub/inode_repair.c
+++ b/fs/xfs/scrub/inode_repair.c
@@ -597,7 +597,9 @@ xrep_dinode_count_rt_rmaps(
xrep_is_rtmeta_ino(sc, sc->sm->sm_ino))
return 0;
- xchk_rt_lock(sc, &sc->sr);
+ error = xchk_rt_lock(sc, &sc->sr);
+ if (error)
+ return error;
xrep_rt_btcur_init(sc, &sc->sr);
error = xfs_rmap_query_all(sc->sr.rmap_cur, xrep_dinode_walk_rtrmap,
dis);
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 47592ca215c6..e35c2c8b0513 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -477,6 +477,7 @@ xrep_newbt_schedule_reap(
INIT_LIST_HEAD(&efi_item.xefi_list);
list_add(&efi_item.xefi_list, &items);
+ xfs_fs_bump_intents(xnr->sc->mp, false, resv->fsbno);
resv->efi = xfs_extent_free_defer_type.create_intent(xnr->sc->tp,
&items, 1, false);
}
@@ -698,6 +699,7 @@ xrep_newbt_destroy(
goto junkit;
list_del(&resv->list);
+ xfs_fs_drop_intents(sc->mp, false, resv->fsbno);
kmem_free(resv);
}
@@ -710,6 +712,7 @@ junkit:
list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
xfs_extent_free_defer_type.abort_intent(resv->efi);
list_del(&resv->list);
+ xfs_fs_drop_intents(sc->mp, false, resv->fsbno);
kmem_free(resv);
}
diff --git a/fs/xfs/scrub/rtrmap_repair.c b/fs/xfs/scrub/rtrmap_repair.c
index f669f51f6f70..d68ef4d1ec16 100644
--- a/fs/xfs/scrub/rtrmap_repair.c
+++ b/fs/xfs/scrub/rtrmap_repair.c
@@ -569,7 +569,9 @@ xrep_rtrmap_find_rmaps(
error = xchk_setup_fs(sc);
if (error)
return error;
- xchk_rt_lock(sc, &sc->sr);
+ error = xchk_rt_lock(sc, &sc->sr);
+ if (error)
+ return error;
/* Scan for old rtrmap blocks. */
for_each_perag(sc->mp, agno, pag) {
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 9878ee415e8b..5c43578a174e 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -717,6 +717,37 @@ TRACE_EVENT(xchk_iallocbt_check_cluster,
__entry->cluster_ino)
)
+DECLARE_EVENT_CLASS(xchk_ag_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount,
+ unsigned long caller_ip),
+ TP_ARGS(mp, agno, refcount, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(int, refcount)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->refcount = refcount;
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d agno %u refcount %d caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->refcount,
+ (char *)__entry->caller_ip)
+);
+
+#define DEFINE_XCHK_AG_EVENT(name) \
+DEFINE_EVENT(xchk_ag_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \
+ unsigned long caller_ip), \
+ TP_ARGS(mp, agno, refcount, caller_ip))
+DEFINE_XCHK_AG_EVENT(xchk_ag_read_headers);
+DEFINE_XCHK_AG_EVENT(xchk_rt_lock);
+
TRACE_EVENT(xchk_fscounters_calc,
TP_PROTO(struct xfs_mount *mp, uint64_t icount, uint64_t ifree,
uint64_t fdblocks, uint64_t delalloc),
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 460876300451..d7bf99f10e4c 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -351,14 +351,30 @@ xfs_bmap_update_finish_item(
struct xfs_btree_cur **state)
{
struct xfs_bmap_intent *bi;
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_fsblock_t orig_startblock;
int error;
bi = container_of(item, struct xfs_bmap_intent, bi_list);
+ orig_startblock = bi->bi_bmap.br_startblock;
error = xfs_trans_log_finish_bmap_update(tp, BUD_ITEM(done), bi);
if (!error && bi->bi_bmap.br_blockcount > 0) {
ASSERT(bi->bi_type == XFS_BMAP_UNMAP);
return -EAGAIN;
}
+
+ /*
+ * Drop our intent counter reference now that we've either queued a
+ * deferred rmap intent or failed. Be careful to use the original
+ * startblock since the finishing functions can update the intent
+ * state.
+ */
+ if (xfs_has_rmapbt(mp)) {
+ bool rt = xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork);
+
+ xfs_fs_drop_intents(mp, rt, orig_startblock);
+ }
+
kmem_free(bi);
return error;
}
@@ -371,17 +387,47 @@ xfs_bmap_update_abort_intent(
xfs_bui_release(BUI_ITEM(intent));
}
-/* Cancel a deferred rmap update. */
+/* Cancel a deferred bmap update. */
STATIC void
xfs_bmap_update_cancel_item(
+ struct xfs_mount *mp,
struct list_head *item)
{
struct xfs_bmap_intent *bi;
bi = container_of(item, struct xfs_bmap_intent, bi_list);
+
+ /* Drop our intent counter reference since we're going away. */
+ if (xfs_has_rmapbt(mp)) {
+ bool rt = xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork);
+
+ xfs_fs_drop_intents(mp, rt, bi->bi_bmap.br_startblock);
+ }
+
kmem_free(bi);
}
+/* Add a deferred bmap update. */
+STATIC void
+xfs_bmap_update_add_item(
+ struct xfs_mount *mp,
+ const struct list_head *item)
+{
+ const struct xfs_bmap_intent *bi;
+
+ bi = container_of(item, struct xfs_bmap_intent, bi_list);
+
+ /*
+ * Grab an intent counter reference on behalf of the deferred rmap
+ * intent item that we will queue when we finish this bmap work.
+ */
+ if (xfs_has_rmapbt(mp)) {
+ bool rt = xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork);
+
+ xfs_fs_bump_intents(mp, rt, bi->bi_bmap.br_startblock);
+ }
+}
+
const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
.max_items = XFS_BUI_MAX_FAST_EXTENTS,
.create_intent = xfs_bmap_update_create_intent,
@@ -389,6 +435,7 @@ const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
.create_done = xfs_bmap_update_create_done,
.finish_item = xfs_bmap_update_finish_item,
.cancel_item = xfs_bmap_update_cancel_item,
+ .add_item = xfs_bmap_update_add_item,
};
/* Is this recovered BUI ok? */
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index f93b033b447e..e38e428eeb59 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -487,6 +487,7 @@ xfs_extent_free_finish_item(
struct list_head *item,
struct xfs_btree_cur **state)
{
+ struct xfs_mount *mp = tp->t_mountp;
struct xfs_extent_free_item *xefi;
int error;
@@ -502,6 +503,14 @@ xfs_extent_free_finish_item(
}
error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi);
+
+ /*
+ * Drop our intent counter reference now that we've finished all the
+ * work or failed. The finishing function doesn't update the intent
+ * state, so we need not preserve the original startblock.
+ */
+ xfs_fs_drop_intents(mp, xefi->xefi_realtime, xefi->xefi_startblock);
+
kmem_free(xefi);
return error;
}
@@ -517,14 +526,30 @@ xfs_extent_free_abort_intent(
/* Cancel a free extent. */
STATIC void
xfs_extent_free_cancel_item(
+ struct xfs_mount *mp,
struct list_head *item)
{
struct xfs_extent_free_item *xefi;
xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
+ xfs_fs_drop_intents(mp, xefi->xefi_realtime, xefi->xefi_startblock);
kmem_free(xefi);
}
+/* Add a deferred free extent. */
+STATIC void
+xfs_extent_free_add_item(
+ struct xfs_mount *mp,
+ const struct list_head *item)
+{
+ const struct xfs_extent_free_item *xefi;
+
+ xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
+
+ /* Grab an intent counter reference for this intent item. */
+ xfs_fs_bump_intents(mp, xefi->xefi_realtime, xefi->xefi_startblock);
+}
+
const struct xfs_defer_op_type xfs_extent_free_defer_type = {
.max_items = XFS_EFI_MAX_FAST_EXTENTS,
.create_intent = xfs_extent_free_create_intent,
@@ -532,6 +557,7 @@ const struct xfs_defer_op_type xfs_extent_free_defer_type = {
.create_done = xfs_extent_free_create_done,
.finish_item = xfs_extent_free_finish_item,
.cancel_item = xfs_extent_free_cancel_item,
+ .add_item = xfs_extent_free_add_item,
};
/*
@@ -585,6 +611,8 @@ xfs_agfl_free_finish_item(
extp->ext_len = xefi->xefi_blockcount;
efdp->efd_next_extent++;
+ xfs_fs_drop_intents(mp, xefi->xefi_realtime, xefi->xefi_startblock);
+
kmem_free(xefi);
return error;
}
@@ -597,6 +625,7 @@ const struct xfs_defer_op_type xfs_agfl_free_defer_type = {
.create_done = xfs_extent_free_create_done,
.finish_item = xfs_agfl_free_finish_item,
.cancel_item = xfs_extent_free_cancel_item,
+ .add_item = xfs_extent_free_add_item,
};
/* Is this recovered EFI ok? */
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index f0643442aecc..b02be0019ac8 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1467,3 +1467,98 @@ xfs_hook_call(
return srcu_notifier_call_chain(&chain->head, val, priv);
}
#endif /* CONFIG_XFS_LIVE_HOOKS */
+
+#ifdef CONFIG_XFS_ONLINE_SCRUB
+# ifdef CONFIG_XFS_RT
+static inline void
+xfs_rt_bump_intents(
+ struct xfs_mount *mp)
+{
+ trace_xfs_rt_bump_intents(mp, __return_address);
+
+ atomic_inc(&mp->m_rt_intents);
+}
+
+static inline void
+xfs_rt_drop_intents(
+ struct xfs_mount *mp)
+{
+ trace_xfs_rt_drop_intents(mp, __return_address);
+
+ ASSERT(atomic_read(&mp->m_rt_intents) > 0);
+
+ if (atomic_dec_and_test(&mp->m_rt_intents))
+ wake_up(&mp->m_rt_intents_wq);
+}
+
+int
+xfs_rt_wait_intents(
+ struct xfs_mount *mp)
+{
+ trace_xfs_rt_wait_intents(mp, __return_address);
+
+ return wait_event_killable(mp->m_rt_intents_wq,
+ atomic_read(&mp->m_rt_intents) == 0);
+}
+# else
+static inline void xfs_rt_bump_intents(struct xfs_mount *mp) { }
+static inline void xfs_rt_drop_intents(struct xfs_mount *mp) { }
+# endif /* CONFIG_XFS_RT */
+
+static inline void
+xfs_ag_bump_intents(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno)
+{
+ struct xfs_perag *pag = xfs_perag_get(mp, agno);
+
+ trace_xfs_perag_bump_intents(pag, __return_address);
+
+ atomic_inc(&pag->pag_intents);
+ xfs_perag_put(pag);
+}
+
+static inline void
+xfs_ag_drop_intents(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno)
+{
+ struct xfs_perag *pag = xfs_perag_get(mp, agno);
+
+ trace_xfs_perag_drop_intents(pag, __return_address);
+
+ ASSERT(atomic_read(&pag->pag_intents) > 0);
+
+ if (atomic_dec_and_test(&pag->pag_intents))
+ wake_up(&pag->pag_intents_wq);
+ xfs_perag_put(pag);
+}
+
+void
+xfs_fs_bump_intents(struct xfs_mount *mp, bool isrt, xfs_fsblock_t fsb)
+{
+ if (isrt)
+ xfs_rt_bump_intents(mp);
+ else
+ xfs_ag_bump_intents(mp, XFS_FSB_TO_AGNO(mp, fsb));
+}
+
+void
+xfs_fs_drop_intents(struct xfs_mount *mp, bool isrt, xfs_fsblock_t fsb)
+{
+ if (isrt)
+ xfs_rt_drop_intents(mp);
+ else
+ xfs_ag_drop_intents(mp, XFS_FSB_TO_AGNO(mp, fsb));
+}
+
+int
+xfs_perag_wait_intents(
+ struct xfs_perag *pag)
+{
+ trace_xfs_perag_wait_intents(pag, __return_address);
+
+ return wait_event_killable(pag->pag_intents_wq,
+ atomic_read(&pag->pag_intents) == 0);
+}
+#endif /* CONFIG_XFS_ONLINE_SCRUB */
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 8ff3fc3e83de..05b791aa24d1 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -274,6 +274,17 @@ typedef struct xfs_mount {
/* online file link count check stuff */
struct xfs_hook_chain m_nlink_delta_hooks;
+
+#if IS_ENABLED(CONFIG_XFS_ONLINE_SCRUB) && IS_ENABLED(CONFIG_XFS_RT)
+ /*
+ * Counter of live intents. We track the number of log intent items
+ * that have been queued (but not yet processed) so that scrub can
+ * detect the presence of other threads that are in the middle of
+ * processing a chain of deferred items.
+ */
+ atomic_t m_rt_intents;
+ wait_queue_head_t m_rt_intents_wq;
+#endif
} xfs_mount_t;
/*
@@ -593,4 +604,26 @@ struct xfs_error_cfg * xfs_error_get_cfg(struct xfs_mount *mp,
void xfs_force_summary_recalc(struct xfs_mount *mp);
void xfs_mod_delalloc(struct xfs_mount *mp, int64_t delta);
+#if IS_ENABLED(CONFIG_XFS_ONLINE_SCRUB)
+# if IS_ENABLED(CONFIG_XFS_RT)
+int xfs_rt_wait_intents(struct xfs_mount *mp);
+# else
+# define xfs_rt_wait_intents(mp) (-ENOSYS)
+# endif /* CONFIG_XFS_RT */
+
+void xfs_fs_bump_intents(struct xfs_mount *mp, bool isrt, xfs_fsblock_t fsb);
+void xfs_fs_drop_intents(struct xfs_mount *mp, bool isrt, xfs_fsblock_t fsb);
+int xfs_perag_wait_intents(struct xfs_perag *pag);
+
+#else
+static inline void
+xfs_fs_bump_intents(struct xfs_mount *mp, bool isrt, xfs_fsblock_t fsb) { }
+static inline void
+xfs_fs_drop_intents(struct xfs_mount *mp, bool isrt, xfs_fsblock_t fsb) { }
+
+int xfs_perag_wait_intents(struct xfs_perag *pag);
+# define xfs_perag_wait_intents(pag) (-ENOSYS)
+# define xfs_rt_wait_intents(mp) (-ENOSYS)
+#endif /* CONFIG_XFS_ONLINE_SCRUB */
+
#endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 5812e6e1fc06..b50248f1df8a 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -364,9 +364,12 @@ xfs_refcount_update_finish_item(
struct xfs_btree_cur **state)
{
struct xfs_refcount_intent *ri;
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_fsblock_t orig_startblock;
int error;
ri = container_of(item, struct xfs_refcount_intent, ri_list);
+ orig_startblock = ri->ri_startblock;
error = xfs_trans_log_finish_refcount_update(tp, CUD_ITEM(done), ri,
state);
@@ -376,6 +379,13 @@ xfs_refcount_update_finish_item(
ri->ri_type == XFS_REFCOUNT_DECREASE);
return -EAGAIN;
}
+
+ /*
+ * Drop our intent counter reference now that we've finished all the
+ * work or failed. Be careful to use the original startblock because
+ * the finishing functions can update the intent state.
+ */
+ xfs_fs_drop_intents(mp, ri->ri_realtime, orig_startblock);
kmem_free(ri);
return error;
}
@@ -391,14 +401,29 @@ xfs_refcount_update_abort_intent(
/* Cancel a deferred refcount update. */
STATIC void
xfs_refcount_update_cancel_item(
+ struct xfs_mount *mp,
struct list_head *item)
{
struct xfs_refcount_intent *ri;
ri = container_of(item, struct xfs_refcount_intent, ri_list);
+ xfs_fs_drop_intents(mp, ri->ri_realtime, ri->ri_startblock);
kmem_free(ri);
}
+/* Add a deferred refcount update. */
+STATIC void
+xfs_refcount_update_add_item(
+ struct xfs_mount *mp,
+ const struct list_head *item)
+{
+ const struct xfs_refcount_intent *ri;
+
+ /* Grab an intent counter reference for this intent item. */
+ ri = container_of(item, struct xfs_refcount_intent, ri_list);
+ xfs_fs_bump_intents(mp, ri->ri_realtime, ri->ri_startblock);
+}
+
const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
.max_items = XFS_CUI_MAX_FAST_EXTENTS,
.create_intent = xfs_refcount_update_create_intent,
@@ -407,6 +432,7 @@ const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
.finish_item = xfs_refcount_update_finish_item,
.finish_cleanup = xfs_refcount_finish_one_cleanup,
.cancel_item = xfs_refcount_update_cancel_item,
+ .add_item = xfs_refcount_update_add_item,
};
/* Is this recovered CUI ok? */
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index f08ed05c98be..de6b122ac126 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -411,11 +411,19 @@ xfs_rmap_update_finish_item(
struct xfs_btree_cur **state)
{
struct xfs_rmap_intent *ri;
+ struct xfs_mount *mp = tp->t_mountp;
int error;
ri = container_of(item, struct xfs_rmap_intent, ri_list);
error = xfs_trans_log_finish_rmap_update(tp, RUD_ITEM(done), ri,
state);
+
+ /*
+ * Drop our intent counter reference now that we've finished all the
+ * work or failed. The finishing function doesn't update the intent
+ * state, so we need not preserve the original startblock.
+ */
+ xfs_fs_drop_intents(mp, ri->ri_realtime, ri->ri_bmap.br_startblock);
kmem_free(ri);
return error;
}
@@ -431,14 +439,30 @@ xfs_rmap_update_abort_intent(
/* Cancel a deferred rmap update. */
STATIC void
xfs_rmap_update_cancel_item(
+ struct xfs_mount *mp,
struct list_head *item)
{
struct xfs_rmap_intent *ri;
ri = container_of(item, struct xfs_rmap_intent, ri_list);
+ xfs_fs_drop_intents(mp, ri->ri_realtime, ri->ri_bmap.br_startblock);
kmem_free(ri);
}
+/* Add a deferred rmap update. */
+STATIC void
+xfs_rmap_update_add_item(
+ struct xfs_mount *mp,
+ const struct list_head *item)
+{
+ const struct xfs_rmap_intent *ri;
+
+ ri = container_of(item, struct xfs_rmap_intent, ri_list);
+
+ /* Grab an intent counter reference for this intent item. */
+ xfs_fs_bump_intents(mp, ri->ri_realtime, ri->ri_bmap.br_startblock);
+}
+
const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
.max_items = XFS_RUI_MAX_FAST_EXTENTS,
.create_intent = xfs_rmap_update_create_intent,
@@ -447,6 +471,7 @@ const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
.finish_item = xfs_rmap_update_finish_item,
.finish_cleanup = xfs_rmap_finish_one_cleanup,
.cancel_item = xfs_rmap_update_cancel_item,
+ .add_item = xfs_rmap_update_add_item,
};
/* Is this recovered RUI ok? */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index f0f2e478a794..c3f3c669a1dc 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -733,6 +733,9 @@ xfs_mount_free(
ASSERT(!mutex_is_locked(&mp->m_scrub_freeze));
mutex_destroy(&mp->m_scrub_freeze);
+#if IS_ENABLED(CONFIG_XFS_ONLINE_SCRUB) && IS_ENABLED(CONFIG_XFS_RT)
+ ASSERT(atomic_read(&mp->m_rt_intents) == 0);
+#endif
kmem_free(mp);
}
@@ -1985,6 +1988,10 @@ static int xfs_init_fs_context(
INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker);
INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
mp->m_kobj.kobject.kset = xfs_kset;
+#if IS_ENABLED(CONFIG_XFS_ONLINE_SCRUB) && IS_ENABLED(CONFIG_XFS_RT)
+ init_waitqueue_head(&mp->m_rt_intents_wq);
+ atomic_set(&mp->m_rt_intents, 0);
+#endif
/*
* We don't create the finobt per-ag space reservation until after log
* recovery, so we must set this to true so that an ifree transaction
diff --git a/fs/xfs/xfs_swapext_item.c b/fs/xfs/xfs_swapext_item.c
index 95041fe69ba7..93d1f27cf3b9 100644
--- a/fs/xfs/xfs_swapext_item.c
+++ b/fs/xfs/xfs_swapext_item.c
@@ -346,6 +346,7 @@ xfs_swapext_abort_intent(
/* Cancel a deferred swapext update. */
STATIC void
xfs_swapext_cancel_item(
+ struct xfs_mount *mp,
struct list_head *item)
{
struct xfs_swapext_intent *sxi;
@@ -354,6 +355,14 @@ xfs_swapext_cancel_item(
kmem_free(sxi);
}
+/* Add a deferred swapext update. */
+STATIC void
+xfs_swapext_add_item(
+ struct xfs_mount *mp,
+ const struct list_head *item)
+{
+}
+
const struct xfs_defer_op_type xfs_swapext_defer_type = {
.max_items = XFS_SXI_MAX_FAST_EXTENTS,
.create_intent = xfs_swapext_create_intent,
@@ -361,6 +370,7 @@ const struct xfs_defer_op_type xfs_swapext_defer_type = {
.create_done = xfs_swapext_create_done,
.finish_item = xfs_swapext_finish_item,
.cancel_item = xfs_swapext_cancel_item,
+ .add_item = xfs_swapext_add_item,
};
/* Is this recovered SXI ok? */
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 7879e11115b8..3e145cc5762d 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -2748,6 +2748,44 @@ DEFINE_EVENT(xfs_free_extent_deferred_class, name, \
DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_extent_free_defer);
DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_extent_free_deferred);
+DECLARE_EVENT_CLASS(xfs_defer_pending_item_class,
+ TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp,
+ void *item),
+ TP_ARGS(mp, dfp, item),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(int, type)
+ __field(void *, intent)
+ __field(void *, item)
+ __field(char, committed)
+ __field(int, nr)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp ? mp->m_super->s_dev : 0;
+ __entry->type = dfp->dfp_type;
+ __entry->intent = dfp->dfp_intent;
+ __entry->item = item;
+ __entry->committed = dfp->dfp_done != NULL;
+ __entry->nr = dfp->dfp_count;
+ ),
+ TP_printk("dev %d:%d optype %d intent %p item %p committed %d nr %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->type,
+ __entry->intent,
+ __entry->item,
+ __entry->committed,
+ __entry->nr)
+)
+#define DEFINE_DEFER_PENDING_ITEM_EVENT(name) \
+DEFINE_EVENT(xfs_defer_pending_item_class, name, \
+ TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp, \
+ void *item), \
+ TP_ARGS(mp, dfp, item))
+
+DEFINE_DEFER_PENDING_ITEM_EVENT(xfs_defer_add_item);
+DEFINE_DEFER_PENDING_ITEM_EVENT(xfs_defer_cancel_item);
+DEFINE_DEFER_PENDING_ITEM_EVENT(xfs_defer_finish_item);
+
/* rmap tracepoints */
DECLARE_EVENT_CLASS(xfs_rmap_class,
TP_PROTO(struct xfs_btree_cur *cur,
@@ -4893,6 +4931,74 @@ DEFINE_IMETA_RESV_EVENT(xfs_imeta_resv_free_extent);
DEFINE_IMETA_RESV_EVENT(xfs_imeta_resv_critical);
DEFINE_INODE_ERROR_EVENT(xfs_imeta_resv_init_error);
+DECLARE_EVENT_CLASS(xfs_perag_intents_class,
+ TP_PROTO(struct xfs_perag *pag, void *caller_ip),
+ TP_ARGS(pag, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(long, nr_intents)
+ __field(void *, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = pag->pag_mount->m_super->s_dev;
+ __entry->agno = pag->pag_agno;
+#if IS_ENABLED(CONFIG_XFS_ONLINE_SCRUB)
+ __entry->nr_intents = atomic_read(&pag->pag_intents);
+#else
+ __entry->nr_intents = -1;
+#endif
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d agno 0x%x intents %ld caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->nr_intents,
+ __entry->caller_ip)
+);
+
+#define DEFINE_PERAG_INTENTS_EVENT(name) \
+DEFINE_EVENT(xfs_perag_intents_class, name, \
+ TP_PROTO(struct xfs_perag *pag, void *caller_ip), \
+ TP_ARGS(pag, caller_ip))
+DEFINE_PERAG_INTENTS_EVENT(xfs_perag_bump_intents);
+DEFINE_PERAG_INTENTS_EVENT(xfs_perag_drop_intents);
+DEFINE_PERAG_INTENTS_EVENT(xfs_perag_wait_intents);
+
+DECLARE_EVENT_CLASS(xfs_rt_intents_class,
+ TP_PROTO(struct xfs_mount *mp, void *caller_ip),
+ TP_ARGS(mp, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(dev_t, rtdev)
+ __field(long, nr_intents)
+ __field(void *, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->rtdev = mp->m_rtdev_targp->bt_dev;
+#if IS_ENABLED(CONFIG_XFS_ONLINE_SCRUB) && IS_ENABLED(CONFIG_XFS_RT)
+ __entry->nr_intents = atomic_read(&mp->m_rt_intents);
+#else
+ __entry->nr_intents = -1;
+#endif
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d rtdev %d:%d intents %ld caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ MAJOR(__entry->rtdev), MINOR(__entry->rtdev),
+ __entry->nr_intents,
+ __entry->caller_ip)
+);
+
+#define DEFINE_RT_INTENTS_EVENT(name) \
+DEFINE_EVENT(xfs_rt_intents_class, name, \
+ TP_PROTO(struct xfs_mount *mp, void *caller_ip), \
+ TP_ARGS(mp, caller_ip))
+DEFINE_RT_INTENTS_EVENT(xfs_rt_bump_intents);
+DEFINE_RT_INTENTS_EVENT(xfs_rt_drop_intents);
+DEFINE_RT_INTENTS_EVENT(xfs_rt_wait_intents);
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH