diff options
author | Connor Abbott <cwabbott0@gmail.com> | 2025-05-20 15:08:59 -0400 |
---|---|---|
committer | Rob Clark <robin.clark@oss.qualcomm.com> | 2025-06-09 11:37:34 -0700 |
commit | b13044092c1e30453d2f7e9be596d3a2616582a0 (patch) | |
tree | 6c8604644f2b5e8d27e731c1d06778c10895688e | |
parent | dedf404be8cf97e6fabbed7ad97000ab816897eb (diff) |
drm/msm: Temporarily disable stall-on-fault after a page fault
When things go wrong, the GPU is capable of quickly generating millions
of faulting translation requests per second. When that happens, in the
stall-on-fault model each access will stall until it wins the race to
signal the fault and then the RESUME register is written. This slows
processing page faults to a crawl as the GPU can generate faults much
faster than the CPU can acknowledge them. It also means that all
available resources in the SMMU are saturated waiting for the stalled
transactions, so that other transactions such as transactions generated
by the GMU, which shares translation resources with the GPU, cannot
proceed. This causes a GMU watchdog timeout, which leads to a failed
reset because GX cannot collapse when there is a transaction pending and
a permanently hung GPU.
On older platforms with qcom,smmu-v2, it seems that when one transaction
is stalled subsequent faulting transactions are terminated, which avoids
this problem, but the MMU-500 follows the spec here.
To work around these problems, disable stall-on-fault as soon as we get a
page fault until a cooldown period after pagefaults stop. This allows
the GMU some guaranteed time to continue working. We only use
stall-on-fault to halt the GPU while we collect a devcoredump and we
always terminate the transaction afterward, so it's fine to miss some
subsequent page faults. We also keep it disabled so long as the current
devcoredump hasn't been deleted, because in that case we likely won't
capture another one if there's a fault.
After this commit HFI messages still occasionally time out, because the
crashdump handler doesn't run fast enough to let the GMU resume, but the
driver seems to recover from it. This will probably go away after the
HFI timeout is increased.
Signed-off-by: Connor Abbott <cwabbott0@gmail.com>
Reviewed-by: Rob Clark <robdclark@gmail.com>
Patchwork: https://patchwork.freedesktop.org/patch/654891/
Signed-off-by: Rob Clark <robin.clark@oss.qualcomm.com>
-rw-r--r-- | drivers/gpu/drm/msm/adreno/a5xx_gpu.c | 2 | ||||
-rw-r--r-- | drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 4 | ||||
-rw-r--r-- | drivers/gpu/drm/msm/adreno/adreno_gpu.c | 40 | ||||
-rw-r--r-- | drivers/gpu/drm/msm/adreno/adreno_gpu.h | 2 | ||||
-rw-r--r-- | drivers/gpu/drm/msm/msm_debugfs.c | 32 | ||||
-rw-r--r-- | drivers/gpu/drm/msm/msm_drv.c | 4 | ||||
-rw-r--r-- | drivers/gpu/drm/msm/msm_drv.h | 23 | ||||
-rw-r--r-- | drivers/gpu/drm/msm/msm_iommu.c | 9 | ||||
-rw-r--r-- | drivers/gpu/drm/msm/msm_mmu.h | 1 |
9 files changed, 116 insertions, 1 deletions
diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c index 650e5bac225f..60aef0796236 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c @@ -131,6 +131,8 @@ static void a5xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit) struct msm_ringbuffer *ring = submit->ring; unsigned int i, ibs = 0; + adreno_check_and_reenable_stall(adreno_gpu); + if (IS_ENABLED(CONFIG_DRM_MSM_GPU_SUDO) && submit->in_rb) { ring->cur_ctx_seqno = 0; a5xx_submit_in_rb(gpu, submit); diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index bf3758f010f4..a7ffd92de3bb 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -212,6 +212,8 @@ static void a6xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit) struct msm_ringbuffer *ring = submit->ring; unsigned int i, ibs = 0; + adreno_check_and_reenable_stall(adreno_gpu); + a6xx_set_pagetable(a6xx_gpu, ring, submit); get_stats_counter(ring, REG_A6XX_RBBM_PERFCTR_CP(0), @@ -335,6 +337,8 @@ static void a7xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit) struct msm_ringbuffer *ring = submit->ring; unsigned int i, ibs = 0; + adreno_check_and_reenable_stall(adreno_gpu); + /* * Toggle concurrent binning for pagetable switch and set the thread to * BR since only it can execute the pagetable switch packets. diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c index 5ebd5a6ea143..86bff915c3e7 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c @@ -259,16 +259,54 @@ u64 adreno_private_address_space_size(struct msm_gpu *gpu) return BIT(ttbr1_cfg->ias) - ADRENO_VM_START; } +void adreno_check_and_reenable_stall(struct adreno_gpu *adreno_gpu) +{ + struct msm_gpu *gpu = &adreno_gpu->base; + struct msm_drm_private *priv = gpu->dev->dev_private; + unsigned long flags; + + /* + * Wait until the cooldown period has passed and we would actually + * collect a crashdump to re-enable stall-on-fault. + */ + spin_lock_irqsave(&priv->fault_stall_lock, flags); + if (!priv->stall_enabled && + ktime_after(ktime_get(), priv->stall_reenable_time) && + !READ_ONCE(gpu->crashstate)) { + priv->stall_enabled = true; + + gpu->aspace->mmu->funcs->set_stall(gpu->aspace->mmu, true); + } + spin_unlock_irqrestore(&priv->fault_stall_lock, flags); +} + #define ARM_SMMU_FSR_TF BIT(1) #define ARM_SMMU_FSR_PF BIT(3) #define ARM_SMMU_FSR_EF BIT(4) +#define ARM_SMMU_FSR_SS BIT(30) int adreno_fault_handler(struct msm_gpu *gpu, unsigned long iova, int flags, struct adreno_smmu_fault_info *info, const char *block, u32 scratch[4]) { + struct msm_drm_private *priv = gpu->dev->dev_private; const char *type = "UNKNOWN"; - bool do_devcoredump = info && !READ_ONCE(gpu->crashstate); + bool do_devcoredump = info && (info->fsr & ARM_SMMU_FSR_SS) && + !READ_ONCE(gpu->crashstate); + unsigned long irq_flags; + + /* + * In case there is a subsequent storm of pagefaults, disable + * stall-on-fault for at least half a second. + */ + spin_lock_irqsave(&priv->fault_stall_lock, irq_flags); + if (priv->stall_enabled) { + priv->stall_enabled = false; + + gpu->aspace->mmu->funcs->set_stall(gpu->aspace->mmu, false); + } + priv->stall_reenable_time = ktime_add_ms(ktime_get(), 500); + spin_unlock_irqrestore(&priv->fault_stall_lock, irq_flags); /* * Print a default message if we couldn't get the data from the diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.h b/drivers/gpu/drm/msm/adreno/adreno_gpu.h index a8f4bf416e64..bc063594a359 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.h +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.h @@ -636,6 +636,8 @@ int adreno_fault_handler(struct msm_gpu *gpu, unsigned long iova, int flags, struct adreno_smmu_fault_info *info, const char *block, u32 scratch[4]); +void adreno_check_and_reenable_stall(struct adreno_gpu *gpu); + int adreno_read_speedbin(struct device *dev, u32 *speedbin); /* diff --git a/drivers/gpu/drm/msm/msm_debugfs.c b/drivers/gpu/drm/msm/msm_debugfs.c index 7ab607252d18..6af72162cda4 100644 --- a/drivers/gpu/drm/msm/msm_debugfs.c +++ b/drivers/gpu/drm/msm/msm_debugfs.c @@ -208,6 +208,35 @@ DEFINE_DEBUGFS_ATTRIBUTE(shrink_fops, shrink_get, shrink_set, "0x%08llx\n"); +/* + * Return the number of microseconds to wait until stall-on-fault is + * re-enabled. If 0 then it is already enabled or will be re-enabled on the + * next submit (unless there's a leftover devcoredump). This is useful for + * kernel tests that intentionally produce a fault and check the devcoredump to + * wait until the cooldown period is over. + */ + +static int +stall_reenable_time_get(void *data, u64 *val) +{ + struct msm_drm_private *priv = data; + unsigned long irq_flags; + + spin_lock_irqsave(&priv->fault_stall_lock, irq_flags); + + if (priv->stall_enabled) + *val = 0; + else + *val = max(ktime_us_delta(priv->stall_reenable_time, ktime_get()), 0); + + spin_unlock_irqrestore(&priv->fault_stall_lock, irq_flags); + + return 0; +} + +DEFINE_DEBUGFS_ATTRIBUTE(stall_reenable_time_fops, + stall_reenable_time_get, NULL, + "%lld\n"); static int msm_gem_show(struct seq_file *m, void *arg) { @@ -319,6 +348,9 @@ static void msm_debugfs_gpu_init(struct drm_minor *minor) debugfs_create_bool("disable_err_irq", 0600, minor->debugfs_root, &priv->disable_err_irq); + debugfs_create_file("stall_reenable_time_us", 0400, minor->debugfs_root, + priv, &stall_reenable_time_fops); + gpu_devfreq = debugfs_create_dir("devfreq", minor->debugfs_root); debugfs_create_bool("idle_clamp",0600, gpu_devfreq, diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index f316e6776f67..132d4cac3587 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -245,6 +245,10 @@ static int msm_drm_init(struct device *dev, const struct drm_driver *drv) drm_gem_lru_init(&priv->lru.willneed, &priv->lru.lock); drm_gem_lru_init(&priv->lru.dontneed, &priv->lru.lock); + /* Initialize stall-on-fault */ + spin_lock_init(&priv->fault_stall_lock); + priv->stall_enabled = true; + /* Teach lockdep about lock ordering wrt. shrinker: */ fs_reclaim_acquire(GFP_KERNEL); might_lock(&priv->lru.lock); diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h index a65077855201..c8afb1ea6040 100644 --- a/drivers/gpu/drm/msm/msm_drv.h +++ b/drivers/gpu/drm/msm/msm_drv.h @@ -222,6 +222,29 @@ struct msm_drm_private { * the sw hangcheck mechanism. */ bool disable_err_irq; + + /** + * @fault_stall_lock: + * + * Serialize changes to stall-on-fault state. + */ + spinlock_t fault_stall_lock; + + /** + * @fault_stall_reenable_time: + * + * If stall_enabled is false, when to reenable stall-on-fault. + * Protected by @fault_stall_lock. + */ + ktime_t stall_reenable_time; + + /** + * @stall_enabled: + * + * Whether stall-on-fault is currently enabled. Protected by + * @fault_stall_lock. + */ + bool stall_enabled; }; const struct msm_format *mdp_get_format(struct msm_kms *kms, uint32_t format, uint64_t modifier); diff --git a/drivers/gpu/drm/msm/msm_iommu.c b/drivers/gpu/drm/msm/msm_iommu.c index aae885d048d0..739ce2c283a4 100644 --- a/drivers/gpu/drm/msm/msm_iommu.c +++ b/drivers/gpu/drm/msm/msm_iommu.c @@ -372,6 +372,14 @@ static int msm_disp_fault_handler(struct iommu_domain *domain, struct device *de return -ENOSYS; } +static void msm_iommu_set_stall(struct msm_mmu *mmu, bool enable) +{ + struct adreno_smmu_priv *adreno_smmu = dev_get_drvdata(mmu->dev); + + if (adreno_smmu->set_stall) + adreno_smmu->set_stall(adreno_smmu->cookie, enable); +} + static void msm_iommu_detach(struct msm_mmu *mmu) { struct msm_iommu *iommu = to_msm_iommu(mmu); @@ -419,6 +427,7 @@ static const struct msm_mmu_funcs funcs = { .map = msm_iommu_map, .unmap = msm_iommu_unmap, .destroy = msm_iommu_destroy, + .set_stall = msm_iommu_set_stall, }; struct msm_mmu *msm_iommu_new(struct device *dev, unsigned long quirks) diff --git a/drivers/gpu/drm/msm/msm_mmu.h b/drivers/gpu/drm/msm/msm_mmu.h index c3d17aae88b0..0c694907140d 100644 --- a/drivers/gpu/drm/msm/msm_mmu.h +++ b/drivers/gpu/drm/msm/msm_mmu.h @@ -15,6 +15,7 @@ struct msm_mmu_funcs { size_t len, int prot); int (*unmap)(struct msm_mmu *mmu, uint64_t iova, size_t len); void (*destroy)(struct msm_mmu *mmu); + void (*set_stall)(struct msm_mmu *mmu, bool enable); }; enum msm_mmu_type { |