summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-02-22 18:28:03 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2023-02-22 18:28:03 -0800
commita5c95ca18a98d742d0a4a04063c32556b5b66378 (patch)
treefdd897b23a1c45b3d03bd1e75e5df42057f339d1 /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
parent307e14c039063f0c9bd7a18a7add8f940580dcc9 (diff)
parenta48bba98380cb0b43dcd01d276c7efc282e3c33f (diff)
Merge tag 'drm-next-2023-02-23' of git://anongit.freedesktop.org/drm/drm
Pull drm updates from Dave Airlie: "There are a bunch of changes all over in the usual places. Highlights: - habanalabs moves from misc to accel - first accel driver for Intel VPU (Versatile Processing Unit) inference engine - dropped all the ancient legacy DRI1 drivers. I think it's been at least 10 years since anyone has heard about these. - Intel DG2 updates and prelim Meteorlake enablement - etnaviv adds support for Versilicon NPU device (a GPU like engine with inference accelerators) Detailed summary: Removals: - remove legacy dri1 drivers: i810, mga, r128, savage, sis, tdfx, via New driver: - intel VPU accelerator driver - habanalabs comes via drm tree now drm/core: - use drm_dbg_ helpers in several places - Document defaults for CRTC backgrounds - Document use of drm_minor edid: - improve mode parsing and refactoring connector: - support analog TV mode property media: - add some common formats udmabuf: - add vmap/vunmap methods fourcc: - add XRGB1555 and RGB565 formats - document open source user waiver firmware: - fix color-format selection for system framebuffer format-helper: - Add conversion from XRGB8888 to various sysfb formats - Make XRGB8888 the only driver-emulated legacy format - Add conversion from XRGB8888 to XBGR8888 and ABGR8888 fb-helper: - fix preferred depth and bpp values across drivers - Avoid blank consoles from selecting an incorrect color format probe-helper: - Enable/disable HPD on connectors scheduler: - Fix lockup in drm_sched_entity_kill() - Deprecate drm_sched_resubmit_jobs() bridge: - remove unused functions - implement i2c probe_new in various drivers - ite-it6505: Locking fixes, Cache EDID data - ite-it66121: Support IT6610 chip - lontium-tl9611: Fix HDMI on DragonBoard 845c - parade-ps8640: Use atomic bridge functions - Support i.MX93 LDB plus DT bindings debugfs: - add per device helpers and convert drivers displayport: - mst fixes - add DP adaptive sync DPCD definitions fbdev: - always pick 32bpp as default - remove some unused code simpledrm: - support system memory framebuffers panel: - add orientation quirks for Lenovo Yoga Tab 3 X90F and DynaBook K50 - Use ktime_get_boottime() to measure power-down delay - Fix auto-suspend delay - Visionox VTDR6130 AMOLED DSI - Support Himax HX8394 - Convert many drivers to common generic DSI write-sequence helper - AUO A030JTN01 ttm: - drop bo wait wrapper - fix MIPS build habanalabs: - moved driver to accel subsystem - gaudi2 decoder error improvement - more trace events - Gaudi2 abrupt reset by firmware support - add uAPI to flush memory transactions - add uAPI to pass through userspace reqs to fw - remove dma-buf export by handle amdgpu: - add new INFO queries for peak and min sclk/mclk for profile modes - Add PCIe info to the INFO IOCTL - secure display support for multiple displays - DML optimizations - DCN 3.2 updates - PSR updates - DP 2.1 updates - SR-IOV RAS updates - VCN RAS support - SMU 13.x updates - Switch 1 element arrays to flexible arrays - Add RAS support for DF 4.3 - Stack size improvements - S0ix rework - Allow 0 as a vram limit on APUs - Handle profiling modes for SMU13.x - Fix possible segfault in failure case - Rework FW requests to happen in early_init for all IPs so that we don't lose the sbios console if FW is missing - Fix power reporting on certain firmwares for CZN/RN - Allow S0ix without BIOS support - Enable freesync over PCon - Re-enable the AGP aperture on GMC 11.x amdkfd: - Error handling fixes - PASID fixes - Fix for cleared VRAM BOs - Fix cleanup if GPUVM creation fails - Memory accounting fix - Use resource_size rather than open codeing it - GC11 mGPU fix radeon: - Switch 1 element arrays to flexible arrays - Fix memory leak on shutdown - move to new logging i915: - Meteorlake display/OA/GSC fw/workarounds enabling - DP MST DSC support - Gamma/degamma readout support for the state checker - Enable SDP split support for DP 2.0 - Add probe blocking support to i915.force_probe parameter - Enable Xe HP 4tile support - Avoid display direct calls to uncore - Fix HuC delayed load memory leaks - Add DG2 workarounds Wa_18018764978 and Wa_18019271663 - Improve suspend / resume times with VT-d scanout workaround active - Fix DG2 visual corruption on small BAR systems by not forgetting to copy CCS aux state - Fix TLB invalidation for Gen12.50 video and compute engines - Enable HF-EEODB by switching HDMI, DP and LVDS to use struct drm_edid - Start using unversioned DMC firmware paths for new platforms - ELD refactor: Stop using hardware buffer, precompute ELD - lots of display code refactoring nouveau: - drop legacy ioctl support - replace 0-sized array msm: - dpu/dsi/mdss: Support for SM8350, SM8450 SM8550 and SC8280XP platform - Added bindings for SM8150 - dpu: Partial support for DSC on SM8150 and SM8250 - dpu: Fixed color transformation matrix being lost on suspend/resume - dp: Support SDM845 and SC8280XP platforms - dp: Support for limiting DP link rate via DT property - dsi: Validate display modes according to the DSI OPP table - dsi: DSI PHY support for the SM6375 platform - Add MSM_SUBMIT_BO_NO_IMPLICI - a2xx: Support to load legacy firmware - a6xx: GPU devcore dump updates for a650/a660 - GPU devfreq tuning and fixes - Turn 8960 HDMI PHY into clock provider, - Make 8960 HDMI PHY use PXO clock from DT etnaviv: - experimental versilicon NPU support - report GPU load via fdinfo format - MMU fault message improvements tegra: - rework syncpoint interrupt mediatek: - DSI timing fix - fix config deps ast: - various fixes exynos: - restore bridge chain order fixes gud: - convert to shadow plane buffers - perform flushing synchronously during atomic update - Use new debugfs helpers arm/hdlcd: - Use new debugfs helper ili9486: - Support 16-bit pixel data imx: - Split off IPUv3 driver mipi-dbi: - convert to DRM shadow-plane helpers - rsp driver changes - Support separate I/O-voltage supply mxsfb: - Depend on ARCH_MXS or ARCH_MXC sun4i: - convert to new TV mode property vc4: - convert to new TV mode property - kunit tests - Support RGB565 and RGB666 formats - convert dsi driver to bridge - Various HVS an CRTC fixes v3d: - Do not opencode drm_gem_object_lookup() virtio: - improve tracing vkms: - support small cursors in IGT tests - Fix SEGFAULT from incorrect GEM-buffer mapping rcar-du: - fixes and improvements" * tag 'drm-next-2023-02-23' of git://anongit.freedesktop.org/drm/drm: (1455 commits) msm/fbdev: fix unused variable warning with clang. drm/fb-helper: Remove drm_fb_helper_unprepare() from drm_fb_helper_fini() dma-buf: make kobj_type structure constant drm/shmem-helper: Fix locking for drm_gem_shmem_get_pages_sgt() drm/amd/display: disable SubVP + DRR to prevent underflow drm/amd/display: Fail atomic_check early on normalize_zpos error drm/amd/pm: avoid unaligned access warnings drm/amd/display: avoid unaligned access warnings drm/amd/display: Remove duplicate/repeating expressions drm/amd/display: Remove duplicate/repeating expression drm/amd/display: Make variables declaration inside ifdef guard drm/amd/display: Fix excess arguments on kernel-doc drm/amd/display: Add previously missing includes drm/amd/amdgpu: Add function prototypes to headers drm/amd/display: Add function prototypes to headers drm/amd/display: Turn global functions into static drm/amd/display: remove unused _calculate_degamma_curve function drm/amd/display: remove unused func declaration from resource headers drm/amd/display: unset initial value for tf since it's never used drm/amd/display: camel case cleanup in color_gamma file ...
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c215
1 files changed, 153 insertions, 62 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index ad490c1e2f57..6e543558386d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -706,13 +706,23 @@ static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
return 0;
}
+static int amdgpu_ras_check_feature_allowed(struct amdgpu_device *adev,
+ struct ras_common_if *head)
+{
+ if (amdgpu_ras_is_feature_allowed(adev, head) ||
+ amdgpu_ras_is_poison_mode_supported(adev))
+ return 1;
+ else
+ return 0;
+}
+
/* wrapper of psp_ras_enable_features */
int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
struct ras_common_if *head, bool enable)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
union ta_ras_cmd_input *info;
- int ret;
+ int ret = 0;
if (!con)
return -EINVAL;
@@ -736,7 +746,8 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
}
/* Do not enable if it is not allowed. */
- WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
+ if (enable && !amdgpu_ras_check_feature_allowed(adev, head))
+ goto out;
/* Only enable ras feature operation handle on host side */
if (head->block == AMDGPU_RAS_BLOCK__GFX &&
@@ -754,7 +765,6 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
/* setup the obj */
__amdgpu_ras_feature_enable(adev, head, enable);
- ret = 0;
out:
if (head->block == AMDGPU_RAS_BLOCK__GFX)
kfree(info);
@@ -910,9 +920,6 @@ static struct amdgpu_ras_block_object *amdgpu_ras_get_ras_block(struct amdgpu_de
if (block >= AMDGPU_RAS_BLOCK__LAST)
return NULL;
- if (!amdgpu_ras_is_supported(adev, block))
- return NULL;
-
list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
if (!node->ras_obj) {
dev_warn(adev->dev, "Warning: abnormal ras list node.\n");
@@ -1087,6 +1094,10 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
info->head.block,
info->head.sub_block_index);
+ /* inject on guest isn't allowed, return success directly */
+ if (amdgpu_sriov_vf(adev))
+ return 0;
+
if (!obj)
return -EINVAL;
@@ -1122,11 +1133,54 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
}
/**
- * amdgpu_ras_query_error_count -- Get error counts of all IPs
+ * amdgpu_ras_query_error_count_helper -- Get error counter for specific IP
+ * @adev: pointer to AMD GPU device
+ * @ce_count: pointer to an integer to be set to the count of correctible errors.
+ * @ue_count: pointer to an integer to be set to the count of uncorrectible errors.
+ * @query_info: pointer to ras_query_if
+ *
+ * Return 0 for query success or do nothing, otherwise return an error
+ * on failures
+ */
+static int amdgpu_ras_query_error_count_helper(struct amdgpu_device *adev,
+ unsigned long *ce_count,
+ unsigned long *ue_count,
+ struct ras_query_if *query_info)
+{
+ int ret;
+
+ if (!query_info)
+ /* do nothing if query_info is not specified */
+ return 0;
+
+ ret = amdgpu_ras_query_error_status(adev, query_info);
+ if (ret)
+ return ret;
+
+ *ce_count += query_info->ce_count;
+ *ue_count += query_info->ue_count;
+
+ /* some hardware/IP supports read to clear
+ * no need to explictly reset the err status after the query call */
+ if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
+ adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
+ if (amdgpu_ras_reset_error_status(adev, query_info->head.block))
+ dev_warn(adev->dev,
+ "Failed to reset error counter and error status\n");
+ }
+
+ return 0;
+}
+
+/**
+ * amdgpu_ras_query_error_count -- Get error counts of all IPs or specific IP
* @adev: pointer to AMD GPU device
* @ce_count: pointer to an integer to be set to the count of correctible errors.
* @ue_count: pointer to an integer to be set to the count of uncorrectible
* errors.
+ * @query_info: pointer to ras_query_if if the query request is only for
+ * specific ip block; if info is NULL, then the qurey request is for
+ * all the ip blocks that support query ras error counters/status
*
* If set, @ce_count or @ue_count, count and return the corresponding
* error counts in those integer pointers. Return 0 if the device
@@ -1134,11 +1188,13 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
*/
int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
unsigned long *ce_count,
- unsigned long *ue_count)
+ unsigned long *ue_count,
+ struct ras_query_if *query_info)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_manager *obj;
unsigned long ce, ue;
+ int ret;
if (!adev->ras_enabled || !con)
return -EOPNOTSUPP;
@@ -1150,26 +1206,23 @@ int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
ce = 0;
ue = 0;
- list_for_each_entry(obj, &con->head, node) {
- struct ras_query_if info = {
- .head = obj->head,
- };
- int res;
-
- res = amdgpu_ras_query_error_status(adev, &info);
- if (res)
- return res;
+ if (!query_info) {
+ /* query all the ip blocks that support ras query interface */
+ list_for_each_entry(obj, &con->head, node) {
+ struct ras_query_if info = {
+ .head = obj->head,
+ };
- if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
- adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
- if (amdgpu_ras_reset_error_status(adev, info.head.block))
- dev_warn(adev->dev, "Failed to reset error counter and error status");
+ ret = amdgpu_ras_query_error_count_helper(adev, &ce, &ue, &info);
}
-
- ce += info.ce_count;
- ue += info.ue_count;
+ } else {
+ /* query specific ip block */
+ ret = amdgpu_ras_query_error_count_helper(adev, &ce, &ue, query_info);
}
+ if (ret)
+ return ret;
+
if (ce_count)
*ce_count = ce;
@@ -1564,14 +1617,14 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
struct amdgpu_ras_block_object *block_obj =
amdgpu_ras_get_ras_block(adev, obj->head.block, 0);
- if (!block_obj || !block_obj->hw_ops)
+ if (!block_obj)
return;
/* both query_poison_status and handle_poison_consumption are optional,
* but at least one of them should be implemented if we need poison
* consumption handler
*/
- if (block_obj->hw_ops->query_poison_status) {
+ if (block_obj->hw_ops && block_obj->hw_ops->query_poison_status) {
poison_stat = block_obj->hw_ops->query_poison_status(adev);
if (!poison_stat) {
/* Not poison consumption interrupt, no need to handle it */
@@ -1585,7 +1638,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
if (!adev->gmc.xgmi.connected_to_cpu)
amdgpu_umc_poison_handler(adev, false);
- if (block_obj->hw_ops->handle_poison_consumption)
+ if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption)
poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
/* gpu reset is fallback for failed and default cases */
@@ -1593,6 +1646,8 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
dev_info(adev->dev, "GPU reset for %s RAS poison consumption is issued!\n",
block_obj->ras_comm.name);
amdgpu_ras_reset_gpu(adev);
+ } else {
+ amdgpu_gfx_poison_consumption_handler(adev, entry);
}
}
@@ -2344,22 +2399,24 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
dev_info(adev->dev, "SRAM ECC is active.\n");
- if (!amdgpu_sriov_vf(adev)) {
+ if (!amdgpu_sriov_vf(adev))
adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
1 << AMDGPU_RAS_BLOCK__DF);
-
- if (adev->ip_versions[VCN_HWIP][0] == IP_VERSION(2, 6, 0) ||
- adev->ip_versions[VCN_HWIP][0] == IP_VERSION(4, 0, 0))
- adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN |
- 1 << AMDGPU_RAS_BLOCK__JPEG);
- else
- adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN |
- 1 << AMDGPU_RAS_BLOCK__JPEG);
- } else {
+ else
adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__PCIE_BIF |
1 << AMDGPU_RAS_BLOCK__SDMA |
1 << AMDGPU_RAS_BLOCK__GFX);
- }
+
+ /* VCN/JPEG RAS can be supported on both bare metal and
+ * SRIOV environment
+ */
+ if (adev->ip_versions[VCN_HWIP][0] == IP_VERSION(2, 6, 0) ||
+ adev->ip_versions[VCN_HWIP][0] == IP_VERSION(4, 0, 0))
+ adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN |
+ 1 << AMDGPU_RAS_BLOCK__JPEG);
+ else
+ adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN |
+ 1 << AMDGPU_RAS_BLOCK__JPEG);
} else {
dev_info(adev->dev, "SRAM ECC is not presented.\n");
}
@@ -2395,7 +2452,7 @@ static void amdgpu_ras_counte_dw(struct work_struct *work)
/* Cache new values.
*/
- if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count) == 0) {
+ if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, NULL) == 0) {
atomic_set(&con->ras_ce_count, ce_count);
atomic_set(&con->ras_ue_count, ue_count);
}
@@ -2405,11 +2462,42 @@ Out:
pm_runtime_put_autosuspend(dev->dev);
}
+static void amdgpu_ras_query_poison_mode(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ bool df_poison, umc_poison;
+
+ /* poison setting is useless on SRIOV guest */
+ if (amdgpu_sriov_vf(adev) || !con)
+ return;
+
+ /* Init poison supported flag, the default value is false */
+ if (adev->gmc.xgmi.connected_to_cpu) {
+ /* enabled by default when GPU is connected to CPU */
+ con->poison_supported = true;
+ } else if (adev->df.funcs &&
+ adev->df.funcs->query_ras_poison_mode &&
+ adev->umc.ras &&
+ adev->umc.ras->query_ras_poison_mode) {
+ df_poison =
+ adev->df.funcs->query_ras_poison_mode(adev);
+ umc_poison =
+ adev->umc.ras->query_ras_poison_mode(adev);
+
+ /* Only poison is set in both DF and UMC, we can support it */
+ if (df_poison && umc_poison)
+ con->poison_supported = true;
+ else if (df_poison != umc_poison)
+ dev_warn(adev->dev,
+ "Poison setting is inconsistent in DF/UMC(%d:%d)!\n",
+ df_poison, umc_poison);
+ }
+}
+
int amdgpu_ras_init(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
int r;
- bool df_poison, umc_poison;
if (con)
return 0;
@@ -2484,26 +2572,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
goto release_con;
}
- /* Init poison supported flag, the default value is false */
- if (adev->gmc.xgmi.connected_to_cpu) {
- /* enabled by default when GPU is connected to CPU */
- con->poison_supported = true;
- }
- else if (adev->df.funcs &&
- adev->df.funcs->query_ras_poison_mode &&
- adev->umc.ras &&
- adev->umc.ras->query_ras_poison_mode) {
- df_poison =
- adev->df.funcs->query_ras_poison_mode(adev);
- umc_poison =
- adev->umc.ras->query_ras_poison_mode(adev);
- /* Only poison is set in both DF and UMC, we can support it */
- if (df_poison && umc_poison)
- con->poison_supported = true;
- else if (df_poison != umc_poison)
- dev_warn(adev->dev, "Poison setting is inconsistent in DF/UMC(%d:%d)!\n",
- df_poison, umc_poison);
- }
+ amdgpu_ras_query_poison_mode(adev);
if (amdgpu_ras_fs_init(adev)) {
r = -EINVAL;
@@ -2564,6 +2633,7 @@ int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
{
struct amdgpu_ras_block_object *ras_obj = NULL;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ struct ras_query_if *query_info;
unsigned long ue_count, ce_count;
int r;
@@ -2605,11 +2675,17 @@ int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
/* Those are the cached values at init.
*/
- if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count) == 0) {
+ query_info = kzalloc(sizeof(struct ras_query_if), GFP_KERNEL);
+ if (!query_info)
+ return -ENOMEM;
+ memcpy(&query_info->head, ras_block, sizeof(struct ras_common_if));
+
+ if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, query_info) == 0) {
atomic_set(&con->ras_ce_count, ce_count);
atomic_set(&con->ras_ue_count, ue_count);
}
+ kfree(query_info);
return 0;
interrupt:
@@ -2946,11 +3022,26 @@ int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_co
int amdgpu_ras_is_supported(struct amdgpu_device *adev,
unsigned int block)
{
+ int ret = 0;
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
if (block >= AMDGPU_RAS_BLOCK_COUNT)
return 0;
- return ras && (adev->ras_enabled & (1 << block));
+
+ ret = ras && (adev->ras_enabled & (1 << block));
+
+ /* For the special asic with mem ecc enabled but sram ecc
+ * not enabled, even if the ras block is not supported on
+ * .ras_enabled, if the asic supports poison mode and the
+ * ras block has ras configuration, it can be considered
+ * that the ras block supports ras function.
+ */
+ if (!ret &&
+ amdgpu_ras_is_poison_mode_supported(adev) &&
+ amdgpu_ras_get_ras_block(adev, block, 0))
+ ret = 1;
+
+ return ret;
}
int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)