summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
diff options
context:
space:
mode:
authorThomas Zimmermann <tzimmermann@suse.de>2022-11-21 09:03:13 +0100
committerThomas Zimmermann <tzimmermann@suse.de>2022-11-21 09:03:13 +0100
commit29583dfcd2dd72c766422bd05c16f06c6b1fb356 (patch)
treea35bc4aa5e84ce6ae0df1b43ca431f6cd8f38997 /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
parent39dd0cc2e5bd0d5188dd69f27e18783cea7ff06a (diff)
parent4e291f2f585313efa5200cce655e17c94906e50a (diff)
Merge drm/drm-next into drm-misc-next-fixes
Backmerging to update drm-misc-next-fixes for the final phase of the release cycle. Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c67
1 files changed, 29 insertions, 38 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index ab9ba5a9c33d..693bce07eb46 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1561,7 +1561,6 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
{
bool poison_stat = false;
struct amdgpu_device *adev = obj->adev;
- struct ras_err_data err_data = {0, 0, 0, NULL};
struct amdgpu_ras_block_object *block_obj =
amdgpu_ras_get_ras_block(adev, obj->head.block, 0);
@@ -1584,7 +1583,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
}
if (!adev->gmc.xgmi.connected_to_cpu)
- amdgpu_umc_poison_handler(adev, &err_data, false);
+ amdgpu_umc_poison_handler(adev, false);
if (block_obj->hw_ops->handle_poison_consumption)
poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
@@ -1811,7 +1810,8 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
amdgpu_ras_query_error_status(adev, &info);
if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
- adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
+ adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4) &&
+ adev->ip_versions[MP0_HWIP][0] != IP_VERSION(13, 0, 0)) {
if (amdgpu_ras_reset_error_status(adev, info.head.block))
dev_warn(adev->dev, "Failed to reset error counter and error status");
}
@@ -1949,7 +1949,6 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
reset_context.method = AMD_RESET_METHOD_NONE;
reset_context.reset_req_dev = adev;
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
- clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
}
@@ -2267,6 +2266,25 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev)
{
+ if (amdgpu_sriov_vf(adev)) {
+ switch (adev->ip_versions[MP0_HWIP][0]) {
+ case IP_VERSION(13, 0, 2):
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ if (adev->asic_type == CHIP_IP_DISCOVERY) {
+ switch (adev->ip_versions[MP0_HWIP][0]) {
+ case IP_VERSION(13, 0, 0):
+ case IP_VERSION(13, 0, 10):
+ return true;
+ default:
+ return false;
+ }
+ }
+
return adev->asic_type == CHIP_VEGA10 ||
adev->asic_type == CHIP_VEGA20 ||
adev->asic_type == CHIP_ARCTURUS ||
@@ -2310,11 +2328,6 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
!amdgpu_ras_asic_supported(adev))
return;
- /* If driver run on sriov guest side, only enable ras for aldebaran */
- if (amdgpu_sriov_vf(adev) &&
- adev->ip_versions[MP1_HWIP][0] != IP_VERSION(13, 0, 2))
- return;
-
if (!adev->gmc.xgmi.connected_to_cpu) {
if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
dev_info(adev->dev, "MEM ECC is active.\n");
@@ -2719,7 +2732,8 @@ int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
/* Need disable ras on all IPs here before ip [hw/sw]fini */
- amdgpu_ras_disable_all_features(adev, 0);
+ if (con->features)
+ amdgpu_ras_disable_all_features(adev, 0);
amdgpu_ras_recovery_fini(adev);
return 0;
}
@@ -2832,11 +2846,7 @@ static int amdgpu_bad_page_notifier(struct notifier_block *nb,
struct mce *m = (struct mce *)data;
struct amdgpu_device *adev = NULL;
uint32_t gpu_id = 0;
- uint32_t umc_inst = 0;
- uint32_t ch_inst, channel_index = 0;
- struct ras_err_data err_data = {0, 0, 0, NULL};
- struct eeprom_table_record err_rec;
- uint64_t retired_page;
+ uint32_t umc_inst = 0, ch_inst = 0;
/*
* If the error was generated in UMC_V2, which belongs to GPU UMCs,
@@ -2875,29 +2885,10 @@ static int amdgpu_bad_page_notifier(struct notifier_block *nb,
dev_info(adev->dev, "Uncorrectable error detected in UMC inst: %d, chan_idx: %d",
umc_inst, ch_inst);
- /*
- * Translate UMC channel address to Physical address
- */
- channel_index =
- adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num
- + ch_inst];
-
- retired_page = ADDR_OF_8KB_BLOCK(m->addr) |
- ADDR_OF_256B_BLOCK(channel_index) |
- OFFSET_IN_256B_BLOCK(m->addr);
-
- memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
- err_data.err_addr = &err_rec;
- amdgpu_umc_fill_error_record(&err_data, m->addr,
- retired_page, channel_index, umc_inst);
-
- if (amdgpu_bad_page_threshold != 0) {
- amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
- err_data.err_addr_cnt);
- amdgpu_ras_save_bad_pages(adev);
- }
-
- return NOTIFY_OK;
+ if (!amdgpu_umc_page_retirement_mca(adev, m->addr, ch_inst, umc_inst))
+ return NOTIFY_OK;
+ else
+ return NOTIFY_DONE;
}
static struct notifier_block amdgpu_bad_page_nb = {