summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-09-01 11:26:46 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2021-09-01 11:26:46 -0700
commit477f70cd2a67904e04c2c2b9bd0fa2e95222f2f6 (patch)
tree1897dd1de49e1ea24897163533e2d8ead5dad0ad /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
parent835d31d319d9c8c4eb6cac074643360ba0ecab10 (diff)
parent8f0284f190e6a0aa09015090568c03f18288231a (diff)
Merge tag 'drm-next-2021-08-31-1' of git://anongit.freedesktop.org/drm/drm
Pull drm updates from Dave Airlie: "Highlights: - i915 has seen a lot of refactoring and uAPI cleanups due to a change in the upstream direction going forward This has all been audited with known userspace, but there may be some pitfalls that were missed. - i915 now uses common TTM to enable discrete memory on DG1/2 GPUs - i915 enables Jasper and Elkhart Lake by default and has preliminary XeHP/DG2 support - amdgpu adds support for Cyan Skillfish - lots of implicit fencing rules documented and fixed up in drivers - msm now uses the core scheduler - the irq midlayer has been removed for non-legacy drivers - the sysfb code now works on more than x86. Otherwise the usual smattering of stuff everywhere, panels, bridges, refactorings. Detailed summary: core: - extract i915 eDP backlight into core - DP aux bus support - drm_device.irq_enabled removed - port drivers to native irq interfaces - export gem shadow plane handling for vgem - print proper driver name in framebuffer registration - driver fixes for implicit fencing rules - ARM fixed rate compression modifier added - updated fb damage handling - rmfb ioctl logging/docs - drop drm_gem_object_put_locked - define DRM_FORMAT_MAX_PLANES - add gem fb vmap/vunmap helpers - add lockdep_assert(once) helpers - mark drm irq midlayer as legacy - use offset adjusted bo mapping conversion vgaarb: - cleanups fbdev: - extend efifb handling to all arches - div by 0 fixes for multiple drivers udmabuf: - add hugepage mapping support dma-buf: - non-dynamic exporter fixups - document implicit fencing rules amdgpu: - Initial Cyan Skillfish support - switch virtual DCE over to vkms based atomic - VCN/JPEG power down fixes - NAVI PCIE link handling fixes - AMD HDMI freesync fixes - Yellow Carp + Beige Goby fixes - Clockgating/S0ix/SMU/EEPROM fixes - embed hw fence in job - rework dma-resv handling - ensure eviction to system ram amdkfd: - uapi: SVM address range query added - sysfs leak fix - GPUVM TLB optimizations - vmfault/migration counters i915: - Enable JSL and EHL by default - preliminary XeHP/DG2 support - remove all CNL support (never shipped) - move to TTM for discrete memory support - allow mixed object mmap handling - GEM uAPI spring cleaning - add I915_MMAP_OBJECT_FIXED - reinstate ADL-P mmap ioctls - drop a bunch of unused by userspace features - disable and remove GPU relocations - revert some i915 misfeatures - major refactoring of GuC for Gen11+ - execbuffer object locking separate step - reject caching/set-domain on discrete - Enable pipe DMC loading on XE-LPD and ADL-P - add PSF GV point support - Refactor and fix DDI buffer translations - Clean up FBC CFB allocation code - Finish INTEL_GEN() and friends macro conversions nouveau: - add eDP backlight support - implicit fence fix msm: - a680/7c3 support - drm/scheduler conversion panfrost: - rework GPU reset virtio: - fix fencing for planes ast: - add detect support bochs: - move to tiny GPU driver vc4: - use hotplug irqs - HDMI codec support vmwgfx: - use internal vmware device headers ingenic: - demidlayering irq rcar-du: - shutdown fixes - convert to bridge connector helpers zynqmp-dsub: - misc fixes mgag200: - convert PLL handling to atomic mediatek: - MT8133 AAL support - gem mmap object support - MT8167 support etnaviv: - NXP Layerscape LS1028A SoC support - GEM mmap cleanups tegra: - new user API exynos: - missing unlock fix - build warning fix - use refcount_t" * tag 'drm-next-2021-08-31-1' of git://anongit.freedesktop.org/drm/drm: (1318 commits) drm/amd/display: Move AllowDRAMSelfRefreshOrDRAMClockChangeInVblank to bounding box drm/amd/display: Remove duplicate dml init drm/amd/display: Update bounding box states (v2) drm/amd/display: Update number of DCN3 clock states drm/amdgpu: disable GFX CGCG in aldebaran drm/amdgpu: Clear RAS interrupt status on aldebaran drm/amdgpu: Add support for RAS XGMI err query drm/amdkfd: Account for SH/SE count when setting up cu masks. drm/amdgpu: rename amdgpu_bo_get_preferred_pin_domain drm/amdgpu: drop redundant cancel_delayed_work_sync call drm/amdgpu: add missing cleanups for more ASICs on UVD/VCE suspend drm/amdgpu: add missing cleanups for Polaris12 UVD/VCE on suspend drm/amdkfd: map SVM range with correct access permission drm/amdkfd: check access permisson to restore retry fault drm/amdgpu: Update RAS XGMI Error Query drm/amdgpu: Add driver infrastructure for MCA RAS drm/amd/display: Add Logging for HDMI color depth information drm/amd/amdgpu: consolidate PSP TA init shared buf functions drm/amd/amdgpu: add name field back to ras_common_if drm/amdgpu: Fix build with missing pm_suspend_target_state module export ...
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c120
1 files changed, 63 insertions, 57 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index fc66aca28594..96a8fd0ca1df 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -64,15 +64,14 @@ const char *ras_block_string[] = {
};
#define ras_err_str(i) (ras_error_string[ffs(i)])
-#define ras_block_str(i) (ras_block_string[i])
#define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)
/* inject address is 52 bits */
#define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52)
-/* typical ECC bad page rate(1 bad page per 100MB VRAM) */
-#define RAS_BAD_PAGE_RATE (100 * 1024 * 1024ULL)
+/* typical ECC bad page rate is 1 bad page per 100MB VRAM */
+#define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL)
enum amdgpu_ras_retire_page_reservation {
AMDGPU_RAS_RETIRE_PAGE_RESERVED,
@@ -355,8 +354,9 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
* to see which blocks support RAS on a particular asic.
*
*/
-static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *buf,
- size_t size, loff_t *pos)
+static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
+ const char __user *buf,
+ size_t size, loff_t *pos)
{
struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
struct ras_debug_if data;
@@ -370,7 +370,7 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
if (ret)
- return -EINVAL;
+ return ret;
if (data.op == 3) {
ret = amdgpu_reserve_page_direct(adev, data.inject.address);
@@ -403,9 +403,9 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
/* umc ce/ue error injection for a bad page is not allowed */
if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) &&
amdgpu_ras_check_bad_page(adev, data.inject.address)) {
- dev_warn(adev->dev, "RAS WARN: 0x%llx has been marked "
- "as bad before error injection!\n",
- data.inject.address);
+ dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has "
+ "already been marked as bad!\n",
+ data.inject.address);
break;
}
@@ -439,21 +439,24 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
* will reset EEPROM table to 0 entries.
*
*/
-static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f, const char __user *buf,
- size_t size, loff_t *pos)
+static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f,
+ const char __user *buf,
+ size_t size, loff_t *pos)
{
struct amdgpu_device *adev =
(struct amdgpu_device *)file_inode(f)->i_private;
int ret;
ret = amdgpu_ras_eeprom_reset_table(
- &(amdgpu_ras_get_context(adev)->eeprom_control));
+ &(amdgpu_ras_get_context(adev)->eeprom_control));
- if (ret == 1) {
+ if (!ret) {
+ /* Something was written to EEPROM.
+ */
amdgpu_ras_get_context(adev)->flags = RAS_DEFAULT_FLAGS;
return size;
} else {
- return -EIO;
+ return ret;
}
}
@@ -526,7 +529,7 @@ static inline void put_obj(struct ras_manager *obj)
if (obj && (--obj->use == 0))
list_del(&obj->node);
if (obj && (obj->use < 0))
- DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", obj->head.name);
+ DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", ras_block_str(obj->head.block));
}
/* make one obj and return it. */
@@ -789,7 +792,6 @@ static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
.type = default_ras_type,
.sub_block_index = 0,
};
- strcpy(head.name, ras_block_str(i));
if (bypass) {
/*
* bypass psp. vbios enable ras for us.
@@ -1316,6 +1318,12 @@ static struct dentry *amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *
&con->bad_page_cnt_threshold);
debugfs_create_x32("ras_hw_enabled", 0444, dir, &adev->ras_hw_enabled);
debugfs_create_x32("ras_enabled", 0444, dir, &adev->ras_enabled);
+ debugfs_create_file("ras_eeprom_size", S_IRUGO, dir, adev,
+ &amdgpu_ras_debugfs_eeprom_size_ops);
+ con->de_ras_eeprom_table = debugfs_create_file("ras_eeprom_table",
+ S_IRUGO, dir, adev,
+ &amdgpu_ras_debugfs_eeprom_table_ops);
+ amdgpu_ras_debugfs_set_ret_size(&con->eeprom_control);
/*
* After one uncorrectable error happens, usually GPU recovery will
@@ -1833,13 +1841,12 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
control = &con->eeprom_control;
data = con->eh_data;
- save_count = data->count - control->num_recs;
+ save_count = data->count - control->ras_num_recs;
/* only new entries are saved */
if (save_count > 0) {
- if (amdgpu_ras_eeprom_process_recods(control,
- &data->bps[control->num_recs],
- true,
- save_count)) {
+ if (amdgpu_ras_eeprom_append(control,
+ &data->bps[control->ras_num_recs],
+ save_count)) {
dev_err(adev->dev, "Failed to save EEPROM table data!");
return -EIO;
}
@@ -1857,28 +1864,24 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
{
struct amdgpu_ras_eeprom_control *control =
- &adev->psp.ras.ras->eeprom_control;
- struct eeprom_table_record *bps = NULL;
- int ret = 0;
+ &adev->psp.ras_context.ras->eeprom_control;
+ struct eeprom_table_record *bps;
+ int ret;
/* no bad page record, skip eeprom access */
- if (!control->num_recs || (amdgpu_bad_page_threshold == 0))
- return ret;
+ if (control->ras_num_recs == 0 || amdgpu_bad_page_threshold == 0)
+ return 0;
- bps = kcalloc(control->num_recs, sizeof(*bps), GFP_KERNEL);
+ bps = kcalloc(control->ras_num_recs, sizeof(*bps), GFP_KERNEL);
if (!bps)
return -ENOMEM;
- if (amdgpu_ras_eeprom_process_recods(control, bps, false,
- control->num_recs)) {
+ ret = amdgpu_ras_eeprom_read(control, bps, control->ras_num_recs);
+ if (ret)
dev_err(adev->dev, "Failed to load EEPROM table records!");
- ret = -EIO;
- goto out;
- }
-
- ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs);
+ else
+ ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs);
-out:
kfree(bps);
return ret;
}
@@ -1918,11 +1921,9 @@ static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
}
static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
- uint32_t max_length)
+ uint32_t max_count)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
- int tmp_threshold = amdgpu_bad_page_threshold;
- u64 val;
/*
* Justification of value bad_page_cnt_threshold in ras structure
@@ -1943,18 +1944,15 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
* take no effect.
*/
- if (tmp_threshold < -1)
- tmp_threshold = -1;
- else if (tmp_threshold > max_length)
- tmp_threshold = max_length;
+ if (amdgpu_bad_page_threshold < 0) {
+ u64 val = adev->gmc.mc_vram_size;
- if (tmp_threshold == -1) {
- val = adev->gmc.mc_vram_size;
- do_div(val, RAS_BAD_PAGE_RATE);
+ do_div(val, RAS_BAD_PAGE_COVER);
con->bad_page_cnt_threshold = min(lower_32_bits(val),
- max_length);
+ max_count);
} else {
- con->bad_page_cnt_threshold = tmp_threshold;
+ con->bad_page_cnt_threshold = min_t(int, max_count,
+ amdgpu_bad_page_threshold);
}
}
@@ -1962,15 +1960,24 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data **data;
- uint32_t max_eeprom_records_len = 0;
+ u32 max_eeprom_records_count = 0;
bool exc_err_limit = false;
int ret;
- if (adev->ras_enabled && con)
- data = &con->eh_data;
- else
+ if (!con)
return 0;
+ /* Allow access to RAS EEPROM via debugfs, when the ASIC
+ * supports RAS and debugfs is enabled, but when
+ * adev->ras_enabled is unset, i.e. when "ras_enable"
+ * module parameter is set to 0.
+ */
+ con->adev = adev;
+
+ if (!adev->ras_enabled)
+ return 0;
+
+ data = &con->eh_data;
*data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);
if (!*data) {
ret = -ENOMEM;
@@ -1980,10 +1987,9 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
mutex_init(&con->recovery_lock);
INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
atomic_set(&con->in_recovery, 0);
- con->adev = adev;
- max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
- amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
+ max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count();
+ amdgpu_ras_validate_threshold(adev, max_eeprom_records_count);
/* Todo: During test the SMU might fail to read the eeprom through I2C
* when the GPU is pending on XGMI reset during probe time
@@ -1999,13 +2005,13 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
if (exc_err_limit || ret)
goto free;
- if (con->eeprom_control.num_recs) {
+ if (con->eeprom_control.ras_num_recs) {
ret = amdgpu_ras_load_bad_pages(adev);
if (ret)
goto free;
if (adev->smu.ppt_funcs && adev->smu.ppt_funcs->send_hbm_bad_pages_num)
- adev->smu.ppt_funcs->send_hbm_bad_pages_num(&adev->smu, con->eeprom_control.num_recs);
+ adev->smu.ppt_funcs->send_hbm_bad_pages_num(&adev->smu, con->eeprom_control.ras_num_recs);
}
return 0;
@@ -2015,7 +2021,7 @@ free:
kfree(*data);
con->eh_data = NULL;
out:
- dev_warn(adev->dev, "Failed to initialize ras recovery!\n");
+ dev_warn(adev->dev, "Failed to initialize ras recovery! (%d)\n", ret);
/*
* Except error threshold exceeding case, other failure cases in this