diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2019-07-15 19:04:27 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2019-07-15 19:04:27 -0700 |
commit | be8454afc50f43016ca8b6130d9673bdd0bd56ec (patch) | |
tree | 897e49c1ccadeed9b083a3ffc13f0dd2d6d7d874 /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | |
parent | fec88ab0af9706b2201e5daf377c5031c62d11f7 (diff) | |
parent | 3729fe2bc2a01f4cc1aa88be8f64af06084c87d6 (diff) |
Merge tag 'drm-next-2019-07-16' of git://anongit.freedesktop.org/drm/drm
Pull drm updates from Dave Airlie:
"The biggest thing in this is the AMD Navi GPU support, this again
contains a bunch of header files that are large. These are the new AMD
RX5700 GPUs that just recently became available.
New drivers:
- ST-Ericsson MCDE driver
- Ingenic JZ47xx SoC
UAPI change:
- HDR source metadata property
Core:
- HDR inforframes and EDID parsing
- drm hdmi infoframe unpacking
- remove prime sg_table caching into dma-buf
- New gem vram helpers to reduce driver code
- Lots of drmP.h removal
- reservation fencing fix
- documentation updates
- drm_fb_helper_connector removed
- mode name command handler rewrite
fbcon:
- Remove the fbcon notifiers
ttm:
- forward progress fixes
dma-buf:
- make mmap call optional
- debugfs refcount fixes
- dma-fence free with pending signals fix
- each dma-buf gets an inode
Panels:
- Lots of additional panel bindings
amdgpu:
- initial navi10 support
- avoid hw reset
- HDR metadata support
- new thermal sensors for vega asics
- RAS fixes
- use HMM rather than MMU notifier
- xgmi topology via kfd
- SR-IOV fixes
- driver reload fixes
- DC use a core bpc attribute
- Aux fixes for DC
- Bandwidth calc updates for DC
- Clock handling refactor
- kfd VEGAM support
vmwgfx:
- Coherent memory support changes
i915:
- HDR Support
- HDMI i2c link
- Icelake multi-segmented gamma support
- GuC firmware update
- Mule Creek Canyon PCH support for EHL
- EHL platform updtes
- move i915.alpha_support to i915.force_probe
- runtime PM refactoring
- VBT parsing refactoring
- DSI fixes
- struct mutex dependency reduction
- GEM code reorg
mali-dp:
- Komeda driver features
msm:
- dsi vs EPROBE_DEFER fixes
- msm8998 snapdragon 835 support
- a540 gpu support
- mdp5 and dpu interconnect support
exynos:
- drmP.h removal
tegra:
- misc fixes
tda998x:
- audio support improvements
- pixel repeated mode support
- quantisation range handling corrections
- HDMI vendor info fix
armada:
- interlace support fix
- overlay/video plane register handling refactor
- add gamma support
rockchip:
- RX3328 support
panfrost:
- expose perf counters via hidden ioctls
vkms:
- enumerate CRC sources list
ast:
- rework BO handling
mgag200:
- rework BO handling
dw-hdmi:
- suspend/resume support
rcar-du:
- R8A774A1 Soc Support
- LVDS dual-link mode support
- Additional formats
- Misc fixes
omapdrm:
- DSI command mode display support
stm
- fb modifier support
- runtime PM support
sun4i:
- use vmap ops
vc4:
- binner bo binding rework
v3d:
- compute shader support
- resync/sync fixes
- job management refactoring
lima:
- NULL pointer in irq handler fix
- scheduler default timeout
virtio:
- fence seqno support
- trace events
bochs:
- misc fixes
tc458767:
- IRQ/HDP handling
sii902x:
- HDMI audio support
atmel-hlcdc:
- misc fixes
meson:
- zpos support"
* tag 'drm-next-2019-07-16' of git://anongit.freedesktop.org/drm/drm: (1815 commits)
Revert "Merge branch 'vmwgfx-next' of git://people.freedesktop.org/~thomash/linux into drm-next"
Revert "mm: adjust apply_to_pfn_range interface for dropped token."
mm: adjust apply_to_pfn_range interface for dropped token.
drm/amdgpu/navi10: add uclk activity sensor
drm/amdgpu: properly guard the generic discovery code
drm/amdgpu: add missing documentation on new module parameters
drm/amdgpu: don't invalidate caches in RELEASE_MEM, only do the writeback
drm/amd/display: avoid 64-bit division
drm/amdgpu/psp11: simplify the ucode register logic
drm/amdgpu: properly guard DC support in navi code
drm/amd/powerplay: vega20: fix uninitialized variable use
drm/amd/display: dcn20: include linux/delay.h
amdgpu: make pmu support optional
drm/amd/powerplay: Zero initialize current_rpm in vega20_get_fan_speed_percent
drm/amd/powerplay: Zero initialize freq in smu_v11_0_get_current_clk_freq
drm/amd/powerplay: Use memset to initialize metrics structs
drm/amdgpu/mes10.1: Fix header guard
drm/amd/powerplay: add temperature sensor support for navi10
drm/amdgpu: fix scheduler timeout calc
drm/amdgpu: Prepare for hmm_range_register API change (v2)
...
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 302 |
1 files changed, 250 insertions, 52 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 22bd21efe6b1..1a4412e47810 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -24,6 +24,8 @@ #include <linux/debugfs.h> #include <linux/list.h> #include <linux/module.h> +#include <linux/uaccess.h> + #include "amdgpu.h" #include "amdgpu_ras.h" #include "amdgpu_atomfirmware.h" @@ -90,6 +92,12 @@ struct ras_manager { struct ras_err_data err_data; }; +struct ras_badpage { + unsigned int bp; + unsigned int size; + unsigned int flags; +}; + const char *ras_error_string[] = { "none", "parity", @@ -118,9 +126,16 @@ const char *ras_block_string[] = { #define ras_err_str(i) (ras_error_string[ffs(i)]) #define ras_block_str(i) (ras_block_string[i]) -#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS 1 +#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS 1 +#define AMDGPU_RAS_FLAG_INIT_NEED_RESET 2 #define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS) +static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev, + uint64_t offset, uint64_t size, + struct amdgpu_bo **bo_ptr); +static int amdgpu_ras_release_vram(struct amdgpu_device *adev, + struct amdgpu_bo **bo_ptr); + static void amdgpu_ras_self_test(struct amdgpu_device *adev) { /* TODO */ @@ -237,8 +252,8 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, return 0; } -/* - * DOC: ras debugfs control interface +/** + * DOC: AMDGPU RAS debugfs control interface * * It accepts struct ras_debug_if who has two members. * @@ -300,6 +315,7 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user * { struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private; struct ras_debug_if data; + struct amdgpu_bo *bo; int ret = 0; ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data); @@ -317,7 +333,17 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user * ret = amdgpu_ras_feature_enable(adev, &data.head, 1); break; case 2: + ret = amdgpu_ras_reserve_vram(adev, + data.inject.address, PAGE_SIZE, &bo); + if (ret) { + /* address was offset, now it is absolute.*/ + data.inject.address += adev->gmc.vram_start; + if (data.inject.address > adev->gmc.vram_end) + break; + } else + data.inject.address = amdgpu_bo_gpu_offset(bo); ret = amdgpu_ras_error_inject(adev, &data.inject); + amdgpu_ras_release_vram(adev, &bo); break; default: ret = -EINVAL; @@ -521,6 +547,8 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev, enable ? "enable":"disable", ras_block_str(head->block), ret); + if (ret == TA_RAS_STATUS__RESET_NEEDED) + return -EAGAIN; return -EINVAL; } @@ -541,16 +569,32 @@ int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev, return -EINVAL; if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) { - /* If ras is enabled by vbios, we set up ras object first in - * both case. For enable, that is all what we need do. For - * disable, we need perform a ras TA disable cmd after that. - */ - ret = __amdgpu_ras_feature_enable(adev, head, 1); - if (ret) - return ret; + if (enable) { + /* There is no harm to issue a ras TA cmd regardless of + * the currecnt ras state. + * If current state == target state, it will do nothing + * But sometimes it requests driver to reset and repost + * with error code -EAGAIN. + */ + ret = amdgpu_ras_feature_enable(adev, head, 1); + /* With old ras TA, we might fail to enable ras. + * Log it and just setup the object. + * TODO need remove this WA in the future. + */ + if (ret == -EINVAL) { + ret = __amdgpu_ras_feature_enable(adev, head, 1); + if (!ret) + DRM_INFO("RAS INFO: %s setup object\n", + ras_block_str(head->block)); + } + } else { + /* setup the object then issue a ras TA disable cmd.*/ + ret = __amdgpu_ras_feature_enable(adev, head, 1); + if (ret) + return ret; - if (!enable) ret = amdgpu_ras_feature_enable(adev, head, 0); + } } else ret = amdgpu_ras_feature_enable(adev, head, enable); @@ -691,6 +735,77 @@ int amdgpu_ras_query_error_count(struct amdgpu_device *adev, /* sysfs begin */ +static int amdgpu_ras_badpages_read(struct amdgpu_device *adev, + struct ras_badpage **bps, unsigned int *count); + +static char *amdgpu_ras_badpage_flags_str(unsigned int flags) +{ + switch (flags) { + case 0: + return "R"; + case 1: + return "P"; + case 2: + default: + return "F"; + }; +} + +/* + * DOC: ras sysfs gpu_vram_bad_pages interface + * + * It allows user to read the bad pages of vram on the gpu through + * /sys/class/drm/card[0/1/2...]/device/ras/gpu_vram_bad_pages + * + * It outputs multiple lines, and each line stands for one gpu page. + * + * The format of one line is below, + * gpu pfn : gpu page size : flags + * + * gpu pfn and gpu page size are printed in hex format. + * flags can be one of below character, + * R: reserved, this gpu page is reserved and not able to use. + * P: pending for reserve, this gpu page is marked as bad, will be reserved + * in next window of page_reserve. + * F: unable to reserve. this gpu page can't be reserved due to some reasons. + * + * examples: + * 0x00000001 : 0x00001000 : R + * 0x00000002 : 0x00001000 : P + */ + +static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f, + struct kobject *kobj, struct bin_attribute *attr, + char *buf, loff_t ppos, size_t count) +{ + struct amdgpu_ras *con = + container_of(attr, struct amdgpu_ras, badpages_attr); + struct amdgpu_device *adev = con->adev; + const unsigned int element_size = + sizeof("0xabcdabcd : 0x12345678 : R\n") - 1; + unsigned int start = div64_ul(ppos + element_size - 1, element_size); + unsigned int end = div64_ul(ppos + count - 1, element_size); + ssize_t s = 0; + struct ras_badpage *bps = NULL; + unsigned int bps_count = 0; + + memset(buf, 0, count); + + if (amdgpu_ras_badpages_read(adev, &bps, &bps_count)) + return 0; + + for (; start < end && start < bps_count; start++) + s += scnprintf(&buf[s], element_size + 1, + "0x%08x : 0x%08x : %1s\n", + bps[start].bp, + bps[start].size, + amdgpu_ras_badpage_flags_str(bps[start].flags)); + + kfree(bps); + + return s; +} + static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev, struct device_attribute *attr, char *buf) { @@ -731,9 +846,14 @@ static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev) &con->features_attr.attr, NULL }; + struct bin_attribute *bin_attrs[] = { + &con->badpages_attr, + NULL + }; struct attribute_group group = { .name = "ras", .attrs = attrs, + .bin_attrs = bin_attrs, }; con->features_attr = (struct device_attribute) { @@ -743,7 +863,19 @@ static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev) }, .show = amdgpu_ras_sysfs_features_read, }; + + con->badpages_attr = (struct bin_attribute) { + .attr = { + .name = "gpu_vram_bad_pages", + .mode = S_IRUGO, + }, + .size = 0, + .private = NULL, + .read = amdgpu_ras_sysfs_badpages_read, + }; + sysfs_attr_init(attrs[0]); + sysfs_bin_attr_init(bin_attrs[0]); return sysfs_create_group(&adev->dev->kobj, &group); } @@ -755,9 +887,14 @@ static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev) &con->features_attr.attr, NULL }; + struct bin_attribute *bin_attrs[] = { + &con->badpages_attr, + NULL + }; struct attribute_group group = { .name = "ras", .attrs = attrs, + .bin_attrs = bin_attrs, }; sysfs_remove_group(&adev->dev->kobj, &group); @@ -833,40 +970,24 @@ static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev) /* sysfs end */ /* debugfs begin */ -static int amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev) +static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct drm_minor *minor = adev->ddev->primary; - struct dentry *root = minor->debugfs_root, *dir; - struct dentry *ent; - dir = debugfs_create_dir("ras", root); - if (IS_ERR(dir)) - return -EINVAL; - - con->dir = dir; - - ent = debugfs_create_file("ras_ctrl", - S_IWUGO | S_IRUGO, con->dir, - adev, &amdgpu_ras_debugfs_ctrl_ops); - if (IS_ERR(ent)) { - debugfs_remove(con->dir); - return -EINVAL; - } - - con->ent = ent; - return 0; + con->dir = debugfs_create_dir("ras", minor->debugfs_root); + con->ent = debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, con->dir, + adev, &amdgpu_ras_debugfs_ctrl_ops); } -int amdgpu_ras_debugfs_create(struct amdgpu_device *adev, +void amdgpu_ras_debugfs_create(struct amdgpu_device *adev, struct ras_fs_if *head) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head); - struct dentry *ent; if (!obj || obj->ent) - return -EINVAL; + return; get_obj(obj); @@ -874,34 +995,25 @@ int amdgpu_ras_debugfs_create(struct amdgpu_device *adev, head->debugfs_name, sizeof(obj->fs_data.debugfs_name)); - ent = debugfs_create_file(obj->fs_data.debugfs_name, - S_IWUGO | S_IRUGO, con->dir, - obj, &amdgpu_ras_debugfs_ops); - - if (IS_ERR(ent)) - return -EINVAL; - - obj->ent = ent; - - return 0; + obj->ent = debugfs_create_file(obj->fs_data.debugfs_name, + S_IWUGO | S_IRUGO, con->dir, obj, + &amdgpu_ras_debugfs_ops); } -int amdgpu_ras_debugfs_remove(struct amdgpu_device *adev, +void amdgpu_ras_debugfs_remove(struct amdgpu_device *adev, struct ras_common_if *head) { struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); if (!obj || !obj->ent) - return 0; + return; debugfs_remove(obj->ent); obj->ent = NULL; put_obj(obj); - - return 0; } -static int amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev) +static void amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_manager *obj, *tmp; @@ -914,8 +1026,6 @@ static int amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev) debugfs_remove(con->dir); con->dir = NULL; con->ent = NULL; - - return 0; } /* debugfs end */ @@ -1089,6 +1199,53 @@ static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev) /* ih end */ /* recovery begin */ + +/* return 0 on success. + * caller need free bps. + */ +static int amdgpu_ras_badpages_read(struct amdgpu_device *adev, + struct ras_badpage **bps, unsigned int *count) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct ras_err_handler_data *data; + int i = 0; + int ret = 0; + + if (!con || !con->eh_data || !bps || !count) + return -EINVAL; + + mutex_lock(&con->recovery_lock); + data = con->eh_data; + if (!data || data->count == 0) { + *bps = NULL; + goto out; + } + + *bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL); + if (!*bps) { + ret = -ENOMEM; + goto out; + } + + for (; i < data->count; i++) { + (*bps)[i] = (struct ras_badpage){ + .bp = data->bps[i].bp, + .size = AMDGPU_GPU_PAGE_SIZE, + .flags = 0, + }; + + if (data->last_reserved <= i) + (*bps)[i].flags = 1; + else if (data->bps[i].bo == NULL) + (*bps)[i].flags = 2; + } + + *count = data->count; +out: + mutex_unlock(&con->recovery_lock); + return ret; +} + static void amdgpu_ras_do_recovery(struct work_struct *work) { struct amdgpu_ras *ras = @@ -1340,6 +1497,19 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) } /* recovery end */ +/* return 0 if ras will reset gpu and repost.*/ +int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev, + unsigned int block) +{ + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + + if (!ras) + return -EINVAL; + + ras->flags |= AMDGPU_RAS_FLAG_INIT_NEED_RESET; + return 0; +} + /* * check hardware's ras ability which will be saved in hw_supported. * if hardware does not support ras, we can skip some ras initializtion and @@ -1415,8 +1585,10 @@ recovery_out: return -EINVAL; } -/* do some init work after IP late init as dependence */ -void amdgpu_ras_post_init(struct amdgpu_device *adev) +/* do some init work after IP late init as dependence. + * and it runs in resume/gpu reset/booting up cases. + */ +void amdgpu_ras_resume(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_manager *obj, *tmp; @@ -1444,6 +1616,32 @@ void amdgpu_ras_post_init(struct amdgpu_device *adev) } } } + + if (con->flags & AMDGPU_RAS_FLAG_INIT_NEED_RESET) { + con->flags &= ~AMDGPU_RAS_FLAG_INIT_NEED_RESET; + /* setup ras obj state as disabled. + * for init_by_vbios case. + * if we want to enable ras, just enable it in a normal way. + * If we want do disable it, need setup ras obj as enabled, + * then issue another TA disable cmd. + * See feature_enable_on_boot + */ + amdgpu_ras_disable_all_features(adev, 1); + amdgpu_ras_reset_gpu(adev, 0); + } +} + +void amdgpu_ras_suspend(struct amdgpu_device *adev) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + + if (!con) + return; + + amdgpu_ras_disable_all_features(adev, 0); + /* Make sure all ras objects are disabled. */ + if (con->features) + amdgpu_ras_disable_all_features(adev, 1); } /* do some fini work before IP fini as dependence */ |