diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2021-04-30 12:44:02 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2021-04-30 12:44:02 -0700 |
commit | 95275402f66e88c56144a2d859c13594b651b29b (patch) | |
tree | bf5e06c882703cd2ec7ef9310ba982c1a76faeac /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | |
parent | 65c61de9d090edb8a3cfb3f45541e268eb2cdb13 (diff) | |
parent | 1cd6b4a04f038eb24fd18c8010e763d1140a9c7a (diff) |
Merge tag 'drm-next-2021-04-30' of git://anongit.freedesktop.org/drm/drm
Pull more drm updates from Dave Airlie:
"Looks like I missed a tegra feature request for next, but should still
be fine since it's pretty self contained.
Apart from that got a set of i915 and amdgpu fixes as per usual along
with a few misc fixes.
tegra:
- Tegra186 hardware cursor support
- better capability reporting for different SoC
- better framebuffer modifier support
- host1x fixes
ttm:
- fix unswappable BO handling
efifb:
- check for PCI before using it
amdgpu:
- Fixes for Aldebaran
- Display LTTPR fixes
- eDP fixes
- Fixes for Vangogh
- RAS fixes
- ASPM support
- Renoir SMU fixes
- Modifier fixes
- Misc code cleanups
- Freesync fixes
i915:
- Several fixes to GLK handling in recent display refactoring
- Rare watchdog timer race fix
- Cppcheck redundant condition fix
- Overlay error code propagation fix
- Documentation fix
- gvt: Remove one unused function warning
- gvt: Fix intel_gvt_init_device() return type
- gvt: Remove one duplicated register accessible check"
* tag 'drm-next-2021-04-30' of git://anongit.freedesktop.org/drm/drm: (111 commits)
efifb: Check efifb_pci_dev before using it
drm/i915: Fix docbook descriptions for i915_gem_shrinker
drm/i915: fix an error code in intel_overlay_do_put_image()
drm/i915/display/psr: Fix cppcheck warnings
drm/i915: Disable LTTPR detection on GLK once again
drm/i915: Restore lost glk ccs w/a
drm/i915: Restore lost glk FBC 16bpp w/a
drm/i915: Take request reference before arming the watchdog timer
drm/ttm: fix error handling if no BO can be swapped out v4
drm/i915/gvt: Remove duplicated register accessible check
drm/amdgpu/gmc9: remove dummy read workaround for newer chips
drm/amdgpu: Add mem sync flag for IB allocated by SA
drm/amdgpu: Fix SDMA RAS error reporting on Aldebaran
drm/amdgpu: Reset RAS error count and status regs
Revert "drm/amdgpu: workaround the TMR MC address issue (v2)"
drm/amd/display: 3.2.132
drm/amd/display: [FW Promotion] Release 0.0.62
drm/amd/display: add helper for enabling mst stream features
drm/amd/display: Report Proper Quantization Range in AVI Infoframe
drm/amd/display: Fix call to pass bpp in 16ths of a bit
...
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 82 |
1 files changed, 44 insertions, 38 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 0541196ae1ed..b0d2fc9454ca 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -114,7 +114,7 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre if (amdgpu_ras_check_bad_page(adev, address)) { dev_warn(adev->dev, - "RAS WARN: 0x%llx has been marked as bad page!\n", + "RAS WARN: 0x%llx has already been marked as bad page!\n", address); return 0; } @@ -221,18 +221,17 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, op = 1; else if (sscanf(str, "inject %32s %8s", block_name, err) == 2) op = 2; - else if (sscanf(str, "retire_page") == 0) + else if (strstr(str, "retire_page") != NULL) op = 3; else if (str[0] && str[1] && str[2] && str[3]) /* ascii string, but commands are not matched. */ return -EINVAL; if (op != -1) { - if (op == 3) { - if (sscanf(str, "%*s %llu", &address) != 1) - if (sscanf(str, "%*s 0x%llx", &address) != 1) - return -EINVAL; + if (sscanf(str, "%*s 0x%llx", &address) != 1 && + sscanf(str, "%*s %llu", &address) != 1) + return -EINVAL; data->op = op; data->inject.address = address; @@ -255,11 +254,11 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, data->op = op; if (op == 2) { - if (sscanf(str, "%*s %*s %*s %u %llu %llu", - &sub_block, &address, &value) != 3) - if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx", - &sub_block, &address, &value) != 3) - return -EINVAL; + if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx", + &sub_block, &address, &value) != 3 && + sscanf(str, "%*s %*s %*s %u %llu %llu", + &sub_block, &address, &value) != 3) + return -EINVAL; data->head.sub_block_index = sub_block; data->inject.address = address; data->inject.value = value; @@ -278,7 +277,7 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, /** * DOC: AMDGPU RAS debugfs control interface * - * It accepts struct ras_debug_if who has two members. + * The control interface accepts struct ras_debug_if which has two members. * * First member: ras_debug_if::head or ras_debug_if::inject. * @@ -303,32 +302,33 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, * * How to use the interface? * - * Programs + * In a program * - * Copy the struct ras_debug_if in your codes and initialize it. - * Write the struct to the control node. + * Copy the struct ras_debug_if in your code and initialize it. + * Write the struct to the control interface. * - * Shells + * From shell * * .. code-block:: bash * - * echo op block [error [sub_block address value]] > .../ras/ras_ctrl + * echo "disable <block>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl + * echo "enable <block> <error>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl + * echo "inject <block> <error> <sub-block> <address> <value> > /sys/kernel/debug/dri/<N>/ras/ras_ctrl * - * Parameters: + * Where N, is the card which you want to affect. * - * op: disable, enable, inject - * disable: only block is needed - * enable: block and error are needed - * inject: error, address, value are needed - * block: umc, sdma, gfx, ......... + * "disable" requires only the block. + * "enable" requires the block and error type. + * "inject" requires the block, error type, address, and value. + * The block is one of: umc, sdma, gfx, etc. * see ras_block_string[] for details - * error: ue, ce - * ue: multi_uncorrectable - * ce: single_correctable - * sub_block: - * sub block index, pass 0 if there is no sub block + * The error type is one of: ue, ce, where, + * ue is multi-uncorrectable + * ce is single-correctable + * The sub-block is a the sub-block index, pass 0 if there is no sub-block. + * The address and value are hexadecimal numbers, leading 0x is optional. * - * here are some examples for bash commands: + * For instance, * * .. code-block:: bash * @@ -336,17 +336,17 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, * echo inject umc ce 0 0 0 > /sys/kernel/debug/dri/0/ras/ras_ctrl * echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl * - * How to check the result? + * How to check the result of the operation? * - * For disable/enable, please check ras features at + * To check disable/enable, see "ras" features at, * /sys/class/drm/card[0/1/2...]/device/ras/features * - * For inject, please check corresponding err count at - * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count + * To check inject, see the corresponding error count at, + * /sys/class/drm/card[0/1/2...]/device/ras/[gfx|sdma|umc|...]_err_count * * .. note:: * Operations are only allowed on blocks which are supported. - * Please check ras mask at /sys/module/amdgpu/parameters/ras_mask + * Check the "ras" mask at /sys/module/amdgpu/parameters/ras_mask * to see which blocks support RAS on a particular asic. * */ @@ -367,11 +367,9 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user * if (ret) return -EINVAL; - if (data.op == 3) - { + if (data.op == 3) { ret = amdgpu_reserve_page_direct(adev, data.inject.address); - - if (ret) + if (!ret) return size; else return ret; @@ -503,6 +501,12 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev, if (amdgpu_ras_query_error_status(obj->adev, &info)) return -EINVAL; + + if (obj->adev->asic_type == CHIP_ALDEBARAN) { + if (amdgpu_ras_reset_error_status(obj->adev, info.head.block)) + DRM_WARN("Failed to reset error counter and error status"); + } + return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count, "ce", info.ce_count); } @@ -1269,6 +1273,8 @@ static struct dentry *amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device * &amdgpu_ras_debugfs_ctrl_ops); debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, dir, adev, &amdgpu_ras_debugfs_eeprom_ops); + debugfs_create_u32("bad_page_cnt_threshold", 0444, dir, + &con->bad_page_cnt_threshold); /* * After one uncorrectable error happens, usually GPU recovery will |