summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-04-30 12:44:02 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2021-04-30 12:44:02 -0700
commit95275402f66e88c56144a2d859c13594b651b29b (patch)
treebf5e06c882703cd2ec7ef9310ba982c1a76faeac /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
parent65c61de9d090edb8a3cfb3f45541e268eb2cdb13 (diff)
parent1cd6b4a04f038eb24fd18c8010e763d1140a9c7a (diff)
Merge tag 'drm-next-2021-04-30' of git://anongit.freedesktop.org/drm/drm
Pull more drm updates from Dave Airlie: "Looks like I missed a tegra feature request for next, but should still be fine since it's pretty self contained. Apart from that got a set of i915 and amdgpu fixes as per usual along with a few misc fixes. tegra: - Tegra186 hardware cursor support - better capability reporting for different SoC - better framebuffer modifier support - host1x fixes ttm: - fix unswappable BO handling efifb: - check for PCI before using it amdgpu: - Fixes for Aldebaran - Display LTTPR fixes - eDP fixes - Fixes for Vangogh - RAS fixes - ASPM support - Renoir SMU fixes - Modifier fixes - Misc code cleanups - Freesync fixes i915: - Several fixes to GLK handling in recent display refactoring - Rare watchdog timer race fix - Cppcheck redundant condition fix - Overlay error code propagation fix - Documentation fix - gvt: Remove one unused function warning - gvt: Fix intel_gvt_init_device() return type - gvt: Remove one duplicated register accessible check" * tag 'drm-next-2021-04-30' of git://anongit.freedesktop.org/drm/drm: (111 commits) efifb: Check efifb_pci_dev before using it drm/i915: Fix docbook descriptions for i915_gem_shrinker drm/i915: fix an error code in intel_overlay_do_put_image() drm/i915/display/psr: Fix cppcheck warnings drm/i915: Disable LTTPR detection on GLK once again drm/i915: Restore lost glk ccs w/a drm/i915: Restore lost glk FBC 16bpp w/a drm/i915: Take request reference before arming the watchdog timer drm/ttm: fix error handling if no BO can be swapped out v4 drm/i915/gvt: Remove duplicated register accessible check drm/amdgpu/gmc9: remove dummy read workaround for newer chips drm/amdgpu: Add mem sync flag for IB allocated by SA drm/amdgpu: Fix SDMA RAS error reporting on Aldebaran drm/amdgpu: Reset RAS error count and status regs Revert "drm/amdgpu: workaround the TMR MC address issue (v2)" drm/amd/display: 3.2.132 drm/amd/display: [FW Promotion] Release 0.0.62 drm/amd/display: add helper for enabling mst stream features drm/amd/display: Report Proper Quantization Range in AVI Infoframe drm/amd/display: Fix call to pass bpp in 16ths of a bit ...
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c82
1 files changed, 44 insertions, 38 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 0541196ae1ed..b0d2fc9454ca 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -114,7 +114,7 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre
if (amdgpu_ras_check_bad_page(adev, address)) {
dev_warn(adev->dev,
- "RAS WARN: 0x%llx has been marked as bad page!\n",
+ "RAS WARN: 0x%llx has already been marked as bad page!\n",
address);
return 0;
}
@@ -221,18 +221,17 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
op = 1;
else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
op = 2;
- else if (sscanf(str, "retire_page") == 0)
+ else if (strstr(str, "retire_page") != NULL)
op = 3;
else if (str[0] && str[1] && str[2] && str[3])
/* ascii string, but commands are not matched. */
return -EINVAL;
if (op != -1) {
-
if (op == 3) {
- if (sscanf(str, "%*s %llu", &address) != 1)
- if (sscanf(str, "%*s 0x%llx", &address) != 1)
- return -EINVAL;
+ if (sscanf(str, "%*s 0x%llx", &address) != 1 &&
+ sscanf(str, "%*s %llu", &address) != 1)
+ return -EINVAL;
data->op = op;
data->inject.address = address;
@@ -255,11 +254,11 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
data->op = op;
if (op == 2) {
- if (sscanf(str, "%*s %*s %*s %u %llu %llu",
- &sub_block, &address, &value) != 3)
- if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx",
- &sub_block, &address, &value) != 3)
- return -EINVAL;
+ if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx",
+ &sub_block, &address, &value) != 3 &&
+ sscanf(str, "%*s %*s %*s %u %llu %llu",
+ &sub_block, &address, &value) != 3)
+ return -EINVAL;
data->head.sub_block_index = sub_block;
data->inject.address = address;
data->inject.value = value;
@@ -278,7 +277,7 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
/**
* DOC: AMDGPU RAS debugfs control interface
*
- * It accepts struct ras_debug_if who has two members.
+ * The control interface accepts struct ras_debug_if which has two members.
*
* First member: ras_debug_if::head or ras_debug_if::inject.
*
@@ -303,32 +302,33 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
*
* How to use the interface?
*
- * Programs
+ * In a program
*
- * Copy the struct ras_debug_if in your codes and initialize it.
- * Write the struct to the control node.
+ * Copy the struct ras_debug_if in your code and initialize it.
+ * Write the struct to the control interface.
*
- * Shells
+ * From shell
*
* .. code-block:: bash
*
- * echo op block [error [sub_block address value]] > .../ras/ras_ctrl
+ * echo "disable <block>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
+ * echo "enable <block> <error>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
+ * echo "inject <block> <error> <sub-block> <address> <value> > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
*
- * Parameters:
+ * Where N, is the card which you want to affect.
*
- * op: disable, enable, inject
- * disable: only block is needed
- * enable: block and error are needed
- * inject: error, address, value are needed
- * block: umc, sdma, gfx, .........
+ * "disable" requires only the block.
+ * "enable" requires the block and error type.
+ * "inject" requires the block, error type, address, and value.
+ * The block is one of: umc, sdma, gfx, etc.
* see ras_block_string[] for details
- * error: ue, ce
- * ue: multi_uncorrectable
- * ce: single_correctable
- * sub_block:
- * sub block index, pass 0 if there is no sub block
+ * The error type is one of: ue, ce, where,
+ * ue is multi-uncorrectable
+ * ce is single-correctable
+ * The sub-block is a the sub-block index, pass 0 if there is no sub-block.
+ * The address and value are hexadecimal numbers, leading 0x is optional.
*
- * here are some examples for bash commands:
+ * For instance,
*
* .. code-block:: bash
*
@@ -336,17 +336,17 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
* echo inject umc ce 0 0 0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
* echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl
*
- * How to check the result?
+ * How to check the result of the operation?
*
- * For disable/enable, please check ras features at
+ * To check disable/enable, see "ras" features at,
* /sys/class/drm/card[0/1/2...]/device/ras/features
*
- * For inject, please check corresponding err count at
- * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
+ * To check inject, see the corresponding error count at,
+ * /sys/class/drm/card[0/1/2...]/device/ras/[gfx|sdma|umc|...]_err_count
*
* .. note::
* Operations are only allowed on blocks which are supported.
- * Please check ras mask at /sys/module/amdgpu/parameters/ras_mask
+ * Check the "ras" mask at /sys/module/amdgpu/parameters/ras_mask
* to see which blocks support RAS on a particular asic.
*
*/
@@ -367,11 +367,9 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
if (ret)
return -EINVAL;
- if (data.op == 3)
- {
+ if (data.op == 3) {
ret = amdgpu_reserve_page_direct(adev, data.inject.address);
-
- if (ret)
+ if (!ret)
return size;
else
return ret;
@@ -503,6 +501,12 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
if (amdgpu_ras_query_error_status(obj->adev, &info))
return -EINVAL;
+
+ if (obj->adev->asic_type == CHIP_ALDEBARAN) {
+ if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
+ DRM_WARN("Failed to reset error counter and error status");
+ }
+
return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count,
"ce", info.ce_count);
}
@@ -1269,6 +1273,8 @@ static struct dentry *amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *
&amdgpu_ras_debugfs_ctrl_ops);
debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, dir, adev,
&amdgpu_ras_debugfs_eeprom_ops);
+ debugfs_create_u32("bad_page_cnt_threshold", 0444, dir,
+ &con->bad_page_cnt_threshold);
/*
* After one uncorrectable error happens, usually GPU recovery will