summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-07-30 19:26:49 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2025-07-30 19:26:49 -0700
commit260f6f4fda93c8485c8037865c941b42b9cba5d2 (patch)
tree587a0ea46d3351f63250d19860b01da8217ac774 /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
parent63eb28bb1402891b1ad2be02a530f29a9dd7f1cd (diff)
parent711fa2667d8b230ff31f1855d0f25e3263268d8a (diff)
Merge tag 'drm-next-2025-07-30' of https://gitlab.freedesktop.org/drm/kernel
Pull drm updates from Dave Airlie: "Highlights: - Intel xe enable Panthor Lake, started adding WildCat Lake - amdgpu has a bunch of reset improvments along with the usual IP updates - msm got VM_BIND support which is important for vulkan sparse memory - more drm_panic users - gpusvm common code to handle a bunch of core SVM work outside drivers. Detail summary: Changes outside drm subdirectory: - 'shrink_shmem_memory()' for better shmem/hibernate interaction - Rust support infrastructure: - make ETIMEDOUT available - add size constants up to SZ_2G - add DMA coherent allocation bindings - mtd driver for Intel GPU non-volatile storage - i2c designware quirk for Intel xe core: - atomic helpers: tune enable/disable sequences - add task info to wedge API - refactor EDID quirks - connector: move HDR sink to drm_display_info - fourcc: half-float and 32-bit float formats - mode_config: pass format info to simplify dma-buf: - heaps: Give CMA heap a stable name ci: - add device tree validation and kunit displayport: - change AUX DPCD access probe address - add quirk for DPCD probe - add panel replay definitions - backlight control helpers fbdev: - make CONFIG_FIRMWARE_EDID available on all arches fence: - fix UAF issues format-helper: - improve tests gpusvm: - introduce devmem only flag for allocation - add timeslicing support to GPU SVM ttm: - improve eviction sched: - tracing improvements - kunit improvements - memory leak fixes - reset handling improvements color mgmt: - add hardware gamma LUT handling helpers bridge: - add destroy hook - switch to reference counted drm_bridge allocations - tc358767: convert to devm_drm_bridge_alloc - improve CEC handling panel: - switch to reference counter drm_panel allocations - fwnode panel lookup - Huiling hl055fhv028c support - Raspberry Pi 7" 720x1280 support - edp: KDC KD116N3730A05, N160JCE-ELL CMN, N116BCJ-EAK - simple: AUO P238HAN01 - st7701: Winstar wf40eswaa6mnn0 - visionox: rm69299-shift - Renesas R61307, Renesas R69328 support - DJN HX83112B hdmi: - add CEC handling - YUV420 output support xe: - WildCat Lake support - Enable PanthorLake by default - mark BMG as SRIOV capable - update firmware recommendations - Expose media OA units - aux-bux support for non-volatile memory - MTD intel-dg driver for non-volatile memory - Expose fan control and voltage regulator in sysfs - restructure migration for multi-device - Restore GuC submit UAF fix - make GEM shrinker drm managed - SRIOV VF Post-migration recovery of GGTT nodes - W/A additions/reworks - Prefetch support for svm ranges - Don't allocate managed BO for each policy change - HWMON fixes for BMG - Create LRC BO without VM - PCI ID updates - make SLPC debugfs files optional - rework eviction rejection of bound external BOs - consolidate PAT programming logic for pre/post Xe2 - init changes for flicker-free boot - Enable GuC Dynamic Inhibit Context switch i915: - drm_panic support for i915/xe - initial flip queue off by default for LNL/PNL - Wildcat Lake Display support - Support for DSC fractional link bpp - Support for simultaneous Panel Replay and Adaptive sync - Support for PTL+ double buffer LUT - initial PIPEDMC event handling - drm_panel_follower support - DPLL interface renames - allocate struct intel_display dynamically - flip queue preperation - abstract DRAM detection better - avoid GuC scheduling stalls - remove DG1 force probe requirement - fix MEI interrupt handler on RT kernels - use backlight control helpers for eDP - more shared display code refactoring amdgpu: - add userq slot to INFO ioctl - SR-IOV hibernation support - Suspend improvements - Backlight improvements - Use scaling for non-native eDP modes - cleaner shader updates for GC 9.x - Remove fence slab - SDMA fw checks for userq support - RAS updates - DMCUB updates - DP tunneling fixes - Display idle D3 support - Per queue reset improvements - initial smartmux support amdkfd: - enable KFD on loongarch - mtype fix for ext coherent system memory radeon: - CS validation additional GL extensions - drop console lock during suspend/resume - bump driver version msm: - VM BIND support - CI: infrastructure updates - UBWC single source of truth - decouple GPU and KMS support - DP: rework I/O accessors - DPU: SM8750 support - DSI: SM8750 support - GPU: X1-45 support and speedbin support for X1-85 - MDSS: SM8750 support nova: - register! macro improvements - DMA object abstraction - VBIOS parser + fwsec lookup - sysmem flush page support - falcon: generic falcon boot code and HAL - FWSEC-FRTS: fb setup and load/execute ivpu: - Add Wildcat Lake support - Add turbo flag ast: - improve hardware generations implementation imx: - IMX8qxq Display Controller support lima: - Rockchip RK3528 GPU support nouveau: - fence handling cleanup panfrost: - MT8370 support - bo labeling - 64-bit register access qaic: - add RAS support rockchip: - convert inno_hdmi to a bridge rz-du: - add RZ/V2H(P) support - MIPI-DSI DCS support sitronix: - ST7567 support sun4i: - add H616 support tidss: - add TI AM62L support - AM65x OLDI bridge support bochs: - drm panic support vkms: - YUV and R* format support - use faux device vmwgfx: - fence improvements hyperv: - move out of simple - add drm_panic support" * tag 'drm-next-2025-07-30' of https://gitlab.freedesktop.org/drm/kernel: (1479 commits) drm/tidss: oldi: convert to devm_drm_bridge_alloc() API drm/tidss: encoder: convert to devm_drm_bridge_alloc() drm/amdgpu: move reset support type checks into the caller drm/amdgpu/sdma7: re-emit unprocessed state on ring reset drm/amdgpu/sdma6: re-emit unprocessed state on ring reset drm/amdgpu/sdma5.2: re-emit unprocessed state on ring reset drm/amdgpu/sdma5: re-emit unprocessed state on ring reset drm/amdgpu/gfx12: re-emit unprocessed state on ring reset drm/amdgpu/gfx11: re-emit unprocessed state on ring reset drm/amdgpu/gfx10: re-emit unprocessed state on ring reset drm/amdgpu/gfx9.4.3: re-emit unprocessed state on kcq reset drm/amdgpu/gfx9: re-emit unprocessed state on kcq reset drm/amdgpu: Add WARN_ON to the resource clear function drm/amd/pm: Use cached metrics data on SMUv13.0.6 drm/amd/pm: Use cached data for min/max clocks gpu: nova-core: fix bounds check in PmuLookupTableEntry::new drm/amdgpu: Replace HQD terminology with slots naming drm/amdgpu: Add user queue instance count in HW IP info drm/amd/amdgpu: Add helper functions for isp buffers drm/amd/amdgpu: Initialize swnode for ISP MFD device ...
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c55
1 files changed, 38 insertions, 17 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 9c8829bd5a58..540817e296da 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1107,6 +1107,9 @@ static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
err_info->de_count, blk_name);
}
} else {
+ if (adev->debug_disable_ce_logs)
+ return;
+
for_each_ras_error(err_node, err_data) {
err_info = &err_node->err_info;
mcm_info = &err_info->mcm_info;
@@ -2854,6 +2857,13 @@ static int __amdgpu_ras_convert_rec_array_from_rom(struct amdgpu_device *adev,
if (amdgpu_umc_pages_in_a_row(adev, err_data,
bps[0].retired_page << AMDGPU_GPU_PAGE_SHIFT))
return -EINVAL;
+ for (i = 0; i < adev->umc.retire_unit; i++) {
+ err_data->err_addr[i].address = bps[0].address;
+ err_data->err_addr[i].mem_channel = bps[0].mem_channel;
+ err_data->err_addr[i].bank = bps[0].bank;
+ err_data->err_addr[i].err_type = bps[0].err_type;
+ err_data->err_addr[i].mcumc_id = bps[0].mcumc_id;
+ }
} else {
if (amdgpu_ras_mca2pa_by_idx(adev, &bps[0], err_data))
return -EINVAL;
@@ -2885,6 +2895,7 @@ static int __amdgpu_ras_convert_rec_from_rom(struct amdgpu_device *adev,
struct eeprom_table_record *bps, struct ras_err_data *err_data,
enum amdgpu_memory_partition nps)
{
+ int i = 0;
enum amdgpu_memory_partition save_nps;
save_nps = (bps->retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK;
@@ -2894,6 +2905,13 @@ static int __amdgpu_ras_convert_rec_from_rom(struct amdgpu_device *adev,
if (amdgpu_umc_pages_in_a_row(adev, err_data,
bps->retired_page << AMDGPU_GPU_PAGE_SHIFT))
return -EINVAL;
+ for (i = 0; i < adev->umc.retire_unit; i++) {
+ err_data->err_addr[i].address = bps->address;
+ err_data->err_addr[i].mem_channel = bps->mem_channel;
+ err_data->err_addr[i].bank = bps->bank;
+ err_data->err_addr[i].err_type = bps->err_type;
+ err_data->err_addr[i].mcumc_id = bps->mcumc_id;
+ }
} else {
if (bps->address) {
if (amdgpu_ras_mca2pa_by_idx(adev, bps, err_data))
@@ -3003,6 +3021,15 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
return 0;
}
+ if (!con->eeprom_control.is_eeprom_valid) {
+ dev_warn(adev->dev,
+ "Failed to save EEPROM table data because of EEPROM data corruption!");
+ if (new_cnt)
+ *new_cnt = 0;
+
+ return 0;
+ }
+
mutex_lock(&con->recovery_lock);
control = &con->eeprom_control;
data = con->eh_data;
@@ -3294,7 +3321,6 @@ static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
uint64_t de_queried_count;
uint32_t new_detect_count, total_detect_count;
uint32_t need_query_count = poison_creation_count;
- bool query_data_timeout = false;
enum ras_event_type type = RAS_EVENT_TYPE_POISON_CREATION;
memset(&info, 0, sizeof(info));
@@ -3323,21 +3349,13 @@ static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;
if (timeout) {
- if (!--timeout) {
- query_data_timeout = true;
+ if (!--timeout)
break;
- }
msleep(1);
}
}
} while (total_detect_count < need_query_count);
- if (query_data_timeout) {
- dev_warn(adev->dev, "Can't find deferred error! count: %u\n",
- (need_query_count - total_detect_count));
- return -ENOENT;
- }
-
if (total_detect_count)
schedule_delayed_work(&ras->page_retirement_dwork, 0);
@@ -3488,8 +3506,7 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
control = &con->eeprom_control;
ret = amdgpu_ras_eeprom_init(control);
- if (ret)
- return ret;
+ control->is_eeprom_valid = !ret;
if (!adev->umc.ras || !adev->umc.ras->convert_ras_err_addr)
control->ras_num_pa_recs = control->ras_num_recs;
@@ -3498,10 +3515,12 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
adev->umc.ras->get_retire_flip_bits)
adev->umc.ras->get_retire_flip_bits(adev);
- if (control->ras_num_recs) {
+ if (control->ras_num_recs && control->is_eeprom_valid) {
ret = amdgpu_ras_load_bad_pages(adev);
- if (ret)
- return ret;
+ if (ret) {
+ control->is_eeprom_valid = false;
+ return 0;
+ }
amdgpu_dpm_send_hbm_bad_pages_num(
adev, control->ras_num_bad_pages);
@@ -3520,7 +3539,7 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
dev_warn(adev->dev, "Failed to format RAS EEPROM data in V3 version!\n");
}
- return ret;
+ return 0;
}
int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info)
@@ -4414,8 +4433,10 @@ void amdgpu_ras_clear_err_state(struct amdgpu_device *adev)
struct amdgpu_ras *ras;
ras = amdgpu_ras_get_context(adev);
- if (ras)
+ if (ras) {
ras->ras_err_state = 0;
+ ras->gpu_reset_flags = 0;
+ }
}
void amdgpu_ras_set_err_poison(struct amdgpu_device *adev,