diff options
author | Dave Airlie <airlied@redhat.com> | 2021-10-27 10:38:38 +1000 |
---|---|---|
committer | Dave Airlie <airlied@redhat.com> | 2021-10-27 10:38:41 +1000 |
commit | 367fe8dc299c968eabdae890536d55d80ea55e01 (patch) | |
tree | f5c348ea306b554494fc5b5904086a1d1c72eb55 /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | |
parent | 6f2f7c83303d2227f47551423e507d77d9ea01c7 (diff) | |
parent | 41ad36623fabe7d02c9f89aff077dd4c8ba5d602 (diff) |
Merge tag 'amd-drm-next-5.16-2021-10-22' of https://gitlab.freedesktop.org/agd5f/linux into drm-next
amd-drm-next-5.16-2021-10-22:
amdgpu:
- PSP fix for resume
- XGMI fixes
- Interrupt fix in device tear down
- Renoir USB-C DP alt mode fix for resume
- DP 2.0 fixes
- Yellow Carp display fixes
- Misc display fixes
- RAS fixes
- IP Discovery enumeration fixes
- VGH fixes
- SR-IOV fixes
- Revert ChromeOS workaround in display code
- Cyan Skillfish fixes
amdkfd:
- Fix error handling in gpu memory allocation
- Fix build warnings with some configs
- SVM fixes
Signed-off-by: Dave Airlie <airlied@redhat.com>
From: Alex Deucher <alexander.deucher@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20211022183112.4574-1-alexander.deucher@amd.com
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 33 |
1 files changed, 21 insertions, 12 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index e8875351967e..08133de21fdd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -112,7 +112,12 @@ static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con, static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev, uint64_t addr); #ifdef CONFIG_X86_MCE_AMD -static void amdgpu_register_bad_pages_mca_notifier(void); +static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev); +struct mce_notifier_adev_list { + struct amdgpu_device *devs[MAX_GPU_INSTANCE]; + int num_gpu; +}; +static struct mce_notifier_adev_list mce_adev_list; #endif void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready) @@ -2108,7 +2113,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) #ifdef CONFIG_X86_MCE_AMD if ((adev->asic_type == CHIP_ALDEBARAN) && (adev->gmc.xgmi.connected_to_cpu)) - amdgpu_register_bad_pages_mca_notifier(); + amdgpu_register_bad_pages_mca_notifier(adev); #endif return 0; @@ -2605,24 +2610,18 @@ void amdgpu_release_ras_context(struct amdgpu_device *adev) #ifdef CONFIG_X86_MCE_AMD static struct amdgpu_device *find_adev(uint32_t node_id) { - struct amdgpu_gpu_instance *gpu_instance; int i; struct amdgpu_device *adev = NULL; - mutex_lock(&mgpu_info.mutex); - - for (i = 0; i < mgpu_info.num_gpu; i++) { - gpu_instance = &(mgpu_info.gpu_ins[i]); - adev = gpu_instance->adev; + for (i = 0; i < mce_adev_list.num_gpu; i++) { + adev = mce_adev_list.devs[i]; - if (adev->gmc.xgmi.connected_to_cpu && + if (adev && adev->gmc.xgmi.connected_to_cpu && adev->gmc.xgmi.physical_node_id == node_id) break; adev = NULL; } - mutex_unlock(&mgpu_info.mutex); - return adev; } @@ -2718,9 +2717,19 @@ static struct notifier_block amdgpu_bad_page_nb = { .priority = MCE_PRIO_UC, }; -static void amdgpu_register_bad_pages_mca_notifier(void) +static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev) { /* + * Add the adev to the mce_adev_list. + * During mode2 reset, amdgpu device is temporarily + * removed from the mgpu_info list which can cause + * page retirement to fail. + * Use this list instead of mgpu_info to find the amdgpu + * device on which the UMC error was reported. + */ + mce_adev_list.devs[mce_adev_list.num_gpu++] = adev; + + /* * Register the x86 notifier only once * with MCE subsystem. */ |