From ab29ac57ad0b3ab0be7c7635e585651da9f2cd2c Mon Sep 17 00:00:00 2001 From: Lang Yu Date: Thu, 12 Oct 2023 17:48:40 +0800 Subject: drm/amdgpu/umsch: add suspend and resume callback Add missing IP callbacks. Signed-off-by: Lang Yu Acked-by: Alex Deucher Reviewed-by: Veerabadhran Gopalakrishnan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_umsch_mm.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umsch_mm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umsch_mm.c index 1c523eb5f342..ca45ba8ac171 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umsch_mm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umsch_mm.c @@ -843,6 +843,20 @@ static int umsch_mm_hw_fini(void *handle) return 0; } +static int umsch_mm_suspend(void *handle) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)handle; + + return umsch_mm_hw_fini(adev); +} + +static int umsch_mm_resume(void *handle) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)handle; + + return umsch_mm_hw_init(adev); +} + static const struct amd_ip_funcs umsch_mm_v4_0_ip_funcs = { .name = "umsch_mm_v4_0", .early_init = umsch_mm_early_init, @@ -851,6 +865,8 @@ static const struct amd_ip_funcs umsch_mm_v4_0_ip_funcs = { .sw_fini = umsch_mm_sw_fini, .hw_init = umsch_mm_hw_init, .hw_fini = umsch_mm_hw_fini, + .suspend = umsch_mm_suspend, + .resume = umsch_mm_resume, }; const struct amdgpu_ip_block_version umsch_mm_v4_0_ip_block = { -- cgit From 2d955a06a5db7388d177fe0d3ce638e7d7b90a16 Mon Sep 17 00:00:00 2001 From: Mangesh Gadre Date: Wed, 11 Oct 2023 16:33:17 +0800 Subject: Revert "drm/amdgpu: Program xcp_ctl registers as needed" This reverts commit 0bdebfef3fb2b6291000765eaa9c6c8030293fce. XCP_CTL register is programmed by firmware and register access is protected. Signed-off-by: Mangesh Gadre Reviewed-by: Lijo Lazar Reviewed-by: Asad Kamal Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c index db179d085efa..a1c2c952d882 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c @@ -623,7 +623,7 @@ static int gfx_v9_4_3_switch_compute_partition(struct amdgpu_device *adev, int num_xccs_per_xcp) { int ret, i, num_xcc; - u32 tmp = 0, regval; + u32 tmp = 0; if (adev->psp.funcs) { ret = psp_spatial_partition(&adev->psp, @@ -631,24 +631,23 @@ static int gfx_v9_4_3_switch_compute_partition(struct amdgpu_device *adev, num_xccs_per_xcp); if (ret) return ret; - } - - num_xcc = NUM_XCC(adev->gfx.xcc_mask); + } else { + num_xcc = NUM_XCC(adev->gfx.xcc_mask); - for (i = 0; i < num_xcc; i++) { - tmp = REG_SET_FIELD(tmp, CP_HYP_XCP_CTL, NUM_XCC_IN_XCP, - num_xccs_per_xcp); - tmp = REG_SET_FIELD(tmp, CP_HYP_XCP_CTL, VIRTUAL_XCC_ID, - i % num_xccs_per_xcp); - regval = RREG32_SOC15(GC, GET_INST(GC, i), regCP_HYP_XCP_CTL); - if (regval != tmp) + for (i = 0; i < num_xcc; i++) { + tmp = REG_SET_FIELD(tmp, CP_HYP_XCP_CTL, NUM_XCC_IN_XCP, + num_xccs_per_xcp); + tmp = REG_SET_FIELD(tmp, CP_HYP_XCP_CTL, VIRTUAL_XCC_ID, + i % num_xccs_per_xcp); WREG32_SOC15(GC, GET_INST(GC, i), regCP_HYP_XCP_CTL, tmp); + } + ret = 0; } adev->gfx.num_xcc_per_xcp = num_xccs_per_xcp; - return 0; + return ret; } static int gfx_v9_4_3_ih_to_xcc_inst(struct amdgpu_device *adev, int ih_node) -- cgit From 53dd920c1f471a5763c660a7b94fe0aaf746d357 Mon Sep 17 00:00:00 2001 From: Asad Kamal Date: Thu, 5 Oct 2023 15:40:42 +0800 Subject: drm/amdgpu : Add hive ras recovery check If one of the devices in the hive detects a fatal error, need to send ras recovery reset message to PMFW of all devices in the hive. For that add a flag in hive to indicate that it's undergoing ras recovery Signed-off-by: Asad Kamal Reviewed-by: Hawking Zhang Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 9 +++++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 1 + 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 973073e07b2a..c7f8dcb3b4d2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2140,9 +2140,11 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) struct amdgpu_device *remote_adev = NULL; struct amdgpu_device *adev = ras->adev; struct list_head device_list, *device_list_handle = NULL; + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); + if (hive) + atomic_set(&hive->ras_recovery, 1); if (!ras->disable_ras_err_cnt_harvest) { - struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); /* Build list of devices to query RAS related errors */ if (hive && adev->gmc.xgmi.num_physical_nodes > 1) { @@ -2159,7 +2161,6 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) amdgpu_ras_log_on_err_counter(remote_adev); } - amdgpu_put_xgmi_hive(hive); } if (amdgpu_device_should_recover_gpu(ras->adev)) { @@ -2194,6 +2195,10 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context); } atomic_set(&ras->in_recovery, 0); + if (hive) { + atomic_set(&hive->ras_recovery, 0); + amdgpu_put_xgmi_hive(hive); + } } /* alloc/realloc bps array */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h index 86fbf56938f4..6cab882e8061 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h @@ -44,6 +44,7 @@ struct amdgpu_hive_info { struct amdgpu_reset_domain *reset_domain; uint32_t device_remove_count; + atomic_t ras_recovery; }; struct amdgpu_pcs_ras_field { -- cgit From 28ab9a02b6cf3323c677e75045141d1d24631385 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 11 Oct 2023 09:49:14 -0400 Subject: drm/amdgpu/mes11: remove aggregated doorbell code It's not enabled in hardware so the code is dead. Remove it. Reviewed-by: Jack Xiao Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 86 ++++++---------------------------- drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 56 ---------------------- drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c | 83 ++++++++++---------------------- 3 files changed, 40 insertions(+), 185 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index 27b224b0688a..91c07ab4f14e 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -5170,45 +5170,17 @@ static u64 gfx_v11_0_ring_get_wptr_gfx(struct amdgpu_ring *ring) static void gfx_v11_0_ring_set_wptr_gfx(struct amdgpu_ring *ring) { struct amdgpu_device *adev = ring->adev; - uint32_t *wptr_saved; - uint32_t *is_queue_unmap; - uint64_t aggregated_db_index; - uint32_t mqd_size = adev->mqds[AMDGPU_HW_IP_GFX].mqd_size; - uint64_t wptr_tmp; - if (ring->is_mes_queue) { - wptr_saved = (uint32_t *)(ring->mqd_ptr + mqd_size); - is_queue_unmap = (uint32_t *)(ring->mqd_ptr + mqd_size + - sizeof(uint32_t)); - aggregated_db_index = - amdgpu_mes_get_aggregated_doorbell_index(adev, - ring->hw_prio); - - wptr_tmp = ring->wptr & ring->buf_mask; - atomic64_set((atomic64_t *)ring->wptr_cpu_addr, wptr_tmp); - *wptr_saved = wptr_tmp; - /* assume doorbell always being used by mes mapped queue */ - if (*is_queue_unmap) { - WDOORBELL64(aggregated_db_index, wptr_tmp); - WDOORBELL64(ring->doorbell_index, wptr_tmp); - } else { - WDOORBELL64(ring->doorbell_index, wptr_tmp); - - if (*is_queue_unmap) - WDOORBELL64(aggregated_db_index, wptr_tmp); - } + if (ring->use_doorbell) { + /* XXX check if swapping is necessary on BE */ + atomic64_set((atomic64_t *)ring->wptr_cpu_addr, + ring->wptr); + WDOORBELL64(ring->doorbell_index, ring->wptr); } else { - if (ring->use_doorbell) { - /* XXX check if swapping is necessary on BE */ - atomic64_set((atomic64_t *)ring->wptr_cpu_addr, - ring->wptr); - WDOORBELL64(ring->doorbell_index, ring->wptr); - } else { - WREG32_SOC15(GC, 0, regCP_RB0_WPTR, - lower_32_bits(ring->wptr)); - WREG32_SOC15(GC, 0, regCP_RB0_WPTR_HI, - upper_32_bits(ring->wptr)); - } + WREG32_SOC15(GC, 0, regCP_RB0_WPTR, + lower_32_bits(ring->wptr)); + WREG32_SOC15(GC, 0, regCP_RB0_WPTR_HI, + upper_32_bits(ring->wptr)); } } @@ -5233,42 +5205,14 @@ static u64 gfx_v11_0_ring_get_wptr_compute(struct amdgpu_ring *ring) static void gfx_v11_0_ring_set_wptr_compute(struct amdgpu_ring *ring) { struct amdgpu_device *adev = ring->adev; - uint32_t *wptr_saved; - uint32_t *is_queue_unmap; - uint64_t aggregated_db_index; - uint32_t mqd_size = adev->mqds[AMDGPU_HW_IP_COMPUTE].mqd_size; - uint64_t wptr_tmp; - if (ring->is_mes_queue) { - wptr_saved = (uint32_t *)(ring->mqd_ptr + mqd_size); - is_queue_unmap = (uint32_t *)(ring->mqd_ptr + mqd_size + - sizeof(uint32_t)); - aggregated_db_index = - amdgpu_mes_get_aggregated_doorbell_index(adev, - ring->hw_prio); - - wptr_tmp = ring->wptr & ring->buf_mask; - atomic64_set((atomic64_t *)ring->wptr_cpu_addr, wptr_tmp); - *wptr_saved = wptr_tmp; - /* assume doorbell always used by mes mapped queue */ - if (*is_queue_unmap) { - WDOORBELL64(aggregated_db_index, wptr_tmp); - WDOORBELL64(ring->doorbell_index, wptr_tmp); - } else { - WDOORBELL64(ring->doorbell_index, wptr_tmp); - - if (*is_queue_unmap) - WDOORBELL64(aggregated_db_index, wptr_tmp); - } + /* XXX check if swapping is necessary on BE */ + if (ring->use_doorbell) { + atomic64_set((atomic64_t *)ring->wptr_cpu_addr, + ring->wptr); + WDOORBELL64(ring->doorbell_index, ring->wptr); } else { - /* XXX check if swapping is necessary on BE */ - if (ring->use_doorbell) { - atomic64_set((atomic64_t *)ring->wptr_cpu_addr, - ring->wptr); - WDOORBELL64(ring->doorbell_index, ring->wptr); - } else { - BUG(); /* only DOORBELL method supported on gfx11 now */ - } + BUG(); /* only DOORBELL method supported on gfx11 now */ } } diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c index 31b26e6f0b30..4dfec56e1b7f 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c @@ -414,60 +414,6 @@ static int mes_v11_0_set_hw_resources(struct amdgpu_mes *mes) offsetof(union MESAPI_SET_HW_RESOURCES, api_status)); } -static void mes_v11_0_init_aggregated_doorbell(struct amdgpu_mes *mes) -{ - struct amdgpu_device *adev = mes->adev; - uint32_t data; - - data = RREG32_SOC15(GC, 0, regCP_MES_DOORBELL_CONTROL1); - data &= ~(CP_MES_DOORBELL_CONTROL1__DOORBELL_OFFSET_MASK | - CP_MES_DOORBELL_CONTROL1__DOORBELL_EN_MASK | - CP_MES_DOORBELL_CONTROL1__DOORBELL_HIT_MASK); - data |= mes->aggregated_doorbells[AMDGPU_MES_PRIORITY_LEVEL_LOW] << - CP_MES_DOORBELL_CONTROL1__DOORBELL_OFFSET__SHIFT; - data |= 1 << CP_MES_DOORBELL_CONTROL1__DOORBELL_EN__SHIFT; - WREG32_SOC15(GC, 0, regCP_MES_DOORBELL_CONTROL1, data); - - data = RREG32_SOC15(GC, 0, regCP_MES_DOORBELL_CONTROL2); - data &= ~(CP_MES_DOORBELL_CONTROL2__DOORBELL_OFFSET_MASK | - CP_MES_DOORBELL_CONTROL2__DOORBELL_EN_MASK | - CP_MES_DOORBELL_CONTROL2__DOORBELL_HIT_MASK); - data |= mes->aggregated_doorbells[AMDGPU_MES_PRIORITY_LEVEL_NORMAL] << - CP_MES_DOORBELL_CONTROL2__DOORBELL_OFFSET__SHIFT; - data |= 1 << CP_MES_DOORBELL_CONTROL2__DOORBELL_EN__SHIFT; - WREG32_SOC15(GC, 0, regCP_MES_DOORBELL_CONTROL2, data); - - data = RREG32_SOC15(GC, 0, regCP_MES_DOORBELL_CONTROL3); - data &= ~(CP_MES_DOORBELL_CONTROL3__DOORBELL_OFFSET_MASK | - CP_MES_DOORBELL_CONTROL3__DOORBELL_EN_MASK | - CP_MES_DOORBELL_CONTROL3__DOORBELL_HIT_MASK); - data |= mes->aggregated_doorbells[AMDGPU_MES_PRIORITY_LEVEL_MEDIUM] << - CP_MES_DOORBELL_CONTROL3__DOORBELL_OFFSET__SHIFT; - data |= 1 << CP_MES_DOORBELL_CONTROL3__DOORBELL_EN__SHIFT; - WREG32_SOC15(GC, 0, regCP_MES_DOORBELL_CONTROL3, data); - - data = RREG32_SOC15(GC, 0, regCP_MES_DOORBELL_CONTROL4); - data &= ~(CP_MES_DOORBELL_CONTROL4__DOORBELL_OFFSET_MASK | - CP_MES_DOORBELL_CONTROL4__DOORBELL_EN_MASK | - CP_MES_DOORBELL_CONTROL4__DOORBELL_HIT_MASK); - data |= mes->aggregated_doorbells[AMDGPU_MES_PRIORITY_LEVEL_HIGH] << - CP_MES_DOORBELL_CONTROL4__DOORBELL_OFFSET__SHIFT; - data |= 1 << CP_MES_DOORBELL_CONTROL4__DOORBELL_EN__SHIFT; - WREG32_SOC15(GC, 0, regCP_MES_DOORBELL_CONTROL4, data); - - data = RREG32_SOC15(GC, 0, regCP_MES_DOORBELL_CONTROL5); - data &= ~(CP_MES_DOORBELL_CONTROL5__DOORBELL_OFFSET_MASK | - CP_MES_DOORBELL_CONTROL5__DOORBELL_EN_MASK | - CP_MES_DOORBELL_CONTROL5__DOORBELL_HIT_MASK); - data |= mes->aggregated_doorbells[AMDGPU_MES_PRIORITY_LEVEL_REALTIME] << - CP_MES_DOORBELL_CONTROL5__DOORBELL_OFFSET__SHIFT; - data |= 1 << CP_MES_DOORBELL_CONTROL5__DOORBELL_EN__SHIFT; - WREG32_SOC15(GC, 0, regCP_MES_DOORBELL_CONTROL5, data); - - data = 1 << CP_HQD_GFX_CONTROL__DB_UPDATED_MSG_EN__SHIFT; - WREG32_SOC15(GC, 0, regCP_HQD_GFX_CONTROL, data); -} - static const struct amdgpu_mes_funcs mes_v11_0_funcs = { .add_hw_queue = mes_v11_0_add_hw_queue, .remove_hw_queue = mes_v11_0_remove_hw_queue, @@ -1243,8 +1189,6 @@ static int mes_v11_0_hw_init(void *handle) if (r) goto failure; - mes_v11_0_init_aggregated_doorbell(&adev->mes); - r = mes_v11_0_query_sched_status(&adev->mes); if (r) { DRM_ERROR("MES is busy\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c index 1f8122fd1ad7..7e4d5188cbfa 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c @@ -156,68 +156,35 @@ static uint64_t sdma_v6_0_ring_get_wptr(struct amdgpu_ring *ring) static void sdma_v6_0_ring_set_wptr(struct amdgpu_ring *ring) { struct amdgpu_device *adev = ring->adev; - uint32_t *wptr_saved; - uint32_t *is_queue_unmap; - uint64_t aggregated_db_index; - uint32_t mqd_size = adev->mqds[AMDGPU_HW_IP_DMA].mqd_size; - - DRM_DEBUG("Setting write pointer\n"); - - if (ring->is_mes_queue) { - wptr_saved = (uint32_t *)(ring->mqd_ptr + mqd_size); - is_queue_unmap = (uint32_t *)(ring->mqd_ptr + mqd_size + - sizeof(uint32_t)); - aggregated_db_index = - amdgpu_mes_get_aggregated_doorbell_index(adev, - ring->hw_prio); + if (ring->use_doorbell) { + DRM_DEBUG("Using doorbell -- " + "wptr_offs == 0x%08x " + "lower_32_bits(ring->wptr) << 2 == 0x%08x " + "upper_32_bits(ring->wptr) << 2 == 0x%08x\n", + ring->wptr_offs, + lower_32_bits(ring->wptr << 2), + upper_32_bits(ring->wptr << 2)); + /* XXX check if swapping is necessary on BE */ atomic64_set((atomic64_t *)ring->wptr_cpu_addr, ring->wptr << 2); - *wptr_saved = ring->wptr << 2; - if (*is_queue_unmap) { - WDOORBELL64(aggregated_db_index, ring->wptr << 2); - DRM_DEBUG("calling WDOORBELL64(0x%08x, 0x%016llx)\n", - ring->doorbell_index, ring->wptr << 2); - WDOORBELL64(ring->doorbell_index, ring->wptr << 2); - } else { - DRM_DEBUG("calling WDOORBELL64(0x%08x, 0x%016llx)\n", - ring->doorbell_index, ring->wptr << 2); - WDOORBELL64(ring->doorbell_index, ring->wptr << 2); - - if (*is_queue_unmap) - WDOORBELL64(aggregated_db_index, - ring->wptr << 2); - } + DRM_DEBUG("calling WDOORBELL64(0x%08x, 0x%016llx)\n", + ring->doorbell_index, ring->wptr << 2); + WDOORBELL64(ring->doorbell_index, ring->wptr << 2); } else { - if (ring->use_doorbell) { - DRM_DEBUG("Using doorbell -- " - "wptr_offs == 0x%08x " - "lower_32_bits(ring->wptr) << 2 == 0x%08x " - "upper_32_bits(ring->wptr) << 2 == 0x%08x\n", - ring->wptr_offs, - lower_32_bits(ring->wptr << 2), - upper_32_bits(ring->wptr << 2)); - /* XXX check if swapping is necessary on BE */ - atomic64_set((atomic64_t *)ring->wptr_cpu_addr, - ring->wptr << 2); - DRM_DEBUG("calling WDOORBELL64(0x%08x, 0x%016llx)\n", - ring->doorbell_index, ring->wptr << 2); - WDOORBELL64(ring->doorbell_index, ring->wptr << 2); - } else { - DRM_DEBUG("Not using doorbell -- " - "regSDMA%i_GFX_RB_WPTR == 0x%08x " - "regSDMA%i_GFX_RB_WPTR_HI == 0x%08x\n", - ring->me, - lower_32_bits(ring->wptr << 2), - ring->me, - upper_32_bits(ring->wptr << 2)); - WREG32_SOC15_IP(GC, sdma_v6_0_get_reg_offset(adev, - ring->me, regSDMA0_QUEUE0_RB_WPTR), - lower_32_bits(ring->wptr << 2)); - WREG32_SOC15_IP(GC, sdma_v6_0_get_reg_offset(adev, - ring->me, regSDMA0_QUEUE0_RB_WPTR_HI), - upper_32_bits(ring->wptr << 2)); - } + DRM_DEBUG("Not using doorbell -- " + "regSDMA%i_GFX_RB_WPTR == 0x%08x " + "regSDMA%i_GFX_RB_WPTR_HI == 0x%08x\n", + ring->me, + lower_32_bits(ring->wptr << 2), + ring->me, + upper_32_bits(ring->wptr << 2)); + WREG32_SOC15_IP(GC, sdma_v6_0_get_reg_offset(adev, + ring->me, regSDMA0_QUEUE0_RB_WPTR), + lower_32_bits(ring->wptr << 2)); + WREG32_SOC15_IP(GC, sdma_v6_0_get_reg_offset(adev, + ring->me, regSDMA0_QUEUE0_RB_WPTR_HI), + upper_32_bits(ring->wptr << 2)); } } -- cgit From d8c1925ba8cde2863297728a4c8fbf8fe766757a Mon Sep 17 00:00:00 2001 From: Asad Kamal Date: Mon, 16 Oct 2023 22:10:34 +0800 Subject: drm/amdgpu: update retry times for psp BL wait Increase retry time for PSP BL wait, to compensate for longer time to set c2pmsg 35 ready bit during mode1 with RAS Signed-off-by: Asad Kamal Reviewed-by: Hawking Zhang Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c index 573046702861..4142e2fcd866 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c @@ -168,7 +168,7 @@ static int psp_v13_0_wait_for_bootloader(struct psp_context *psp) * If there is an error in processing command, bits[7:0] will be set. * This is applicable for PSP v13.0.6 and newer. */ - for (retry_loop = 0; retry_loop < 10; retry_loop++) { + for (retry_loop = 0; retry_loop < PSP_VMBX_POLLING_LIMIT; retry_loop++) { ret = psp_wait_for( psp, SOC15_REG_OFFSET(MP0, 0, regMP0_SMN_C2PMSG_35), 0x80000000, 0xffffffff, false); -- cgit From d757dfd667aad54c6ed0b6f22a11ad5a317663de Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Fri, 13 Oct 2023 14:26:02 -0500 Subject: drm/amd: Move microcode init step to early_init() The intention for early init is to find any missing microcode early and fail the driver load if it's missing. Move this step to earlier in driver init to match other IP blocks. Reviewed-by: Alex Deucher Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index 91c07ab4f14e..f994508a36ea 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -589,6 +589,14 @@ static int gfx_v11_0_init_microcode(struct amdgpu_device *adev) adev->gfx.mec2_fw = NULL; gfx_v11_0_check_fw_cp_gfx_shadow(adev); + + if (adev->gfx.imu.funcs && adev->gfx.imu.funcs->init_microcode) { + err = adev->gfx.imu.funcs->init_microcode(adev); + if (err) + DRM_ERROR("Failed to init imu firmware!\n"); + return err; + } + out: if (err) { amdgpu_ucode_release(&adev->gfx.pfp_fw); @@ -1395,14 +1403,6 @@ static int gfx_v11_0_sw_init(void *handle) adev->gfx.gfx_current_status = AMDGPU_GFX_NORMAL_MODE; - if (adev->gfx.imu.funcs) { - if (adev->gfx.imu.funcs->init_microcode) { - r = adev->gfx.imu.funcs->init_microcode(adev); - if (r) - DRM_ERROR("Failed to load imu firmware!\n"); - } - } - gfx_v11_0_me_init(adev); r = gfx_v11_0_rlc_init(adev); -- cgit From 4916615fe96fb530517b0d46702c750c20a5601c Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Fri, 13 Oct 2023 14:26:03 -0500 Subject: drm/amd: Don't parse IMU ucode version if it won't be loaded When the IMU ucode is loaded by the PSP parsing the version that comes from Linux will vary. Rather than showing the wrong data to kernel interface consumers, avoid populating it in this case. Reviewed-by: Alex Deucher Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/imu_v11_0.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/imu_v11_0.c b/drivers/gpu/drm/amd/amdgpu/imu_v11_0.c index 875fb5ac70b5..c0bdab3bf0e4 100644 --- a/drivers/gpu/drm/amd/amdgpu/imu_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/imu_v11_0.c @@ -55,7 +55,6 @@ static int imu_v11_0_init_microcode(struct amdgpu_device *adev) if (err) goto out; imu_hdr = (const struct imu_firmware_header_v1_0 *)adev->gfx.imu_fw->data; - adev->gfx.imu_fw_version = le32_to_cpu(imu_hdr->header.ucode_version); //adev->gfx.imu_feature_version = le32_to_cpu(imu_hdr->ucode_feature_version); if (adev->firmware.load_type == AMDGPU_FW_LOAD_PSP) { @@ -69,7 +68,8 @@ static int imu_v11_0_init_microcode(struct amdgpu_device *adev) info->fw = adev->gfx.imu_fw; adev->firmware.fw_size += ALIGN(le32_to_cpu(imu_hdr->imu_dram_ucode_size_bytes), PAGE_SIZE); - } + } else + adev->gfx.imu_fw_version = le32_to_cpu(imu_hdr->header.ucode_version); out: if (err) { -- cgit From e56690bb37eb202cfc31deb6b794dc8fca9b9a89 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Fri, 13 Oct 2023 14:26:04 -0500 Subject: drm/amd: Read IMU FW version from scratch register during hw_init If the IMU version wasn't discovered from the header, such as when the firmware was directly loaded by PSP then there is no firmware version to show to userspace from sysfs or IOCTL. The IMU F/W stores the version in the first scratch register though, so fetch it in these cases to let the driver export. Reviewed-by: Alex Deucher Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index f994508a36ea..fd22943685f7 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -4373,6 +4373,10 @@ static int gfx_v11_0_hw_init(void *handle) if (r) return r; + /* get IMU version from HW if it's not set */ + if (!adev->gfx.imu_fw_version) + adev->gfx.imu_fw_version = RREG32_SOC15(GC, 0, regGFX_IMU_SCRATCH_0); + return r; } -- cgit From b1338a8e71acaf68892b390dee0271fe7323b64d Mon Sep 17 00:00:00 2001 From: "Stanley.Yang" Date: Tue, 17 Oct 2023 21:49:09 +0800 Subject: drm/amdgpu: Workaround to skip kiq ring test during ras gpu recovery This is workaround, kiq ring test failed in suspend stage when do ras recovery. Signed-off-by: Stanley.Yang Reviewed-by: Tao Zhou Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 9a158018ae16..c92e0aba69e1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -29,6 +29,7 @@ #include "amdgpu_rlc.h" #include "amdgpu_ras.h" #include "amdgpu_xcp.h" +#include "amdgpu_xgmi.h" /* delay 0.1 second to enable gfx off feature */ #define GFX_OFF_DELAY_ENABLE msecs_to_jiffies(100) @@ -501,6 +502,9 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev, int xcc_id) { struct amdgpu_kiq *kiq = &adev->gfx.kiq[xcc_id]; struct amdgpu_ring *kiq_ring = &kiq->ring; + struct amdgpu_hive_info *hive; + struct amdgpu_ras *ras; + int hive_ras_recovery = 0; int i, r = 0; int j; @@ -521,6 +525,23 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev, int xcc_id) RESET_QUEUES, 0, 0); } + /** + * This is workaround: only skip kiq_ring test + * during ras recovery in suspend stage for gfx9.4.3 + */ + hive = amdgpu_get_xgmi_hive(adev); + if (hive) { + hive_ras_recovery = atomic_read(&hive->ras_recovery); + amdgpu_put_xgmi_hive(hive); + } + + ras = amdgpu_ras_get_context(adev); + if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3)) && + ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery)) { + spin_unlock(&kiq->ring_lock); + return 0; + } + if (kiq_ring->sched.ready && !adev->job_hang) r = amdgpu_ring_test_helper(kiq_ring); spin_unlock(&kiq->ring_lock); -- cgit From e6f8588733342c61948fde673a862b53c0d972bc Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Tue, 17 Oct 2023 16:51:03 -0400 Subject: drm/amdgpu: Fix possible null pointer dereference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit abo->tbo.resource may be NULL in amdgpu_vm_bo_update. Fixes: 180253782038 ("drm/ttm: stop allocating dummy resources during BO creation") Signed-off-by: Felix Kuehling Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index afc19341334f..f3c9f93d8899 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -1090,7 +1090,8 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va, struct drm_gem_object *gobj = dma_buf->priv; struct amdgpu_bo *abo = gem_to_amdgpu_bo(gobj); - if (abo->tbo.resource->mem_type == TTM_PL_VRAM) + if (abo->tbo.resource && + abo->tbo.resource->mem_type == TTM_PL_VRAM) bo = gem_to_amdgpu_bo(gobj); } mem = bo->tbo.resource; -- cgit From 207430b76a48b0b245bab08efe346148a5558df7 Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Mon, 17 Jul 2023 15:28:52 -0400 Subject: drm/amdgpu: Reserve fences for VM update MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In amdgpu_dma_buf_move_notify reserve fences for the page table updates in amdgpu_vm_clear_freed and amdgpu_vm_handle_moved. This fixes a BUG_ON in dma_resv_add_fence when using SDMA for page table updates. Signed-off-by: Felix Kuehling Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c index 76b618735dc0..b5e28fa3f414 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c @@ -404,7 +404,10 @@ amdgpu_dma_buf_move_notify(struct dma_buf_attachment *attach) continue; } - r = amdgpu_vm_clear_freed(adev, vm, NULL); + /* Reserve fences for two SDMA page table updates */ + r = dma_resv_reserve_fences(resv, 2); + if (!r) + r = amdgpu_vm_clear_freed(adev, vm, NULL); if (!r) r = amdgpu_vm_handle_moved(adev, vm); -- cgit From afcf949cf331de791e3fbfc65c0bb82dd9df6d57 Mon Sep 17 00:00:00 2001 From: Candice Li Date: Wed, 18 Oct 2023 15:16:39 +0800 Subject: drm/amdgpu: Log UE corrected by replay as correctable error Support replay mode where UE could be converted to CE. Signed-off-by: Candice Li Reviewed-by: Tao Zhou Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c index b664ee3ee92d..025e6aeb058d 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c @@ -105,7 +105,9 @@ static void umc_v12_0_query_correctable_error_count(struct amdgpu_device *adev, RREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4); if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 && - REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1) + (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1 || + (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 && + REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 0))) *error_count += 1; } @@ -125,7 +127,6 @@ static void umc_v12_0_query_uncorrectable_error_count(struct amdgpu_device *adev if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) && (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 || - REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 || REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 || REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 || REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1)) @@ -293,7 +294,7 @@ static int umc_v12_0_query_error_address(struct amdgpu_device *adev, /* calculate error address if ue error is detected */ if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 && REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, AddrV) == 1 && - REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1) { + REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1) { mc_umc_addrt0 = SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0); -- cgit From 472c5fb29798695b589fb844f84c6bf4ff07c592 Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Wed, 11 Oct 2023 18:36:15 +0800 Subject: drm/amdgpu: define ras_reset_error_count function Make the code architecture more simple. v2: reuse ras_reset_error_count in ras_reset_error_status. Signed-off-by: Tao Zhou Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 19 +++++++++++++++---- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 ++ 2 files changed, 17 insertions(+), 4 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index c7f8dcb3b4d2..a4384f0eb8bf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1170,23 +1170,34 @@ out_fini_err_data: return ret; } -int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, +int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block block) { struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0); if (!block_obj || !block_obj->hw_ops) { dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", - ras_block_str(block)); - return 0; + ras_block_str(block)); + return -EOPNOTSUPP; } if (!amdgpu_ras_is_supported(adev, block)) - return 0; + return -EOPNOTSUPP; if (block_obj->hw_ops->reset_ras_error_count) block_obj->hw_ops->reset_ras_error_count(adev); + return 0; +} + +int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, + enum amdgpu_ras_block block) +{ + struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0); + + if (amdgpu_ras_reset_error_count(adev, block) == -EOPNOTSUPP) + return 0; + if ((block == AMDGPU_RAS_BLOCK__GFX) || (block == AMDGPU_RAS_BLOCK__MMHUB)) { if (block_obj->hw_ops->reset_ras_error_status) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 0a5c8a107fb2..3f9ac0ab67e6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -714,6 +714,8 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev); int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_if *info); +int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, + enum amdgpu_ras_block block); int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, enum amdgpu_ras_block block); -- cgit From 9248462d7e0862883df6741ec0e1bb41c3698b22 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Thu, 28 Sep 2023 17:00:31 +0800 Subject: drm/amdgpu: Enable software RAS in vcn v4_0_3 Set VCN/JPEG RAS masks to enable software RAS for VCN and JPEG. Signed-off-by: Hawking Zhang Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index a4384f0eb8bf..0df15d856085 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2622,7 +2622,9 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev) if (amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(2, 6, 0) || amdgpu_ip_version(adev, VCN_HWIP, 0) == - IP_VERSION(4, 0, 0)) + IP_VERSION(4, 0, 0) || + amdgpu_ip_version(adev, VCN_HWIP, 0) == + IP_VERSION(4, 0, 3)) adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN | 1 << AMDGPU_RAS_BLOCK__JPEG); else -- cgit From f2176d70638aaa1fa2a1c3068f0acedcb271a8aa Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Thu, 28 Sep 2023 17:05:59 +0800 Subject: drm/amdgpu: Add UVD_VCPU_INT_EN2 to dpg sram Add RAS sepcifc programming to dpg sram. Signed-off-by: Hawking Zhang Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c index f85d18cd74ec..810bbfccd6f2 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c @@ -1760,6 +1760,11 @@ static void vcn_v4_0_3_enable_ras(struct amdgpu_device *adev, SOC15_DPG_MODE_OFFSET(VCN, 0, regVCN_RAS_CNTL), tmp, 0, indirect); + tmp = UVD_VCPU_INT_EN2__RASCNTL_VCPU_VCODEC_EN_MASK; + WREG32_SOC15_DPG_MODE(inst_idx, + SOC15_DPG_MODE_OFFSET(VCN, 0, regUVD_VCPU_INT_EN2), + tmp, 0, indirect); + tmp = UVD_SYS_INT_EN__RASCNTL_VCPU_VCODEC_EN_MASK; WREG32_SOC15_DPG_MODE(inst_idx, SOC15_DPG_MODE_OFFSET(VCN, 0, regUVD_SYS_INT_EN), -- cgit From 8a65661114941788a2093193c251e44cf1d6439c Mon Sep 17 00:00:00 2001 From: "Stanley.Yang" Date: Thu, 19 Oct 2023 14:55:57 +0800 Subject: drm/amdgpu: Fix delete nodes that have been relesed Fix delete nodes that it has been freed. Fixes: 5b1270beb380 ("drm/amdgpu: add ras_err_info to identify RAS error source") Signed-off-by: Stanley.Yang Reviewed-by: Yang Wang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 0df15d856085..b3c32c667e09 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3503,10 +3503,8 @@ void amdgpu_ras_error_data_fini(struct ras_err_data *err_data) { struct ras_err_node *err_node, *tmp; - list_for_each_entry_safe(err_node, tmp, &err_data->err_node_list, node) { + list_for_each_entry_safe(err_node, tmp, &err_data->err_node_list, node) amdgpu_ras_error_node_release(err_node); - list_del(&err_node->node); - } } static struct ras_err_node *amdgpu_ras_error_find_node_by_id(struct ras_err_data *err_data, -- cgit From 97b2821643f776c94ebcea79052f77e732d56f6d Mon Sep 17 00:00:00 2001 From: Bokun Zhang Date: Mon, 16 Oct 2023 12:48:02 -0400 Subject: drm/amd/amdgpu/vcn: Add RB decouple feature under SRIOV - P1 - Update SRIOV header with RB decouple flag Signed-off-by: Bokun Zhang Reviewed-by: Leo Liu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h index 104a5ad8397d..51a14f6d93bd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h @@ -90,10 +90,11 @@ union amd_sriov_msg_feature_flags { uint32_t host_load_ucodes : 1; uint32_t host_flr_vramlost : 1; uint32_t mm_bw_management : 1; - uint32_t pp_one_vf_mode : 1; + uint32_t pp_one_vf_mode : 1; uint32_t reg_indirect_acc : 1; uint32_t av1_support : 1; - uint32_t reserved : 25; + uint32_t vcn_rb_decouple : 1; + uint32_t reserved : 24; } flags; uint32_t all; }; -- cgit From fc3136730ba3e606b1c892e041f0b8356bda5457 Mon Sep 17 00:00:00 2001 From: Bokun Zhang Date: Mon, 16 Oct 2023 12:49:11 -0400 Subject: drm/amd/amdgpu/vcn: Add RB decouple feature under SRIOV - P2 - Add function to check if RB decouple is enabled under SRIOV Signed-off-by: Bokun Zhang Reviewed-by: Leo Liu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h index fabb83e9d9ae..858ef21ae515 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h @@ -126,6 +126,8 @@ enum AMDGIM_FEATURE_FLAG { AMDGIM_FEATURE_INDIRECT_REG_ACCESS = (1 << 5), /* AV1 Support MODE*/ AMDGIM_FEATURE_AV1_SUPPORT = (1 << 6), + /* VCN RB decouple */ + AMDGIM_FEATURE_VCN_RB_DECOUPLE = (1 << 7), }; enum AMDGIM_REG_ACCESS_FLAG { @@ -326,6 +328,8 @@ static inline bool is_virtual_machine(void) ((!amdgpu_in_reset(adev)) && (!adev->virt.tdr_debug)) #define amdgpu_sriov_is_av1_support(adev) \ ((adev)->virt.gim_feature & AMDGIM_FEATURE_AV1_SUPPORT) +#define amdgpu_sriov_is_vcn_rb_decouple(adev) \ + ((adev)->virt.gim_feature & AMDGIM_FEATURE_VCN_RB_DECOUPLE) bool amdgpu_virt_mmio_blocked(struct amdgpu_device *adev); void amdgpu_virt_init_setting(struct amdgpu_device *adev); void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev, -- cgit From eb9d6256b9b072b29193a3a051b2f7e76e0fd0de Mon Sep 17 00:00:00 2001 From: Bokun Zhang Date: Mon, 16 Oct 2023 12:49:58 -0400 Subject: drm/amd/amdgpu/vcn: Add RB decouple feature under SRIOV - P3 - Update VCN header for RB decouple feature - Add metadata struct, metadata will be placed after each RB Signed-off-by: Bokun Zhang Reviewed-by: Leo Liu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h | 48 ++++++++++++++++++++++++++++----- 1 file changed, 41 insertions(+), 7 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h index 0815c5a97564..514c98ea144f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h @@ -169,6 +169,9 @@ #define AMDGPU_VCN_SMU_VERSION_INFO_FLAG (1 << 11) #define AMDGPU_VCN_SMU_DPM_INTERFACE_FLAG (1 << 11) #define AMDGPU_VCN_VF_RB_SETUP_FLAG (1 << 14) +#define AMDGPU_VCN_VF_RB_DECOUPLE_FLAG (1 << 15) + +#define MAX_NUM_VCN_RB_SETUP 4 #define AMDGPU_VCN_IB_FLAG_DECODE_BUFFER 0x00000001 #define AMDGPU_VCN_CMD_FLAG_MSG_BUFFER 0x00000001 @@ -335,15 +338,30 @@ struct amdgpu_fw_shared { struct amdgpu_fw_shared_smu_interface_info smu_interface_info; }; +struct amdgpu_vcn_rb_setup_info { + uint32_t rb_addr_lo; + uint32_t rb_addr_hi; + uint32_t rb_size; +}; + struct amdgpu_fw_shared_rb_setup { uint32_t is_rb_enabled_flags; - uint32_t rb_addr_lo; - uint32_t rb_addr_hi; - uint32_t rb_size; - uint32_t rb4_addr_lo; - uint32_t rb4_addr_hi; - uint32_t rb4_size; - uint32_t reserved[6]; + + union { + struct { + uint32_t rb_addr_lo; + uint32_t rb_addr_hi; + uint32_t rb_size; + uint32_t rb4_addr_lo; + uint32_t rb4_addr_hi; + uint32_t rb4_size; + uint32_t reserved[6]; + }; + + struct { + struct amdgpu_vcn_rb_setup_info rb_info[MAX_NUM_VCN_RB_SETUP]; + }; + }; }; struct amdgpu_fw_shared_drm_key_wa { @@ -351,6 +369,11 @@ struct amdgpu_fw_shared_drm_key_wa { uint8_t reserved[3]; }; +struct amdgpu_fw_shared_queue_decouple { + uint8_t is_enabled; + uint8_t reserved[7]; +}; + struct amdgpu_vcn4_fw_shared { uint32_t present_flag_0; uint8_t pad[12]; @@ -361,6 +384,8 @@ struct amdgpu_vcn4_fw_shared { struct amdgpu_fw_shared_rb_setup rb_setup; struct amdgpu_fw_shared_smu_interface_info smu_dpm_interface; struct amdgpu_fw_shared_drm_key_wa drm_key_wa; + uint8_t pad3[9]; + struct amdgpu_fw_shared_queue_decouple decouple; }; struct amdgpu_vcn_fwlog { @@ -378,6 +403,15 @@ struct amdgpu_vcn_decode_buffer { uint32_t pad[30]; }; +struct amdgpu_vcn_rb_metadata { + uint32_t size; + uint32_t present_flag_0; + + uint8_t version; + uint8_t ring_id; + uint8_t pad[26]; +}; + #define VCN_BLOCK_ENCODE_DISABLE_MASK 0x80 #define VCN_BLOCK_DECODE_DISABLE_MASK 0x40 #define VCN_BLOCK_QUEUE_DISABLE_MASK 0xC0 -- cgit From 017634a68dab9c2ebdcd51b495ef6e53b95280cd Mon Sep 17 00:00:00 2001 From: Bokun Zhang Date: Mon, 16 Oct 2023 12:51:00 -0400 Subject: drm/amd/amdgpu/vcn: Add RB decouple feature under SRIOV - P4 - In VCN 4 SRIOV code path, add code to enable RB decouple feature Signed-off-by: Bokun Zhang Reviewed-by: Leo Liu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c | 71 +++++++++++++++++++++++++++-------- 1 file changed, 55 insertions(+), 16 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c index 7ab9263d3e19..48bfcd0d558b 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c @@ -176,9 +176,6 @@ static int vcn_v4_0_sw_init(void *handle) AMDGPU_DRM_KEY_INJECT_WORKAROUND_VCNFW_ASD_HANDSHAKING; } - if (amdgpu_sriov_vf(adev)) - fw_shared->present_flag_0 |= cpu_to_le32(AMDGPU_VCN_VF_RB_SETUP_FLAG); - if (amdgpu_vcnfw_log) amdgpu_vcn_fwlog_init(&adev->vcn.inst[i]); } @@ -1209,6 +1206,24 @@ static int vcn_v4_0_start(struct amdgpu_device *adev) return 0; } +static int vcn_v4_0_init_ring_metadata(struct amdgpu_device *adev, uint32_t vcn_inst, struct amdgpu_ring *ring_enc) +{ + struct amdgpu_vcn_rb_metadata *rb_metadata = NULL; + uint8_t *rb_ptr = (uint8_t *)ring_enc->ring; + + rb_ptr += ring_enc->ring_size; + rb_metadata = (struct amdgpu_vcn_rb_metadata *)rb_ptr; + + memset(rb_metadata, 0, sizeof(struct amdgpu_vcn_rb_metadata)); + rb_metadata->size = sizeof(struct amdgpu_vcn_rb_metadata); + rb_metadata->present_flag_0 |= cpu_to_le32(AMDGPU_VCN_VF_RB_SETUP_FLAG); + rb_metadata->present_flag_0 |= cpu_to_le32(AMDGPU_VCN_VF_RB_DECOUPLE_FLAG); + rb_metadata->version = 1; + rb_metadata->ring_id = vcn_inst & 0xFF; + + return 0; +} + static int vcn_v4_0_start_sriov(struct amdgpu_device *adev) { int i; @@ -1331,11 +1346,30 @@ static int vcn_v4_0_start_sriov(struct amdgpu_device *adev) rb_enc_addr = ring_enc->gpu_addr; rb_setup->is_rb_enabled_flags |= RB_ENABLED; - rb_setup->rb_addr_lo = lower_32_bits(rb_enc_addr); - rb_setup->rb_addr_hi = upper_32_bits(rb_enc_addr); - rb_setup->rb_size = ring_enc->ring_size / 4; fw_shared->present_flag_0 |= cpu_to_le32(AMDGPU_VCN_VF_RB_SETUP_FLAG); + if (amdgpu_sriov_is_vcn_rb_decouple(adev)) { + vcn_v4_0_init_ring_metadata(adev, i, ring_enc); + + memset((void *)&rb_setup->rb_info, 0, sizeof(struct amdgpu_vcn_rb_setup_info) * MAX_NUM_VCN_RB_SETUP); + if (!(adev->vcn.harvest_config & (1 << 0))) { + rb_setup->rb_info[0].rb_addr_lo = lower_32_bits(adev->vcn.inst[0].ring_enc[0].gpu_addr); + rb_setup->rb_info[0].rb_addr_hi = upper_32_bits(adev->vcn.inst[0].ring_enc[0].gpu_addr); + rb_setup->rb_info[0].rb_size = adev->vcn.inst[0].ring_enc[0].ring_size / 4; + } + if (!(adev->vcn.harvest_config & (1 << 1))) { + rb_setup->rb_info[2].rb_addr_lo = lower_32_bits(adev->vcn.inst[1].ring_enc[0].gpu_addr); + rb_setup->rb_info[2].rb_addr_hi = upper_32_bits(adev->vcn.inst[1].ring_enc[0].gpu_addr); + rb_setup->rb_info[2].rb_size = adev->vcn.inst[1].ring_enc[0].ring_size / 4; + } + fw_shared->decouple.is_enabled = 1; + fw_shared->present_flag_0 |= cpu_to_le32(AMDGPU_VCN_VF_RB_DECOUPLE_FLAG); + } else { + rb_setup->rb_addr_lo = lower_32_bits(rb_enc_addr); + rb_setup->rb_addr_hi = upper_32_bits(rb_enc_addr); + rb_setup->rb_size = ring_enc->ring_size / 4; + } + MMSCH_V4_0_INSERT_DIRECT_WT(SOC15_REG_OFFSET(VCN, i, regUVD_LMI_VCPU_NC0_64BIT_BAR_LOW), lower_32_bits(adev->vcn.inst[i].fw_shared.gpu_addr)); @@ -1807,6 +1841,7 @@ static struct amdgpu_ring_funcs vcn_v4_0_unified_ring_vm_funcs = { .type = AMDGPU_RING_TYPE_VCN_ENC, .align_mask = 0x3f, .nop = VCN_ENC_CMD_NO_OP, + .extra_dw = sizeof(struct amdgpu_vcn_rb_metadata), .get_rptr = vcn_v4_0_unified_ring_get_rptr, .get_wptr = vcn_v4_0_unified_ring_get_wptr, .set_wptr = vcn_v4_0_unified_ring_set_wptr, @@ -2020,16 +2055,20 @@ static int vcn_v4_0_process_interrupt(struct amdgpu_device *adev, struct amdgpu_ { uint32_t ip_instance; - switch (entry->client_id) { - case SOC15_IH_CLIENTID_VCN: - ip_instance = 0; - break; - case SOC15_IH_CLIENTID_VCN1: - ip_instance = 1; - break; - default: - DRM_ERROR("Unhandled client id: %d\n", entry->client_id); - return 0; + if (amdgpu_sriov_is_vcn_rb_decouple(adev)) { + ip_instance = entry->ring_id; + } else { + switch (entry->client_id) { + case SOC15_IH_CLIENTID_VCN: + ip_instance = 0; + break; + case SOC15_IH_CLIENTID_VCN1: + ip_instance = 1; + break; + default: + DRM_ERROR("Unhandled client id: %d\n", entry->client_id); + return 0; + } } DRM_DEBUG("IH: VCN TRAP\n"); -- cgit From 49c260bef3ac9fc1bb73acf98036dac64712536d Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Thu, 19 Oct 2023 14:59:19 +0800 Subject: drm/amdgpu: fix typo for amdgpu ras error data print typo fix. Fixes: 5b1270beb380 ("drm/amdgpu: add ras_err_info to identify RAS error source") Signed-off-by: Yang Wang Reviewed-by: Candice Li Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index b3c32c667e09..70dd249f2ba7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1038,7 +1038,7 @@ static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev, ras_mgr->err_data.ue_count, blk_name); else dev_info(adev->dev, "%ld correctable hardware errors detected in %s block\n", - ras_mgr->err_data.ue_count, blk_name); + ras_mgr->err_data.ce_count, blk_name); for_each_ras_error(err_node, err_data) { err_info = &err_node->err_info; @@ -1055,7 +1055,7 @@ static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev, "%lld correctable hardware errors detected in %s block\n", mcm_info->socket_id, mcm_info->die_id, - err_info->ue_count, + err_info->ce_count, blk_name); } } -- cgit From 66d64e4e03ef5ecf330075a5f1fc449549ce374a Mon Sep 17 00:00:00 2001 From: "Stanley.Yang" Date: Thu, 19 Oct 2023 22:49:32 +0800 Subject: drm/amdgpu: Enable RAS feature by default for APU Enable RAS feature by default for aqua vanjaram on apu platform. Signed-off-by: Stanley.Yang Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 70dd249f2ba7..eb81efacefe2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2653,18 +2653,8 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev) /* hw_supported needs to be aligned with RAS block mask. */ adev->ras_hw_enabled &= AMDGPU_RAS_BLOCK_MASK; - - /* - * Disable ras feature for aqua vanjaram - * by default on apu platform. - */ - if (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6) && - adev->gmc.is_app_apu) - adev->ras_enabled = amdgpu_ras_enable != 1 ? 0 : - adev->ras_hw_enabled & amdgpu_ras_mask; - else - adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 : - adev->ras_hw_enabled & amdgpu_ras_mask; + adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 : + adev->ras_hw_enabled & amdgpu_ras_mask; } static void amdgpu_ras_counte_dw(struct work_struct *work) -- cgit From fa9dd7a285efbcf81dc0fc5a75bd9341e017c80f Mon Sep 17 00:00:00 2001 From: Li Ma Date: Mon, 16 Oct 2023 14:36:26 +0800 Subject: drm/amdgpu: fix missing stuff in NBIO v7.11 add get_clockgating_state, update_medium_grain_light_sleep and update_medium_grain_clock_gating in nbio_v7_11_funcs v1: add missing funcs in nbio_v7_11.c v2: modify the if condition and add spport for nbio v7.11 clockgating. Signed-off-by: Li Ma Reviewed-by: Yifan Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/nbio_v7_11.c | 78 +++++++++++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/soc21.c | 1 + 2 files changed, 79 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_11.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_11.c index 3a94f249929e..676ab1d20d2f 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_11.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_11.c @@ -272,6 +272,81 @@ static void nbio_v7_11_init_registers(struct amdgpu_device *adev) */ } +static void nbio_v7_11_update_medium_grain_clock_gating(struct amdgpu_device *adev, + bool enable) +{ + uint32_t def, data; + + if (!(adev->cg_flags & AMD_CG_SUPPORT_BIF_MGCG)) + return; + + def = data = RREG32_SOC15(NBIO, 0, regBIF_BIF256_CI256_RC3X4_USB4_CPM_CONTROL); + if (enable) { + data |= (BIF_BIF256_CI256_RC3X4_USB4_CPM_CONTROL__LCLK_DYN_GATE_ENABLE_MASK | + BIF_BIF256_CI256_RC3X4_USB4_CPM_CONTROL__TXCLK_DYN_GATE_ENABLE_MASK | + BIF_BIF256_CI256_RC3X4_USB4_CPM_CONTROL__TXCLK_LCNT_GATE_ENABLE_MASK | + BIF_BIF256_CI256_RC3X4_USB4_CPM_CONTROL__TXCLK_REGS_GATE_ENABLE_MASK | + BIF_BIF256_CI256_RC3X4_USB4_CPM_CONTROL__TXCLK_PRBS_GATE_ENABLE_MASK | + BIF_BIF256_CI256_RC3X4_USB4_CPM_CONTROL__REFCLK_REGS_GATE_ENABLE_MASK); + } else { + data &= ~(BIF_BIF256_CI256_RC3X4_USB4_CPM_CONTROL__LCLK_DYN_GATE_ENABLE_MASK | + BIF_BIF256_CI256_RC3X4_USB4_CPM_CONTROL__TXCLK_DYN_GATE_ENABLE_MASK | + BIF_BIF256_CI256_RC3X4_USB4_CPM_CONTROL__TXCLK_LCNT_GATE_ENABLE_MASK | + BIF_BIF256_CI256_RC3X4_USB4_CPM_CONTROL__TXCLK_REGS_GATE_ENABLE_MASK | + BIF_BIF256_CI256_RC3X4_USB4_CPM_CONTROL__TXCLK_PRBS_GATE_ENABLE_MASK | + BIF_BIF256_CI256_RC3X4_USB4_CPM_CONTROL__REFCLK_REGS_GATE_ENABLE_MASK); + } + + if (def != data) + WREG32_SOC15(NBIO, 0, regBIF_BIF256_CI256_RC3X4_USB4_CPM_CONTROL, data); +} + +static void nbio_v7_11_update_medium_grain_light_sleep(struct amdgpu_device *adev, + bool enable) +{ + uint32_t def, data; + + if (!(adev->cg_flags & AMD_CG_SUPPORT_BIF_LS)) + return; + + def = data = RREG32_SOC15(NBIO, 0, regBIF_BIF256_CI256_RC3X4_USB4_PCIE_CNTL2); + if (enable) + data |= BIF_BIF256_CI256_RC3X4_USB4_PCIE_CNTL2__SLV_MEM_LS_EN_MASK; + else + data &= ~BIF_BIF256_CI256_RC3X4_USB4_PCIE_CNTL2__SLV_MEM_LS_EN_MASK; + + if (def != data) + WREG32_SOC15(NBIO, 0, regBIF_BIF256_CI256_RC3X4_USB4_PCIE_CNTL2, data); + + def = data = RREG32_SOC15(NBIO, 0, regBIF_BIF256_CI256_RC3X4_USB4_PCIE_TX_POWER_CTRL_1); + if (enable) { + data |= (BIF_BIF256_CI256_RC3X4_USB4_PCIE_TX_POWER_CTRL_1__MST_MEM_LS_EN_MASK | + BIF_BIF256_CI256_RC3X4_USB4_PCIE_TX_POWER_CTRL_1__REPLAY_MEM_LS_EN_MASK); + } else { + data &= ~(BIF_BIF256_CI256_RC3X4_USB4_PCIE_TX_POWER_CTRL_1__MST_MEM_LS_EN_MASK | + BIF_BIF256_CI256_RC3X4_USB4_PCIE_TX_POWER_CTRL_1__REPLAY_MEM_LS_EN_MASK); + } + + if (def != data) + WREG32_SOC15(NBIO, 0, regBIF_BIF256_CI256_RC3X4_USB4_PCIE_TX_POWER_CTRL_1, data); +} + +static void nbio_v7_11_get_clockgating_state(struct amdgpu_device *adev, + u64 *flags) +{ + uint32_t data; + + /* AMD_CG_SUPPORT_BIF_MGCG */ + data = RREG32_SOC15(NBIO, 0, regBIF_BIF256_CI256_RC3X4_USB4_CPM_CONTROL); + if (data & BIF_BIF256_CI256_RC3X4_USB4_CPM_CONTROL__LCLK_DYN_GATE_ENABLE_MASK) + *flags |= AMD_CG_SUPPORT_BIF_MGCG; + + /* AMD_CG_SUPPORT_BIF_LS */ + data = RREG32_SOC15(NBIO, 0, regBIF_BIF256_CI256_RC3X4_USB4_PCIE_CNTL2); + if (data & BIF_BIF256_CI256_RC3X4_USB4_PCIE_CNTL2__SLV_MEM_LS_EN_MASK) + *flags |= AMD_CG_SUPPORT_BIF_LS; +} + const struct amdgpu_nbio_funcs nbio_v7_11_funcs = { .get_hdp_flush_req_offset = nbio_v7_11_get_hdp_flush_req_offset, .get_hdp_flush_done_offset = nbio_v7_11_get_hdp_flush_done_offset, @@ -288,6 +363,9 @@ const struct amdgpu_nbio_funcs nbio_v7_11_funcs = { .enable_doorbell_aperture = nbio_v7_11_enable_doorbell_aperture, .enable_doorbell_selfring_aperture = nbio_v7_11_enable_doorbell_selfring_aperture, .ih_doorbell_range = nbio_v7_11_ih_doorbell_range, + .update_medium_grain_clock_gating = nbio_v7_11_update_medium_grain_clock_gating, + .update_medium_grain_light_sleep = nbio_v7_11_update_medium_grain_light_sleep, + .get_clockgating_state = nbio_v7_11_get_clockgating_state, .ih_control = nbio_v7_11_ih_control, .init_registers = nbio_v7_11_init_registers, .remap_hdp_registers = nbio_v7_11_remap_hdp_registers, diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c b/drivers/gpu/drm/amd/amdgpu/soc21.c index df7462cec6ab..7fe199560264 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc21.c +++ b/drivers/gpu/drm/amd/amdgpu/soc21.c @@ -863,6 +863,7 @@ static int soc21_common_set_clockgating_state(void *handle, case IP_VERSION(4, 3, 0): case IP_VERSION(4, 3, 1): case IP_VERSION(7, 7, 0): + case IP_VERSION(7, 11, 0): adev->nbio.funcs->update_medium_grain_clock_gating(adev, state == AMD_CG_STATE_GATE); adev->nbio.funcs->update_medium_grain_light_sleep(adev, -- cgit From 9d7a965e22e5c0abd1aa6aaa389a81de58ca5182 Mon Sep 17 00:00:00 2001 From: Li Ma Date: Wed, 18 Oct 2023 13:32:39 +0800 Subject: drm/amdgpu: add clockgating support for NBIO v7.7.1 add clockgating support for NBIO ip 7.7.1 Signed-off-by: Li Ma Reviewed-by: Tim Huang Reviewed-by: Yifan Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/soc21.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c b/drivers/gpu/drm/amd/amdgpu/soc21.c index 7fe199560264..8c6cab641a1c 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc21.c +++ b/drivers/gpu/drm/amd/amdgpu/soc21.c @@ -863,6 +863,7 @@ static int soc21_common_set_clockgating_state(void *handle, case IP_VERSION(4, 3, 0): case IP_VERSION(4, 3, 1): case IP_VERSION(7, 7, 0): + case IP_VERSION(7, 7, 1): case IP_VERSION(7, 11, 0): adev->nbio.funcs->update_medium_grain_clock_gating(adev, state == AMD_CG_STATE_GATE); -- cgit From 21226f02d77b6a1efcf987df8d97b2a4f40087bd Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Wed, 18 Oct 2023 17:57:25 +0800 Subject: drm/amdgpu: replace reset_error_count with amdgpu_ras_reset_error_count Simplify the code. Signed-off-by: Tao Zhou Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 8 ++------ drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 9 ++------- drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 7 ++----- drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 7 ++----- 5 files changed, 10 insertions(+), 25 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 0f98f720d9ca..2d385476939b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3578,9 +3578,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) if (adev->asic_reset_res) goto fail; - if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && - adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) - adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); + amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); } else { task_barrier_full(&hive->tb); @@ -5201,9 +5199,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, if (!r && amdgpu_ras_intr_triggered()) { list_for_each_entry(tmp_adev, device_list_handle, reset_list) { - if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops && - tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) - tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); + amdgpu_ras_reset_error_count(tmp_adev, AMDGPU_RAS_BLOCK__MMHUB); } amdgpu_ras_intr_cleared(); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 19aa6b7b16fb..9d5d742ee9d3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -908,7 +908,7 @@ static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, struct ras_comm adev->gmc.xgmi.num_physical_nodes == 0) return 0; - adev->gmc.xgmi.ras->ras_block.hw_ops->reset_ras_error_count(adev); + amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL); return amdgpu_ras_block_late_init(adev, ras_block); } @@ -1075,7 +1075,7 @@ static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev, break; } - adev->gmc.xgmi.ras->ras_block.hw_ops->reset_ras_error_count(adev); + amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL); err_data->ue_count += ue_cnt; err_data->ce_count += ce_cnt; diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 3a1050344b59..5fed01e34928 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -1587,13 +1587,8 @@ static int gmc_v9_0_late_init(void *handle) } if (!amdgpu_persistent_edc_harvesting_supported(adev)) { - if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && - adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) - adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); - - if (adev->hdp.ras && adev->hdp.ras->ras_block.hw_ops && - adev->hdp.ras->ras_block.hw_ops->reset_ras_error_count) - adev->hdp.ras->ras_block.hw_ops->reset_ras_error_count(adev); + amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); + amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__HDP); } r = amdgpu_gmc_ras_late_init(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c index dff66e1ae7ea..683d51ae4bf1 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c @@ -1749,11 +1749,8 @@ static int sdma_v4_0_late_init(void *handle) sdma_v4_0_setup_ulv(adev); - if (!amdgpu_persistent_edc_harvesting_supported(adev)) { - if (adev->sdma.ras && adev->sdma.ras->ras_block.hw_ops && - adev->sdma.ras->ras_block.hw_ops->reset_ras_error_count) - adev->sdma.ras->ras_block.hw_ops->reset_ras_error_count(adev); - } + if (!amdgpu_persistent_edc_harvesting_supported(adev)) + amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__SDMA); return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c index 31aa245552d6..c46bc6aa4f48 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c @@ -1276,11 +1276,8 @@ static int sdma_v4_4_2_late_init(void *handle) .cb = sdma_v4_4_2_process_ras_data_cb, }; #endif - if (!amdgpu_persistent_edc_harvesting_supported(adev)) { - if (adev->sdma.ras && adev->sdma.ras->ras_block.hw_ops && - adev->sdma.ras->ras_block.hw_ops->reset_ras_error_count) - adev->sdma.ras->ras_block.hw_ops->reset_ras_error_count(adev); - } + if (!amdgpu_persistent_edc_harvesting_supported(adev)) + amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__SDMA); return 0; } -- cgit From 8096df766474b54758b268afe900ba9d7ab0cc37 Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Thu, 12 Oct 2023 11:22:07 +0800 Subject: drm/amdgpu: add set/get mca debug mode operations Record the debug mode status in RAS. Signed-off-by: Tao Zhou Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 21 +++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 5 +++++ 2 files changed, 26 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index eb81efacefe2..3c83a2b8fb2c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3311,6 +3311,27 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) return 0; } +void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + + if (con) + con->is_mca_debug_mode = enable; +} + +bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; + + if (!con) + return false; + + if (mca_funcs && mca_funcs->mca_set_debug_mode) + return con->is_mca_debug_mode; + else + return true; +} /* Register each ip ras block into amdgpu ras */ int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 3f9ac0ab67e6..2fdfef62ee27 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -434,6 +434,8 @@ struct amdgpu_ras { /* Indicates smu whether need update bad channel info */ bool update_channel_flag; + /* Record status of smu mca debug mode */ + bool is_mca_debug_mode; /* Record special requirements of gpu reset caller */ uint32_t gpu_reset_flags; @@ -768,6 +770,9 @@ struct amdgpu_ras* amdgpu_ras_get_context(struct amdgpu_device *adev); int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con); +void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable); +bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev); + int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, struct amdgpu_ras_block_object *ras_block_obj); void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev); -- cgit From 626121fce415960522ed608a4e4949a347c9a8a3 Mon Sep 17 00:00:00 2001 From: Shiwu Zhang Date: Wed, 20 Sep 2023 17:05:41 +0800 Subject: drm/amdgpu: update the xgmi ta interface header Update the header file to the v20.00.00.13 v1: rename TA_COMMAND_XGMI__GET_GET_TOPOLOGY_INFO to TA_COMMAND_XGMI__GET_TOPOLOGY_INFO And also rename struct ta_xgmi_cmd_get_peer_link_info_output to ta_xgmi_cmd_get_peer_link_info accordingly v2: add structs to support xgmi GET_EXTEND_PEER_LINK command Signed-off-by: Shiwu Zhang Reviewed-by: Le Ma Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 6 ++-- drivers/gpu/drm/amd/amdgpu/ta_xgmi_if.h | 62 +++++++++++++++++++++++++-------- 2 files changed, 51 insertions(+), 17 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index b153acf9ed05..bd398e10fa0a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -1388,7 +1388,7 @@ int psp_xgmi_get_topology_info(struct psp_context *psp, /* Fill in the shared memory with topology information as input */ topology_info_input = &xgmi_cmd->xgmi_in_message.get_topology_info; - xgmi_cmd->cmd_id = TA_COMMAND_XGMI__GET_GET_TOPOLOGY_INFO; + xgmi_cmd->cmd_id = TA_COMMAND_XGMI__GET_TOPOLOGY_INFO; topology_info_input->num_nodes = number_devices; for (i = 0; i < topology_info_input->num_nodes; i++) { @@ -1399,7 +1399,7 @@ int psp_xgmi_get_topology_info(struct psp_context *psp, } /* Invoke xgmi ta to get the topology information */ - ret = psp_xgmi_invoke(psp, TA_COMMAND_XGMI__GET_GET_TOPOLOGY_INFO); + ret = psp_xgmi_invoke(psp, TA_COMMAND_XGMI__GET_TOPOLOGY_INFO); if (ret) return ret; @@ -1424,7 +1424,7 @@ int psp_xgmi_get_topology_info(struct psp_context *psp, /* Invoke xgmi ta again to get the link information */ if (psp_xgmi_peer_link_info_supported(psp)) { - struct ta_xgmi_cmd_get_peer_link_info_output *link_info_output; + struct ta_xgmi_cmd_get_peer_link_info *link_info_output; bool requires_reflection = (psp->xgmi_context.supports_extended_data && get_extended_data) || diff --git a/drivers/gpu/drm/amd/amdgpu/ta_xgmi_if.h b/drivers/gpu/drm/amd/amdgpu/ta_xgmi_if.h index da815a93d46e..d5748032674e 100644 --- a/drivers/gpu/drm/amd/amdgpu/ta_xgmi_if.h +++ b/drivers/gpu/drm/amd/amdgpu/ta_xgmi_if.h @@ -1,5 +1,5 @@ /* - * Copyright 2018 Advanced Micro Devices, Inc. + * Copyright 2018-2022 Advanced Micro Devices, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -20,7 +20,6 @@ * OTHER DEALINGS IN THE SOFTWARE. * */ - #ifndef _TA_XGMI_IF_H #define _TA_XGMI_IF_H @@ -28,20 +27,31 @@ #define RSP_ID_MASK (1U << 31) #define RSP_ID(cmdId) (((uint32_t)(cmdId)) | RSP_ID_MASK) +#define EXTEND_PEER_LINK_INFO_CMD_FLAG 1 + enum ta_command_xgmi { + /* Initialize the Context and Session Topology */ TA_COMMAND_XGMI__INITIALIZE = 0x00, + /* Gets the current GPU's node ID */ TA_COMMAND_XGMI__GET_NODE_ID = 0x01, + /* Gets the current GPU's hive ID */ TA_COMMAND_XGMI__GET_HIVE_ID = 0x02, - TA_COMMAND_XGMI__GET_GET_TOPOLOGY_INFO = 0x03, + /* Gets the Peer's topology Information */ + TA_COMMAND_XGMI__GET_TOPOLOGY_INFO = 0x03, + /* Sets the Peer's topology Information */ TA_COMMAND_XGMI__SET_TOPOLOGY_INFO = 0x04, - TA_COMMAND_XGMI__GET_PEER_LINKS = 0x0B + /* Gets the total links between adjacent peer dies in hive */ + TA_COMMAND_XGMI__GET_PEER_LINKS = 0x0B, + /* Gets the total links and connected port numbers between adjacent peer dies in hive */ + TA_COMMAND_XGMI__GET_EXTEND_PEER_LINKS = 0x0C }; /* XGMI related enumerations */ /**********************************************************/; -enum ta_xgmi_connected_nodes { - TA_XGMI__MAX_CONNECTED_NODES = 64 -}; +enum { TA_XGMI__MAX_CONNECTED_NODES = 64 }; +enum { TA_XGMI__MAX_INTERNAL_STATE = 32 }; +enum { TA_XGMI__MAX_INTERNAL_STATE_BUFFER = 128 }; +enum { TA_XGMI__MAX_PORT_NUM = 8 }; enum ta_xgmi_status { TA_XGMI_STATUS__SUCCESS = 0x00, @@ -81,6 +91,18 @@ struct ta_xgmi_peer_link_info { uint8_t num_links; }; +struct xgmi_connected_port_num { + uint8_t dst_xgmi_port_num; + uint8_t src_xgmi_port_num; +}; + +/* support both the port num and num_links */ +struct ta_xgmi_extend_peer_link_info { + uint64_t node_id; + uint8_t num_links; + struct xgmi_connected_port_num port_num[TA_XGMI__MAX_PORT_NUM]; +}; + struct ta_xgmi_cmd_initialize_output { uint32_t status; }; @@ -103,16 +125,21 @@ struct ta_xgmi_cmd_get_topology_info_output { struct ta_xgmi_node_info nodes[TA_XGMI__MAX_CONNECTED_NODES]; }; -struct ta_xgmi_cmd_get_peer_link_info_output { +struct ta_xgmi_cmd_set_topology_info_input { uint32_t num_nodes; - struct ta_xgmi_peer_link_info nodes[TA_XGMI__MAX_CONNECTED_NODES]; + struct ta_xgmi_node_info nodes[TA_XGMI__MAX_CONNECTED_NODES]; }; -struct ta_xgmi_cmd_set_topology_info_input { +/* support XGMI TA w/ and w/o port_num both so two similar structs defined */ +struct ta_xgmi_cmd_get_peer_link_info { uint32_t num_nodes; - struct ta_xgmi_node_info nodes[TA_XGMI__MAX_CONNECTED_NODES]; + struct ta_xgmi_peer_link_info nodes[TA_XGMI__MAX_CONNECTED_NODES]; }; +struct ta_xgmi_cmd_get_extend_peer_link_info { + uint32_t num_nodes; + struct ta_xgmi_extend_peer_link_info nodes[TA_XGMI__MAX_CONNECTED_NODES]; +}; /**********************************************************/ /* Common input structure for XGMI callbacks */ union ta_xgmi_cmd_input { @@ -126,16 +153,23 @@ union ta_xgmi_cmd_output { struct ta_xgmi_cmd_get_node_id_output get_node_id; struct ta_xgmi_cmd_get_hive_id_output get_hive_id; struct ta_xgmi_cmd_get_topology_info_output get_topology_info; - struct ta_xgmi_cmd_get_peer_link_info_output get_link_info; + struct ta_xgmi_cmd_get_peer_link_info get_link_info; + struct ta_xgmi_cmd_get_extend_peer_link_info get_extend_link_info; }; -/**********************************************************/ struct ta_xgmi_shared_memory { uint32_t cmd_id; uint32_t resp_id; enum ta_xgmi_status xgmi_status; + + /* if the number of xgmi link record is more than 128, driver will set the + * flag 0 to get the first 128 of the link records and will set to 1, to get + * the second set + */ uint8_t flag_extend_link_record; - uint8_t reserved0[3]; + /* bit0: port_num info support flag for GET_EXTEND_PEER_LINKS commmand */ + uint8_t caps_flag; + uint8_t reserved[2]; union ta_xgmi_cmd_input xgmi_in_message; union ta_xgmi_cmd_output xgmi_out_message; }; -- cgit From d9443ac4f9ea97f9eaebf2569d3fd044da4c9c98 Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Tue, 17 Oct 2023 19:06:24 +0800 Subject: drm/amdgpu: drop status query/reset for GCEA 9.4.3 and MMEA 1.8 PMFW will be responsible for them. v2: remove query interfaces. Signed-off-by: Tao Zhou Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 60 -------------- drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c | 143 -------------------------------- 2 files changed, 203 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c index a1c2c952d882..362bf51ab1d2 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c @@ -3754,10 +3754,6 @@ static const struct amdgpu_gfx_ras_reg_entry gfx_v9_4_3_ue_reg_list[] = { AMDGPU_GFX_LDS_MEM, 4}, }; -static const struct soc15_reg_entry gfx_v9_4_3_ea_err_status_regs = { - SOC15_REG_ENTRY(GC, 0, regGCEA_ERR_STATUS), 0, 1, 16 -}; - static void gfx_v9_4_3_inst_query_ras_err_count(struct amdgpu_device *adev, void *ras_error_status, int xcc_id) { @@ -3846,39 +3842,6 @@ static void gfx_v9_4_3_inst_reset_ras_err_count(struct amdgpu_device *adev, mutex_unlock(&adev->grbm_idx_mutex); } -static void gfx_v9_4_3_inst_query_ea_err_status(struct amdgpu_device *adev, - int xcc_id) -{ - uint32_t i, j; - uint32_t reg_value; - - mutex_lock(&adev->grbm_idx_mutex); - - for (i = 0; i < gfx_v9_4_3_ea_err_status_regs.se_num; i++) { - for (j = 0; j < gfx_v9_4_3_ea_err_status_regs.instance; j++) { - gfx_v9_4_3_xcc_select_se_sh(adev, i, 0, j, xcc_id); - reg_value = RREG32_SOC15(GC, GET_INST(GC, xcc_id), - regGCEA_ERR_STATUS); - if (REG_GET_FIELD(reg_value, GCEA_ERR_STATUS, SDP_RDRSP_STATUS) || - REG_GET_FIELD(reg_value, GCEA_ERR_STATUS, SDP_WRRSP_STATUS) || - REG_GET_FIELD(reg_value, GCEA_ERR_STATUS, SDP_RDRSP_DATAPARITY_ERROR)) { - dev_warn(adev->dev, - "GCEA err detected at instance: %d, status: 0x%x!\n", - j, reg_value); - } - /* clear after read */ - reg_value = REG_SET_FIELD(reg_value, GCEA_ERR_STATUS, - CLEAR_ERROR_STATUS, 0x1); - WREG32_SOC15(GC, GET_INST(GC, xcc_id), regGCEA_ERR_STATUS, - reg_value); - } - } - - gfx_v9_4_3_xcc_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff, - xcc_id); - mutex_unlock(&adev->grbm_idx_mutex); -} - static void gfx_v9_4_3_inst_query_utc_err_status(struct amdgpu_device *adev, int xcc_id) { @@ -3983,7 +3946,6 @@ static void gfx_v9_4_3_inst_query_sq_timeout_status(struct amdgpu_device *adev, static void gfx_v9_4_3_inst_query_ras_err_status(struct amdgpu_device *adev, void *ras_error_status, int xcc_id) { - gfx_v9_4_3_inst_query_ea_err_status(adev, xcc_id); gfx_v9_4_3_inst_query_utc_err_status(adev, xcc_id); gfx_v9_4_3_inst_query_sq_timeout_status(adev, xcc_id); } @@ -3996,27 +3958,6 @@ static void gfx_v9_4_3_inst_reset_utc_err_status(struct amdgpu_device *adev, WREG32_SOC15(GC, GET_INST(GC, xcc_id), regVML2_WALKER_MEM_ECC_STATUS, 0x3); } -static void gfx_v9_4_3_inst_reset_ea_err_status(struct amdgpu_device *adev, - int xcc_id) -{ - uint32_t i, j; - uint32_t value; - - mutex_lock(&adev->grbm_idx_mutex); - for (i = 0; i < gfx_v9_4_3_ea_err_status_regs.se_num; i++) { - for (j = 0; j < gfx_v9_4_3_ea_err_status_regs.instance; j++) { - gfx_v9_4_3_xcc_select_se_sh(adev, i, 0, j, xcc_id); - value = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regGCEA_ERR_STATUS); - value = REG_SET_FIELD(value, GCEA_ERR_STATUS, - CLEAR_ERROR_STATUS, 0x1); - WREG32_SOC15(GC, GET_INST(GC, xcc_id), regGCEA_ERR_STATUS, value); - } - } - gfx_v9_4_3_xcc_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff, - xcc_id); - mutex_unlock(&adev->grbm_idx_mutex); -} - static void gfx_v9_4_3_inst_reset_sq_timeout_status(struct amdgpu_device *adev, int xcc_id) { @@ -4042,7 +3983,6 @@ static void gfx_v9_4_3_inst_reset_ras_err_status(struct amdgpu_device *adev, void *ras_error_status, int xcc_id) { gfx_v9_4_3_inst_reset_utc_err_status(adev, xcc_id); - gfx_v9_4_3_inst_reset_ea_err_status(adev, xcc_id); gfx_v9_4_3_inst_reset_sq_timeout_status(adev, xcc_id); } diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c index aa00483e7b37..ea142611be1c 100644 --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c @@ -700,152 +700,9 @@ static void mmhub_v1_8_reset_ras_error_count(struct amdgpu_device *adev) mmhub_v1_8_inst_reset_ras_error_count(adev, i); } -static const u32 mmhub_v1_8_mmea_err_status_reg[] __maybe_unused = { - regMMEA0_ERR_STATUS, - regMMEA1_ERR_STATUS, - regMMEA2_ERR_STATUS, - regMMEA3_ERR_STATUS, - regMMEA4_ERR_STATUS, -}; - -static void mmhub_v1_8_inst_query_ras_err_status(struct amdgpu_device *adev, - uint32_t mmhub_inst) -{ - uint32_t reg_value; - uint32_t mmea_err_status_addr_dist; - uint32_t i; - - /* query mmea ras err status */ - mmea_err_status_addr_dist = regMMEA1_ERR_STATUS - regMMEA0_ERR_STATUS; - for (i = 0; i < ARRAY_SIZE(mmhub_v1_8_mmea_err_status_reg); i++) { - reg_value = RREG32_SOC15_OFFSET(MMHUB, mmhub_inst, - regMMEA0_ERR_STATUS, - i * mmea_err_status_addr_dist); - if (REG_GET_FIELD(reg_value, MMEA0_ERR_STATUS, SDP_RDRSP_STATUS) || - REG_GET_FIELD(reg_value, MMEA0_ERR_STATUS, SDP_WRRSP_STATUS) || - REG_GET_FIELD(reg_value, MMEA0_ERR_STATUS, SDP_RDRSP_DATAPARITY_ERROR)) { - dev_warn(adev->dev, - "Detected MMEA%d err in MMHUB%d, status: 0x%x\n", - i, mmhub_inst, reg_value); - } - } - - /* query mm_cane ras err status */ - reg_value = RREG32_SOC15(MMHUB, mmhub_inst, regMM_CANE_ERR_STATUS); - if (REG_GET_FIELD(reg_value, MM_CANE_ERR_STATUS, SDPM_RDRSP_STATUS) || - REG_GET_FIELD(reg_value, MM_CANE_ERR_STATUS, SDPM_WRRSP_STATUS) || - REG_GET_FIELD(reg_value, MM_CANE_ERR_STATUS, SDPM_RDRSP_DATAPARITY_ERROR)) { - dev_warn(adev->dev, - "Detected MM CANE err in MMHUB%d, status: 0x%x\n", - mmhub_inst, reg_value); - } -} - -static void mmhub_v1_8_query_ras_error_status(struct amdgpu_device *adev) -{ - uint32_t inst_mask; - uint32_t i; - - if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__MMHUB)) { - dev_warn(adev->dev, "MMHUB RAS is not supported\n"); - return; - } - - inst_mask = adev->aid_mask; - for_each_inst(i, inst_mask) - mmhub_v1_8_inst_query_ras_err_status(adev, i); -} - -static void mmhub_v1_8_inst_reset_ras_err_status(struct amdgpu_device *adev, - uint32_t mmhub_inst) -{ - uint32_t mmea_cgtt_clk_cntl_addr_dist; - uint32_t mmea_err_status_addr_dist; - uint32_t reg_value; - uint32_t i; - - /* reset mmea ras err status */ - mmea_cgtt_clk_cntl_addr_dist = regMMEA1_CGTT_CLK_CTRL - regMMEA0_CGTT_CLK_CTRL; - mmea_err_status_addr_dist = regMMEA1_ERR_STATUS - regMMEA0_ERR_STATUS; - for (i = 0; i < ARRAY_SIZE(mmhub_v1_8_mmea_err_status_reg); i++) { - /* force clk branch on for response path - * set MMEA0_CGTT_CLK_CTRL.SOFT_OVERRIDE_RETURN = 1 - */ - reg_value = RREG32_SOC15_OFFSET(MMHUB, mmhub_inst, - regMMEA0_CGTT_CLK_CTRL, - i * mmea_cgtt_clk_cntl_addr_dist); - reg_value = REG_SET_FIELD(reg_value, MMEA0_CGTT_CLK_CTRL, - SOFT_OVERRIDE_RETURN, 1); - WREG32_SOC15_OFFSET(MMHUB, mmhub_inst, - regMMEA0_CGTT_CLK_CTRL, - i * mmea_cgtt_clk_cntl_addr_dist, - reg_value); - - /* set MMEA0_ERR_STATUS.CLEAR_ERROR_STATUS = 1 */ - reg_value = RREG32_SOC15_OFFSET(MMHUB, mmhub_inst, - regMMEA0_ERR_STATUS, - i * mmea_err_status_addr_dist); - reg_value = REG_SET_FIELD(reg_value, MMEA0_ERR_STATUS, - CLEAR_ERROR_STATUS, 1); - WREG32_SOC15_OFFSET(MMHUB, mmhub_inst, - regMMEA0_ERR_STATUS, - i * mmea_err_status_addr_dist, - reg_value); - - /* set MMEA0_CGTT_CLK_CTRL.SOFT_OVERRIDE_RETURN = 0 */ - reg_value = RREG32_SOC15_OFFSET(MMHUB, mmhub_inst, - regMMEA0_CGTT_CLK_CTRL, - i * mmea_cgtt_clk_cntl_addr_dist); - reg_value = REG_SET_FIELD(reg_value, MMEA0_CGTT_CLK_CTRL, - SOFT_OVERRIDE_RETURN, 0); - WREG32_SOC15_OFFSET(MMHUB, mmhub_inst, - regMMEA0_CGTT_CLK_CTRL, - i * mmea_cgtt_clk_cntl_addr_dist, - reg_value); - } - - /* reset mm_cane ras err status - * force clk branch on for response path - * set MM_CANE_ICG_CTRL.SOFT_OVERRIDE_ATRET = 1 - */ - reg_value = RREG32_SOC15(MMHUB, mmhub_inst, regMM_CANE_ICG_CTRL); - reg_value = REG_SET_FIELD(reg_value, MM_CANE_ICG_CTRL, - SOFT_OVERRIDE_ATRET, 1); - WREG32_SOC15(MMHUB, mmhub_inst, regMM_CANE_ICG_CTRL, reg_value); - - /* set MM_CANE_ERR_STATUS.CLEAR_ERROR_STATUS = 1 */ - reg_value = RREG32_SOC15(MMHUB, mmhub_inst, regMM_CANE_ERR_STATUS); - reg_value = REG_SET_FIELD(reg_value, MM_CANE_ERR_STATUS, - CLEAR_ERROR_STATUS, 1); - WREG32_SOC15(MMHUB, mmhub_inst, regMM_CANE_ERR_STATUS, reg_value); - - /* set MM_CANE_ICG_CTRL.SOFT_OVERRIDE_ATRET = 0 */ - reg_value = RREG32_SOC15(MMHUB, mmhub_inst, regMM_CANE_ICG_CTRL); - reg_value = REG_SET_FIELD(reg_value, MM_CANE_ICG_CTRL, - SOFT_OVERRIDE_ATRET, 0); - WREG32_SOC15(MMHUB, mmhub_inst, regMM_CANE_ICG_CTRL, reg_value); -} - -static void mmhub_v1_8_reset_ras_error_status(struct amdgpu_device *adev) -{ - uint32_t inst_mask; - uint32_t i; - - if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__MMHUB)) { - dev_warn(adev->dev, "MMHUB RAS is not supported\n"); - return; - } - - inst_mask = adev->aid_mask; - for_each_inst(i, inst_mask) - mmhub_v1_8_inst_reset_ras_err_status(adev, i); -} - static const struct amdgpu_ras_block_hw_ops mmhub_v1_8_ras_hw_ops = { .query_ras_error_count = mmhub_v1_8_query_ras_error_count, .reset_ras_error_count = mmhub_v1_8_reset_ras_error_count, - .query_ras_error_status = mmhub_v1_8_query_ras_error_status, - .reset_ras_error_status = mmhub_v1_8_reset_ras_error_status, }; struct amdgpu_mmhub_ras mmhub_v1_8_ras = { -- cgit From e8a5ded36b4c68db4e0d4066ae2d420116715105 Mon Sep 17 00:00:00 2001 From: Shiwu Zhang Date: Wed, 20 Sep 2023 18:36:55 +0800 Subject: drm/amdgpu: prepare the output buffer for GET_PEER_LINKS command Per the xgmi ta implementation, KGD needs to fill in node_ids in concern into the shared command output buffer rather than the command input buffer. Input buffer is not used for GET_PEER_LINKS command execution. In this way, xgmi ta can reuse the node info in the output buffer just filled in and populate the same buffer with link info directly. Signed-off-by: Shiwu Zhang Reviewed-by: Le Ma Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index bd398e10fa0a..37891042ce9f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -1431,14 +1431,22 @@ int psp_xgmi_get_topology_info(struct psp_context *psp, amdgpu_ip_version(psp->adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6); - xgmi_cmd->cmd_id = TA_COMMAND_XGMI__GET_PEER_LINKS; + link_info_output = &xgmi_cmd->xgmi_out_message.get_link_info; + /* popluate the shared output buffer rather than the cmd input buffer + * with node_ids as the input for GET_PEER_LINKS command execution. + * This is required for GET_PEER_LINKS only per xgmi ta implementation + */ + for (i = 0; i < topology->num_nodes; i++) { + link_info_output->nodes[i].node_id = topology->nodes[i].node_id; + } + link_info_output->num_nodes = topology->num_nodes; + xgmi_cmd->cmd_id = TA_COMMAND_XGMI__GET_PEER_LINKS; ret = psp_xgmi_invoke(psp, TA_COMMAND_XGMI__GET_PEER_LINKS); if (ret) return ret; - link_info_output = &xgmi_cmd->xgmi_out_message.get_link_info; for (i = 0; i < topology->num_nodes; i++) { /* accumulate num_links on extended data */ topology->nodes[i].num_links = get_extended_data ? -- cgit From 723fac64d05d7005929babbeb41dd09fb45f6f35 Mon Sep 17 00:00:00 2001 From: Shiwu Zhang Date: Thu, 21 Sep 2023 20:05:12 +0800 Subject: drm/amdgpu: support the port num info based on the capability flag XGMI TA will set the capability flag to indicate whether the port_num info is supported or not. KGD checks the flag and accordingly picks up the right buffer format and send the right command to TA to retrieve the info. v2: simplify the code by reusing the same statement (lijo) Signed-off-by: Shiwu Zhang Acked-by: Lijo Lazar Reviewed-by: Le Ma Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 45 +++++++++++++++++++++++---------- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h | 1 + 2 files changed, 33 insertions(+), 13 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index 37891042ce9f..648bd5e12830 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -1267,6 +1267,8 @@ invoke: xgmi_cmd->cmd_id = TA_COMMAND_XGMI__INITIALIZE; ret = psp_xgmi_invoke(psp, xgmi_cmd->cmd_id); + /* note down the capbility flag for XGMI TA */ + psp->xgmi_context.xgmi_ta_caps = xgmi_cmd->caps_flag; return ret; } @@ -1425,35 +1427,52 @@ int psp_xgmi_get_topology_info(struct psp_context *psp, /* Invoke xgmi ta again to get the link information */ if (psp_xgmi_peer_link_info_supported(psp)) { struct ta_xgmi_cmd_get_peer_link_info *link_info_output; + struct ta_xgmi_cmd_get_extend_peer_link_info *link_extend_info_output; bool requires_reflection = (psp->xgmi_context.supports_extended_data && get_extended_data) || amdgpu_ip_version(psp->adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6); + bool ta_port_num_support = psp->xgmi_context.xgmi_ta_caps & + EXTEND_PEER_LINK_INFO_CMD_FLAG; - link_info_output = &xgmi_cmd->xgmi_out_message.get_link_info; /* popluate the shared output buffer rather than the cmd input buffer * with node_ids as the input for GET_PEER_LINKS command execution. - * This is required for GET_PEER_LINKS only per xgmi ta implementation + * This is required for GET_PEER_LINKS per xgmi ta implementation. + * The same requirement for GET_EXTEND_PEER_LINKS command. */ - for (i = 0; i < topology->num_nodes; i++) { - link_info_output->nodes[i].node_id = topology->nodes[i].node_id; - } - link_info_output->num_nodes = topology->num_nodes; + if (ta_port_num_support) { + link_extend_info_output = &xgmi_cmd->xgmi_out_message.get_extend_link_info; + + for (i = 0; i < topology->num_nodes; i++) + link_extend_info_output->nodes[i].node_id = topology->nodes[i].node_id; + + link_extend_info_output->num_nodes = topology->num_nodes; + xgmi_cmd->cmd_id = TA_COMMAND_XGMI__GET_EXTEND_PEER_LINKS; + } else { + link_info_output = &xgmi_cmd->xgmi_out_message.get_link_info; - xgmi_cmd->cmd_id = TA_COMMAND_XGMI__GET_PEER_LINKS; - ret = psp_xgmi_invoke(psp, TA_COMMAND_XGMI__GET_PEER_LINKS); + for (i = 0; i < topology->num_nodes; i++) + link_info_output->nodes[i].node_id = topology->nodes[i].node_id; + link_info_output->num_nodes = topology->num_nodes; + xgmi_cmd->cmd_id = TA_COMMAND_XGMI__GET_PEER_LINKS; + } + + ret = psp_xgmi_invoke(psp, xgmi_cmd->cmd_id); if (ret) return ret; for (i = 0; i < topology->num_nodes; i++) { + uint8_t node_num_links = ta_port_num_support ? + link_extend_info_output->nodes[i].num_links : link_info_output->nodes[i].num_links; /* accumulate num_links on extended data */ - topology->nodes[i].num_links = get_extended_data ? - topology->nodes[i].num_links + - link_info_output->nodes[i].num_links : - ((requires_reflection && topology->nodes[i].num_links) ? topology->nodes[i].num_links : - link_info_output->nodes[i].num_links); + if (get_extended_data) { + topology->nodes[i].num_links = topology->nodes[i].num_links + node_num_links; + } else { + topology->nodes[i].num_links = (requires_reflection && topology->nodes[i].num_links) ? + topology->nodes[i].num_links : node_num_links; + } /* reflect the topology information for bi-directionality */ if (requires_reflection && topology->nodes[i].num_hops) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h index 3e67ed63e638..7111dd32e66f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h @@ -189,6 +189,7 @@ struct psp_xgmi_context { struct ta_context context; struct psp_xgmi_topology_info top_info; bool supports_extended_data; + uint8_t xgmi_ta_caps; }; struct psp_ras_context { -- cgit From 2d6a2a28cdeade75021503f86e57e7ebce7eb74c Mon Sep 17 00:00:00 2001 From: André Almeida Date: Fri, 15 Sep 2023 16:38:59 +0200 Subject: drm/amdgpu: Encapsulate all device reset info MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To better organize struct amdgpu_device, keep all reset information related fields together in a separated struct. Signed-off-by: André Almeida Signed-off-by: Shashank Sharma Reviewed-by: Shashank Sharma Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 34 ++++++++++++++++++----------- drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 10 ++++----- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 18 ++++++++------- 3 files changed, 36 insertions(+), 26 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 17c4872a0287..ca94d47cb504 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -773,6 +773,26 @@ struct amdgpu_mqd { struct amdgpu_reset_domain; struct amdgpu_fru_info; +#ifdef CONFIG_DEV_COREDUMP +struct amdgpu_coredump_info { + struct amdgpu_device *adev; + struct amdgpu_task_info reset_task_info; + struct timespec64 reset_time; + bool reset_vram_lost; +}; +#endif + +struct amdgpu_reset_info { + /* reset dump register */ + u32 *reset_dump_reg_list; + u32 *reset_dump_reg_value; + int num_regs; + +#ifdef CONFIG_DEV_COREDUMP + struct amdgpu_coredump_info *coredump_info; +#endif +}; + /* * Non-zero (true) if the GPU has VRAM. Zero (false) otherwise. */ @@ -1081,10 +1101,7 @@ struct amdgpu_device { struct mutex benchmark_mutex; - /* reset dump register */ - uint32_t *reset_dump_reg_list; - uint32_t *reset_dump_reg_value; - int num_regs; + struct amdgpu_reset_info reset_info; bool scpm_enabled; uint32_t scpm_status; @@ -1111,15 +1128,6 @@ static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev, return adev->ip_versions[ip][inst] & ~0xFFU; } -#ifdef CONFIG_DEV_COREDUMP -struct amdgpu_coredump_info { - struct amdgpu_device *adev; - struct amdgpu_task_info reset_task_info; - struct timespec64 reset_time; - bool reset_vram_lost; -}; -#endif - static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev) { return container_of(ddev, struct amdgpu_device, ddev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c index a4faea4fa0b5..3136a0774dd9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c @@ -2016,8 +2016,8 @@ static ssize_t amdgpu_reset_dump_register_list_read(struct file *f, if (ret) return ret; - for (i = 0; i < adev->num_regs; i++) { - sprintf(reg_offset, "0x%x\n", adev->reset_dump_reg_list[i]); + for (i = 0; i < adev->reset_info.num_regs; i++) { + sprintf(reg_offset, "0x%x\n", adev->reset_info.reset_dump_reg_list[i]); up_read(&adev->reset_domain->sem); if (copy_to_user(buf + len, reg_offset, strlen(reg_offset))) return -EFAULT; @@ -2074,9 +2074,9 @@ static ssize_t amdgpu_reset_dump_register_list_write(struct file *f, if (ret) goto error_free; - swap(adev->reset_dump_reg_list, tmp); - swap(adev->reset_dump_reg_value, new); - adev->num_regs = i; + swap(adev->reset_info.reset_dump_reg_list, tmp); + swap(adev->reset_info.reset_dump_reg_value, new); + adev->reset_info.num_regs = i; up_write(&adev->reset_domain->sem); ret = size; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 2d385476939b..c4931d519599 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5048,10 +5048,12 @@ static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) lockdep_assert_held(&adev->reset_domain->sem); - for (i = 0; i < adev->num_regs; i++) { - adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]); - trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], - adev->reset_dump_reg_value[i]); + for (i = 0; i < adev->reset_info.num_regs; i++) { + adev->reset_info.reset_dump_reg_value[i] = + RREG32(adev->reset_info.reset_dump_reg_list[i]); + + trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i], + adev->reset_info.reset_dump_reg_value[i]); } return 0; @@ -5089,13 +5091,13 @@ static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, if (coredump->reset_vram_lost) drm_printf(&p, "VRAM is lost due to GPU reset!\n"); - if (coredump->adev->num_regs) { + if (coredump->adev->reset_info.num_regs) { drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); - for (i = 0; i < coredump->adev->num_regs; i++) + for (i = 0; i < coredump->adev->reset_info.num_regs; i++) drm_printf(&p, "0x%08x: 0x%08x\n", - coredump->adev->reset_dump_reg_list[i], - coredump->adev->reset_dump_reg_value[i]); + coredump->adev->reset_info.reset_dump_reg_list[i], + coredump->adev->reset_info.reset_dump_reg_value[i]); } return count - iter.remain; -- cgit From 69619868d39bf364721db8d9d2429420704417a3 Mon Sep 17 00:00:00 2001 From: André Almeida Date: Fri, 15 Sep 2023 16:44:16 +0200 Subject: drm/amdgpu: Move coredump code to amdgpu_reset file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Giving that we use codedump just for device resets, move it's functions and structs to a more semantic file, the amdgpu_reset.{c, h}. Signed-off-by: André Almeida Signed-off-by: Shashank Sharma Reviewed-by: Shashank Sharma Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 9 ---- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 78 ----------------------------- drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 79 ++++++++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 11 +++++ 4 files changed, 90 insertions(+), 87 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index ca94d47cb504..8df702eaa2ad 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -773,15 +773,6 @@ struct amdgpu_mqd { struct amdgpu_reset_domain; struct amdgpu_fru_info; -#ifdef CONFIG_DEV_COREDUMP -struct amdgpu_coredump_info { - struct amdgpu_device *adev; - struct amdgpu_task_info reset_task_info; - struct timespec64 reset_time; - bool reset_vram_lost; -}; -#endif - struct amdgpu_reset_info { /* reset dump register */ u32 *reset_dump_reg_list; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index c4931d519599..9340e8dc0413 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -32,8 +32,6 @@ #include #include #include -#include -#include #include #include @@ -5059,82 +5057,6 @@ static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) return 0; } -#ifndef CONFIG_DEV_COREDUMP -static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost, - struct amdgpu_reset_context *reset_context) -{ -} -#else -static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, - size_t count, void *data, size_t datalen) -{ - struct drm_printer p; - struct amdgpu_coredump_info *coredump = data; - struct drm_print_iterator iter; - int i; - - iter.data = buffer; - iter.offset = 0; - iter.start = offset; - iter.remain = count; - - p = drm_coredump_printer(&iter); - - drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); - drm_printf(&p, "kernel: " UTS_RELEASE "\n"); - drm_printf(&p, "module: " KBUILD_MODNAME "\n"); - drm_printf(&p, "time: %lld.%09ld\n", coredump->reset_time.tv_sec, coredump->reset_time.tv_nsec); - if (coredump->reset_task_info.pid) - drm_printf(&p, "process_name: %s PID: %d\n", - coredump->reset_task_info.process_name, - coredump->reset_task_info.pid); - - if (coredump->reset_vram_lost) - drm_printf(&p, "VRAM is lost due to GPU reset!\n"); - if (coredump->adev->reset_info.num_regs) { - drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); - - for (i = 0; i < coredump->adev->reset_info.num_regs; i++) - drm_printf(&p, "0x%08x: 0x%08x\n", - coredump->adev->reset_info.reset_dump_reg_list[i], - coredump->adev->reset_info.reset_dump_reg_value[i]); - } - - return count - iter.remain; -} - -static void amdgpu_devcoredump_free(void *data) -{ - kfree(data); -} - -static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost, - struct amdgpu_reset_context *reset_context) -{ - struct amdgpu_coredump_info *coredump; - struct drm_device *dev = adev_to_drm(adev); - - coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT); - - if (!coredump) { - DRM_ERROR("%s: failed to allocate memory for coredump\n", __func__); - return; - } - - coredump->reset_vram_lost = vram_lost; - - if (reset_context->job && reset_context->job->vm) - coredump->reset_task_info = reset_context->job->vm->task_info; - - coredump->adev = adev; - - ktime_get_ts64(&coredump->reset_time); - - dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT, - amdgpu_devcoredump_read, amdgpu_devcoredump_free); -} -#endif - int amdgpu_do_asic_reset(struct list_head *device_list_handle, struct amdgpu_reset_context *reset_context) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c index 970bfece775c..f53ddca35be6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c @@ -21,6 +21,9 @@ * */ +#include +#include + #include "amdgpu_reset.h" #include "aldebaran.h" #include "sienna_cichlid.h" @@ -159,5 +162,81 @@ void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain) up_write(&reset_domain->sem); } +#ifndef CONFIG_DEV_COREDUMP +void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost, + struct amdgpu_reset_context *reset_context) +{ +} +#else +static ssize_t +amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count, + void *data, size_t datalen) +{ + struct drm_printer p; + struct amdgpu_coredump_info *coredump = data; + struct drm_print_iterator iter; + int i; + + iter.data = buffer; + iter.offset = 0; + iter.start = offset; + iter.remain = count; + + p = drm_coredump_printer(&iter); + + drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); + drm_printf(&p, "kernel: " UTS_RELEASE "\n"); + drm_printf(&p, "module: " KBUILD_MODNAME "\n"); + drm_printf(&p, "time: %lld.%09ld\n", coredump->reset_time.tv_sec, + coredump->reset_time.tv_nsec); + + if (coredump->reset_task_info.pid) + drm_printf(&p, "process_name: %s PID: %d\n", + coredump->reset_task_info.process_name, + coredump->reset_task_info.pid); + + if (coredump->reset_vram_lost) + drm_printf(&p, "VRAM is lost due to GPU reset!\n"); + if (coredump->adev->reset_info.num_regs) { + drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); + + for (i = 0; i < coredump->adev->reset_info.num_regs; i++) + drm_printf(&p, "0x%08x: 0x%08x\n", + coredump->adev->reset_info.reset_dump_reg_list[i], + coredump->adev->reset_info.reset_dump_reg_value[i]); + } + + return count - iter.remain; +} +static void amdgpu_devcoredump_free(void *data) +{ + kfree(data); +} +void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost, + struct amdgpu_reset_context *reset_context) +{ + struct amdgpu_coredump_info *coredump; + struct drm_device *dev = adev_to_drm(adev); + + coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT); + + if (!coredump) { + DRM_ERROR("%s: failed to allocate memory for coredump\n", __func__); + return; + } + + coredump->reset_vram_lost = vram_lost; + + if (reset_context->job && reset_context->job->vm) + coredump->reset_task_info = reset_context->job->vm->task_info; + + coredump->adev = adev; + + ktime_get_ts64(&coredump->reset_time); + + dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT, + amdgpu_devcoredump_read, amdgpu_devcoredump_free); +} +#endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h index 471d789b33a5..f4cfa2c7c9d6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h @@ -89,6 +89,14 @@ struct amdgpu_reset_domain { atomic_t reset_res; }; +#ifdef CONFIG_DEV_COREDUMP +struct amdgpu_coredump_info { + struct amdgpu_device *adev; + struct amdgpu_task_info reset_task_info; + struct timespec64 reset_time; + bool reset_vram_lost; +}; +#endif int amdgpu_reset_init(struct amdgpu_device *adev); int amdgpu_reset_fini(struct amdgpu_device *adev); @@ -130,6 +138,9 @@ void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain); void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain); +void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost, + struct amdgpu_reset_context *reset_context); + #define for_each_handler(i, handler, reset_ctl) \ for (i = 0; (i < AMDGPU_RESET_MAX_HANDLERS) && \ (handler = (*reset_ctl->reset_handlers)[i]); \ -- cgit From de009982c6aa8363b2bc8800fb0a13896d264853 Mon Sep 17 00:00:00 2001 From: André Almeida Date: Fri, 15 Sep 2023 18:44:53 +0200 Subject: drm/amdgpu: Create version number for coredumps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Even if there's nothing currently parsing amdgpu's coredump files, if we eventually have such tools they will be glad to find a version field to properly read the file. Create a version number to be displayed on top of coredump file, to be incremented when the file format or content get changed. Signed-off-by: André Almeida Reviewed-by: Shashank Sharma Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 3 +++ 2 files changed, 4 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c index f53ddca35be6..4baa300121d8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c @@ -185,6 +185,7 @@ amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count, p = drm_coredump_printer(&iter); drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); + drm_printf(&p, "version: " AMDGPU_COREDUMP_VERSION "\n"); drm_printf(&p, "kernel: " UTS_RELEASE "\n"); drm_printf(&p, "module: " KBUILD_MODNAME "\n"); drm_printf(&p, "time: %lld.%09ld\n", coredump->reset_time.tv_sec, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h index f4cfa2c7c9d6..b0335a1c5e90 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h @@ -90,6 +90,9 @@ struct amdgpu_reset_domain { }; #ifdef CONFIG_DEV_COREDUMP + +#define AMDGPU_COREDUMP_VERSION "1" + struct amdgpu_coredump_info { struct amdgpu_device *adev; struct amdgpu_task_info reset_task_info; -- cgit