From df9c8d1aa278c435c30a69b8f2418b4a52fcb929 Mon Sep 17 00:00:00 2001 From: Dennis Li Date: Wed, 8 Jul 2020 15:07:13 +0800 Subject: drm/amdgpu: fix system hang issue during GPU reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit when GPU hang, driver has multi-paths to enter amdgpu_device_gpu_recover, the atomic adev->in_gpu_reset and hive->in_reset are used to avoid re-entering GPU recovery. During GPU reset and resume, it is unsafe that other threads access GPU, which maybe cause GPU reset failed. Therefore the new rw_semaphore adev->reset_sem is introduced, which protect GPU from being accessed by external threads during recovery. v2: 1. add rwlock for some ioctls, debugfs and file-close function. 2. change to use dqm->is_resetting and dqm_lock for protection in kfd driver. 3. remove try_lock and change adev->in_gpu_reset as atomic, to avoid re-enter GPU recovery for the same GPU hang. v3: 1. change back to use adev->reset_sem to protect kfd callback functions, because dqm_lock couldn't protect all codes, for example: free_mqd must be called outside of dqm_lock; [ 1230.176199] Hardware name: Supermicro SYS-7049GP-TRT/X11DPG-QT, BIOS 3.1 05/23/2019 [ 1230.177221] Call Trace: [ 1230.178249] dump_stack+0x98/0xd5 [ 1230.179443] amdgpu_virt_kiq_reg_write_reg_wait+0x181/0x190 [amdgpu] [ 1230.180673] gmc_v9_0_flush_gpu_tlb+0xcc/0x310 [amdgpu] [ 1230.181882] amdgpu_gart_unbind+0xa9/0xe0 [amdgpu] [ 1230.183098] amdgpu_ttm_backend_unbind+0x46/0x180 [amdgpu] [ 1230.184239] ? ttm_bo_put+0x171/0x5f0 [ttm] [ 1230.185394] ttm_tt_unbind+0x21/0x40 [ttm] [ 1230.186558] ttm_tt_destroy.part.12+0x12/0x60 [ttm] [ 1230.187707] ttm_tt_destroy+0x13/0x20 [ttm] [ 1230.188832] ttm_bo_cleanup_memtype_use+0x36/0x80 [ttm] [ 1230.189979] ttm_bo_put+0x1be/0x5f0 [ttm] [ 1230.191230] amdgpu_bo_unref+0x1e/0x30 [amdgpu] [ 1230.192522] amdgpu_amdkfd_free_gtt_mem+0xaf/0x140 [amdgpu] [ 1230.193833] free_mqd+0x25/0x40 [amdgpu] [ 1230.195143] destroy_queue_cpsch+0x1a7/0x270 [amdgpu] [ 1230.196475] pqm_destroy_queue+0x105/0x260 [amdgpu] [ 1230.197819] kfd_ioctl_destroy_queue+0x37/0x70 [amdgpu] [ 1230.199154] kfd_ioctl+0x277/0x500 [amdgpu] [ 1230.200458] ? kfd_ioctl_get_clock_counters+0x60/0x60 [amdgpu] [ 1230.201656] ? tomoyo_file_ioctl+0x19/0x20 [ 1230.202831] ksys_ioctl+0x98/0xb0 [ 1230.204004] __x64_sys_ioctl+0x1a/0x20 [ 1230.205174] do_syscall_64+0x5f/0x250 [ 1230.206339] entry_SYSCALL_64_after_hwframe+0x49/0xbe 2. remove try_lock and introduce atomic hive->in_reset, to avoid re-enter GPU recovery. v4: 1. remove an unnecessary whitespace change in kfd_chardev.c 2. remove comment codes in amdgpu_device.c 3. add more detailed comment in commit message 4. define a wrap function amdgpu_in_reset v5: 1. Fix some style issues. Reviewed-by: Hawking Zhang Suggested-by: Andrey Grodzovsky Suggested-by: Christian König Suggested-by: Felix Kuehling Suggested-by: Lijo Lazar Suggested-by: Luben Tukov Signed-off-by: Dennis Li Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index a512ccbc4dea..a3b150304dae 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -1292,6 +1292,8 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) parser.adev = adev; parser.filp = filp; + down_read(&adev->reset_sem); + r = amdgpu_cs_parser_init(&parser, data); if (r) { DRM_ERROR("Failed to initialize parser %d!\n", r); @@ -1331,6 +1333,8 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) out: amdgpu_cs_parser_fini(&parser, r, reserved_buffers); + up_read(&adev->reset_sem); + return r; } -- cgit From f1403342ebdfcff3c3cf57ae476f19d3078f2767 Mon Sep 17 00:00:00 2001 From: Christian König Date: Wed, 12 Aug 2020 17:48:26 +0200 Subject: drm/amdgpu: revert "fix system hang issue during GPU reset" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The whole approach wasn't thought through till the end. We already had a reset lock like this in the past and it caused the same problems like this one. Completely revert the patch for now and add individual trylock protection to the hardware access functions as necessary. This reverts commit df9c8d1aa278c435c30a69b8f2418b4a52fcb929. Signed-off-by: Christian König Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index a3b150304dae..a512ccbc4dea 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -1292,8 +1292,6 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) parser.adev = adev; parser.filp = filp; - down_read(&adev->reset_sem); - r = amdgpu_cs_parser_init(&parser, data); if (r) { DRM_ERROR("Failed to initialize parser %d!\n", r); @@ -1333,8 +1331,6 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) out: amdgpu_cs_parser_fini(&parser, r, reserved_buffers); - up_read(&adev->reset_sem); - return r; } -- cgit From 4444457450fa66860fca856c6dd20f7acde80efa Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Mon, 17 Aug 2020 16:57:20 +0800 Subject: drm/amdgpu: add condition check for trace_amdgpu_cs() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit v1: add trace event enabled check to avoid nop loop when submit multi ibs in amdgpu_cs_ioctl() function. v2: add a new wrapper function to trace all amdgpu cs ibs. Signed-off-by: Kevin Wang Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index a512ccbc4dea..10c0779aba73 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -1275,13 +1275,24 @@ error_unlock: return r; } +static void trace_amdgpu_cs_ibs(struct amdgpu_cs_parser *parser) +{ + int i; + + if (!trace_amdgpu_cs_enabled()) + return; + + for (i = 0; i < parser->job->num_ibs; i++) + trace_amdgpu_cs(parser, i); +} + int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) { struct amdgpu_device *adev = dev->dev_private; union drm_amdgpu_cs *cs = data; struct amdgpu_cs_parser parser = {}; bool reserved_buffers = false; - int i, r; + int r; if (amdgpu_ras_intr_triggered()) return -EHWPOISON; @@ -1319,8 +1330,7 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) reserved_buffers = true; - for (i = 0; i < parser.job->num_ibs; i++) - trace_amdgpu_cs(&parser, i); + trace_amdgpu_cs_ibs(&parser); r = amdgpu_cs_vm_handling(&parser); if (r) -- cgit From 8e1d88f948f33f2e34dbf13c3e9fc4228f1331af Mon Sep 17 00:00:00 2001 From: jqdeng Date: Thu, 30 Jul 2020 14:48:40 +0800 Subject: drm/amdgpu: Limit the error info print rate Use function printk_ratelimit to limit the print rate. Signed-off-by: jqdeng Acked-by: Nirmoy Das Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index 10c0779aba73..e84a6e0bc784 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -1305,7 +1305,8 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) r = amdgpu_cs_parser_init(&parser, data); if (r) { - DRM_ERROR("Failed to initialize parser %d!\n", r); + if (printk_ratelimit()) + DRM_ERROR("Failed to initialize parser %d!\n", r); goto out; } -- cgit From 1348969ab68cb864034ec4fe48a86c157cc4e10d Mon Sep 17 00:00:00 2001 From: Luben Tuikov Date: Mon, 24 Aug 2020 12:27:47 -0400 Subject: drm/amdgpu: drm_device to amdgpu_device by inline-f (v2) Get the amdgpu_device from the DRM device by use of an inline function, drm_to_adev(). The inline function resolves a pointer to struct drm_device to a pointer to struct amdgpu_device. v2: Use a typed visible static inline function instead of an invisible macro. Signed-off-by: Luben Tuikov Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index e84a6e0bc784..690648785b3c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -1288,7 +1288,7 @@ static void trace_amdgpu_cs_ibs(struct amdgpu_cs_parser *parser) int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) { - struct amdgpu_device *adev = dev->dev_private; + struct amdgpu_device *adev = drm_to_adev(dev); union drm_amdgpu_cs *cs = data; struct amdgpu_cs_parser parser = {}; bool reserved_buffers = false; @@ -1432,7 +1432,7 @@ static struct dma_fence *amdgpu_cs_get_fence(struct amdgpu_device *adev, int amdgpu_cs_fence_to_handle_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) { - struct amdgpu_device *adev = dev->dev_private; + struct amdgpu_device *adev = drm_to_adev(dev); union drm_amdgpu_fence_to_handle *info = data; struct dma_fence *fence; struct drm_syncobj *syncobj; @@ -1608,7 +1608,7 @@ err_free_fence_array: int amdgpu_cs_wait_fences_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) { - struct amdgpu_device *adev = dev->dev_private; + struct amdgpu_device *adev = drm_to_adev(dev); union drm_amdgpu_wait_fences *wait = data; uint32_t fence_count = wait->in.fence_count; struct drm_amdgpu_fence *fences_user; -- cgit