From fb74b205cdd26357469cab8957f5935f10b810e2 Mon Sep 17 00:00:00 2001 From: Rodrigo Vivi Date: Tue, 23 Apr 2024 18:18:14 -0400 Subject: drm/xe: Introduce a simple wedged state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a very simple 'wedged' state where any attempt to access the GPU is entirely blocked. In some critical cases, such as a gt_reset failure, we need to block any other attempt to use the GPU. Otherwise we are at risk of reaching cases that would force us to reboot the machine. So, when these cases are identified, we corner and block any GPU access. No IOCTL and not even another GT reset should be attempted. The 'wedged' state in Xe is an end state with no way back. Only a device "re-probe" (unbind + bind) can restore the GPU access. v2: - s/wedged/busted (Lucas) - use unbind+bind instead of module reload (Lucas) - added more info on unbind operations and instruction on bug report - only print the message once. v3: - s/busted/wedged (Ashutosh, Tvrtko, Thomas) - don't assume user has sudo and tee available (Lucas) v4: - remove unnecessary cases around ct communication or migration. 
Cc: Ashutosh Dixit Cc: Tvrtko Ursulin Cc: Thomas Hellström Cc: Lucas De Marchi Cc: Anshuman Gupta Reviewed-by: Himal Prasad Ghimiray Reviewed-by: Lucas De Marchi #v2 Link: https://patchwork.freedesktop.org/patch/msgid/20240423221817.1285081-1-rodrigo.vivi@intel.com Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_device.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'drivers/gpu/drm/xe/xe_device.h') diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h index 36d4434ebccc..d2e4249d37ce 100644 --- a/drivers/gpu/drm/xe/xe_device.h +++ b/drivers/gpu/drm/xe/xe_device.h @@ -167,4 +167,24 @@ void xe_device_snapshot_print(struct xe_device *xe, struct drm_printer *p); u64 xe_device_canonicalize_addr(struct xe_device *xe, u64 address); u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address); +static inline bool xe_device_wedged(struct xe_device *xe) +{ + return atomic_read(&xe->wedged); +} + +static inline void xe_device_declare_wedged(struct xe_device *xe) +{ + if (!atomic_xchg(&xe->wedged, 1)) { + xe->needs_flr_on_fini = true; + drm_err(&xe->drm, + "CRITICAL: Xe has declared device %s as wedged.\n" + "IOCTLs and executions are blocked until device is probed again with unbind and bind operations:\n" + "echo '%s' > /sys/bus/pci/drivers/xe/unbind\n" + "echo '%s' > /sys/bus/pci/drivers/xe/bind\n" + "Please file a _new_ bug report at https://gitlab.freedesktop.org/drm/xe/kernel/issues/new\n", + dev_name(xe->drm.dev), dev_name(xe->drm.dev), + dev_name(xe->drm.dev)); + } +} + #endif -- cgit From 8ed9aaae39f39130b7a3eb2726be05d7f64b344c Mon Sep 17 00:00:00 2001 From: Rodrigo Vivi Date: Tue, 23 Apr 2024 18:18:16 -0400 Subject: drm/xe: Force wedged state and block GT reset upon any GPU hang In many validation situations when debugging GPU Hangs, it is useful to preserve the GT situation from the moment that the timeout occurred. This patch introduces a module parameter that could be used in situations like this. 
If xe.wedged module parameter is set to 2, Xe will be declared wedged on every single execution timeout (a.k.a. GPU hang) right after devcoredump snapshot capture and without attempting any kind of GT reset and blocking entirely any kind of execution. v2: Really block gt_reset from guc side. (Lucas) s/wedged/busted (Lucas) v3: - s/busted/wedged - Really use global_flags (Dafna) - More robust timeout handling when wedging it. v4: A really robust clean exit done by Matt Brost. No more kernel warns on unbind. v5: Simplify error message (Lucas) Cc: Matthew Brost Cc: Dafna Hirschfeld Cc: Lucas De Marchi Cc: Alan Previn Cc: Himanshu Somaiya Reviewed-by: Lucas De Marchi Link: https://patchwork.freedesktop.org/patch/msgid/20240423221817.1285081-3-rodrigo.vivi@intel.com Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_device.h | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) (limited to 'drivers/gpu/drm/xe/xe_device.h') diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h index d2e4249d37ce..9ede45fc062a 100644 --- a/drivers/gpu/drm/xe/xe_device.h +++ b/drivers/gpu/drm/xe/xe_device.h @@ -172,19 +172,6 @@ static inline bool xe_device_wedged(struct xe_device *xe) return atomic_read(&xe->wedged); } -static inline void xe_device_declare_wedged(struct xe_device *xe) -{ - if (!atomic_xchg(&xe->wedged, 1)) { - xe->needs_flr_on_fini = true; - drm_err(&xe->drm, - "CRITICAL: Xe has declared device %s as wedged.\n" - "IOCTLs and executions are blocked until device is probed again with unbind and bind operations:\n" - "echo '%s' > /sys/bus/pci/drivers/xe/unbind\n" - "echo '%s' > /sys/bus/pci/drivers/xe/bind\n" - "Please file a _new_ bug report at https://gitlab.freedesktop.org/drm/xe/kernel/issues/new\n", - dev_name(xe->drm.dev), dev_name(xe->drm.dev), - dev_name(xe->drm.dev)); - } -} +void xe_device_declare_wedged(struct xe_device *xe); #endif -- cgit From 6b8ef44cc0a952549a6773a0233cee853f807a79 Mon Sep 17 00:00:00 2001 From: Rodrigo 
Vivi Date: Tue, 23 Apr 2024 18:18:17 -0400 Subject: drm/xe: Introduce the wedged_mode debugfs So, the wedged mode can be selected per device at runtime, before the tests or before reproducing the issue. v2: - s/busted/wedged - some locking consistency v3: - remove mutex - toggle guc reset policy on any mode change Cc: Lucas De Marchi Cc: Alan Previn Cc: Himal Prasad Ghimiray Reviewed-by: Himal Prasad Ghimiray Link: https://patchwork.freedesktop.org/patch/msgid/20240423221817.1285081-4-rodrigo.vivi@intel.com Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/xe/xe_device.h') diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h index 9ede45fc062a..82317580f4bf 100644 --- a/drivers/gpu/drm/xe/xe_device.h +++ b/drivers/gpu/drm/xe/xe_device.h @@ -169,7 +169,7 @@ u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address); static inline bool xe_device_wedged(struct xe_device *xe) { - return atomic_read(&xe->wedged); + return atomic_read(&xe->wedged.flag); } void xe_device_declare_wedged(struct xe_device *xe); -- cgit From b7f6318a9c3d9c79b724b20ff5382775a9c58346 Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Tue, 7 May 2024 13:09:59 +0200 Subject: drm/xe: Fix xe_device.h Some explicit includes are needed only from the xe_device.c. And there is no need for redundant forward declarations. 
Signed-off-by: Michal Wajdeczko Reviewed-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20240507110959.2747-4-michal.wajdeczko@intel.com --- drivers/gpu/drm/xe/xe_device.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'drivers/gpu/drm/xe/xe_device.h') diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h index 82317580f4bf..3ed14072d8d1 100644 --- a/drivers/gpu/drm/xe/xe_device.h +++ b/drivers/gpu/drm/xe/xe_device.h @@ -6,15 +6,9 @@ #ifndef _XE_DEVICE_H_ #define _XE_DEVICE_H_ -struct xe_exec_queue; -struct xe_file; - #include -#include "regs/xe_gpu_commands.h" #include "xe_device_types.h" -#include "xe_force_wake.h" -#include "xe_macros.h" static inline struct xe_device *to_xe_device(const struct drm_device *dev) { -- cgit