diff options
-rw-r--r-- | drivers/gpu/drm/xe/Makefile | 1 | ||||
-rw-r--r-- | drivers/gpu/drm/xe/regs/xe_hw_error_regs.h | 15 | ||||
-rw-r--r-- | drivers/gpu/drm/xe/regs/xe_irq_regs.h | 1 | ||||
-rw-r--r-- | drivers/gpu/drm/xe/xe_hw_error.c | 109 | ||||
-rw-r--r-- | drivers/gpu/drm/xe/xe_hw_error.h | 15 | ||||
-rw-r--r-- | drivers/gpu/drm/xe/xe_irq.c | 4 |
6 files changed, 145 insertions, 0 deletions
diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile index c52cac979e05..0abd8078cfb0 100644 --- a/drivers/gpu/drm/xe/Makefile +++ b/drivers/gpu/drm/xe/Makefile @@ -82,6 +82,7 @@ xe-y += xe_bb.o \ xe_hw_engine.o \ xe_hw_engine_class_sysfs.o \ xe_hw_engine_group.o \ + xe_hw_error.o \ xe_hw_fence.o \ xe_irq.o \ xe_lrc.o \ diff --git a/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h b/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h new file mode 100644 index 000000000000..ed9b81fb28a0 --- /dev/null +++ b/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2025 Intel Corporation + */ + +#ifndef _XE_HW_ERROR_REGS_H_ +#define _XE_HW_ERROR_REGS_H_ + +#define DEV_ERR_STAT_NONFATAL 0x100178 +#define DEV_ERR_STAT_CORRECTABLE 0x10017c +#define DEV_ERR_STAT_REG(x) XE_REG(_PICK_EVEN((x), \ + DEV_ERR_STAT_CORRECTABLE, \ + DEV_ERR_STAT_NONFATAL)) + +#endif diff --git a/drivers/gpu/drm/xe/regs/xe_irq_regs.h b/drivers/gpu/drm/xe/regs/xe_irq_regs.h index 13635e4331d4..7c2a3a140142 100644 --- a/drivers/gpu/drm/xe/regs/xe_irq_regs.h +++ b/drivers/gpu/drm/xe/regs/xe_irq_regs.h @@ -18,6 +18,7 @@ #define GFX_MSTR_IRQ XE_REG(0x190010, XE_REG_OPTION_VF) #define MASTER_IRQ REG_BIT(31) #define GU_MISC_IRQ REG_BIT(29) +#define ERROR_IRQ(x) REG_BIT(26 + (x)) #define DISPLAY_IRQ REG_BIT(16) #define I2C_IRQ REG_BIT(12) #define GT_DW_IRQ(x) REG_BIT(x) diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c new file mode 100644 index 000000000000..0fa45792f1c0 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_hw_error.c @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2025 Intel Corporation + */ + +#include "regs/xe_hw_error_regs.h" +#include "regs/xe_irq_regs.h" + +#include "xe_device.h" +#include "xe_hw_error.h" +#include "xe_mmio.h" + +/* Error categories reported by hardware */ +enum hardware_error { + HARDWARE_ERROR_CORRECTABLE = 0, + HARDWARE_ERROR_NONFATAL = 1, + HARDWARE_ERROR_FATAL = 2, + HARDWARE_ERROR_MAX, +}; + +static const char *hw_error_to_str(const enum hardware_error hw_err) +{ + switch (hw_err) { + case HARDWARE_ERROR_CORRECTABLE: + return "CORRECTABLE"; + case HARDWARE_ERROR_NONFATAL: + return "NONFATAL"; + case HARDWARE_ERROR_FATAL: + return "FATAL"; + default: + return "UNKNOWN"; + } +} + +static void hw_error_source_handler(struct xe_tile *tile, const enum hardware_error hw_err) +{ + const char *hw_err_str = hw_error_to_str(hw_err); + struct xe_device *xe = tile_to_xe(tile); + unsigned long flags; + u32 err_src; + + if (xe->info.platform != XE_BATTLEMAGE) + return; + + spin_lock_irqsave(&xe->irq.lock, flags); + err_src = xe_mmio_read32(&tile->mmio, DEV_ERR_STAT_REG(hw_err)); + if (!err_src) { + drm_err_ratelimited(&xe->drm, HW_ERR "Tile%d reported DEV_ERR_STAT_%s blank!\n", + tile->id, hw_err_str); + goto unlock; + } + + /* TODO: Process errrors per source */ + + xe_mmio_write32(&tile->mmio, DEV_ERR_STAT_REG(hw_err), err_src); + +unlock: + spin_unlock_irqrestore(&xe->irq.lock, flags); +} + +/** + * xe_hw_error_irq_handler - irq handling for hw errors + * @tile: tile instance + * @master_ctl: value read from master interrupt register + * + * Xe platforms add three error bits to the master interrupt register to support error handling. + * These three bits are used to convey the class of error FATAL, NONFATAL, or CORRECTABLE. + * To process the interrupt, determine the source of error by reading the Device Error Source + * Register that corresponds to the class of error being serviced. + */ +void xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl) +{ + enum hardware_error hw_err; + + for (hw_err = 0; hw_err < HARDWARE_ERROR_MAX; hw_err++) + if (master_ctl & ERROR_IRQ(hw_err)) + hw_error_source_handler(tile, hw_err); +} + +/* + * Process hardware errors during boot + */ +static void process_hw_errors(struct xe_device *xe) +{ + struct xe_tile *tile; + u32 master_ctl; + u8 id; + + for_each_tile(tile, xe, id) { + master_ctl = xe_mmio_read32(&tile->mmio, GFX_MSTR_IRQ); + xe_hw_error_irq_handler(tile, master_ctl); + xe_mmio_write32(&tile->mmio, GFX_MSTR_IRQ, master_ctl); + } +} + +/** + * xe_hw_error_init - Initialize hw errors + * @xe: xe device instance + * + * Initialize and check for errors that occurred during boot + * prior to driver load + */ +void xe_hw_error_init(struct xe_device *xe) +{ + if (!IS_DGFX(xe) || IS_SRIOV_VF(xe)) + return; + + process_hw_errors(xe); +} diff --git a/drivers/gpu/drm/xe/xe_hw_error.h b/drivers/gpu/drm/xe/xe_hw_error.h new file mode 100644 index 000000000000..d86e28c5180c --- /dev/null +++ b/drivers/gpu/drm/xe/xe_hw_error.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2025 Intel Corporation + */ +#ifndef XE_HW_ERROR_H_ +#define XE_HW_ERROR_H_ + +#include <linux/types.h> + +struct xe_tile; +struct xe_device; + +void xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl); +void xe_hw_error_init(struct xe_device *xe); +#endif diff --git a/drivers/gpu/drm/xe/xe_irq.c b/drivers/gpu/drm/xe/xe_irq.c index 5df5b8c2a3e4..870edaf69388 100644 --- a/drivers/gpu/drm/xe/xe_irq.c +++ b/drivers/gpu/drm/xe/xe_irq.c @@ -18,6 +18,7 @@ #include "xe_gt.h" #include "xe_guc.h" #include "xe_hw_engine.h" +#include "xe_hw_error.h" #include "xe_i2c.h" #include "xe_memirq.h" #include "xe_mmio.h" @@ -468,6 +469,7 @@ static irqreturn_t dg1_irq_handler(int irq, void *arg) xe_mmio_write32(mmio, GFX_MSTR_IRQ, master_ctl); gt_irq_handler(tile, master_ctl, intr_dw, identity); + xe_hw_error_irq_handler(tile, master_ctl); /* * Display interrupts (including display backlight operations @@ -756,6 +758,8 @@ int xe_irq_install(struct xe_device *xe) int nvec = 1; int err; + xe_hw_error_init(xe); + xe_irq_reset(xe); if (xe_device_has_msix(xe)) { |