diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-04-01 15:24:20 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-04-01 15:24:20 -0700 |
commit | f365ab31efacb70bed1e821f7435626e0b2528a6 (patch) | |
tree | e1374b2896d50e652c1e434d70e834d0788aae3a /drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | |
parent | 4646de87d32526ee87b46c2e0130413367fb5362 (diff) | |
parent | 59e7a8cc2dcf335116d500d684bfb34d1d97a6fe (diff) |
Merge tag 'drm-next-2020-04-01' of git://anongit.freedesktop.org/drm/drm
Pull drm updates from Dave Airlie:
"This is the main drm pull request for 5.7-rc1.
Highlights:
- i915 enables Tigerlake by default
- i915 and amdgpu have initial OLED backlight support
[ Jani Nikula pipes up and points out that we've had a bunch of
"initial support" code for a long time already, but only now
Lyude made it actually work on real world machines ]
- vmwgfx add support to enable OpenGL 4 userspace
- zero length arrays are mostly removed.
Detailed summary:
new driver:
- tidss: TI Keystone platform display subsystem
core:
- new drm device warn macros
- mode config valid for memory constrained devices
- bridge bus format negotation
- consolidated fake vblank event handling
- dma_alloc related cleanups
- drop get_crtc callback
- dp: DP1.4 EDID corruption test
- EDID CEA detailed timings improvements
- relicense some code to dual GPL2/MIT
- convert core vblank support to per-crtc support
- rework drm_global_mutex
- bridge rework to allow omap_dss custom driver removeal
- remove drm_fb_helper connector interrfaces
- zero-length array removal
scheduler:
- support for modifying the sched list
- revert job distribution optimization
- helper to pick least loaded scheduler
- race condition fix
mst:
- various fixes
- remove register_connector callback
i915:
- uapi to allows userspace specific CS ring buffer sizes
- Tigerlake enablement patches + Tigerlake enabled by default
- new sysfs entries for engine properties
- display/logging refactors
- eDP/DP fixes for DPCD
- Gen7 back to aliasing-ppgtt
- Gen8+ irq refactor
- Avoid globals
- GEM locking fixes and simplifications
- Ice Lake and Elkhart Lake fixes and workarounds
- Baytrail/Haswell instability fix
- GVT - VFIO edid better support
amdgpu:
- Rework VM update handling in preparation for HMM support
- drm load/unload removal fixups
- USB-C PD firmware updates
- HDCP srm support
- Navi/renoir PM watermark fixes
- OLED panel support
- Optimize debugging vram access
- Use BACO for runtime pm
- DC clock programming optimizations and fixes
- PSP fw loading sequence updates
- Drop DRIVER_USE_AGP
- Remove legacy drm load and unload callbacks
- ACP Kconfig fix
- Lots of fixes across the driver
amdkfd:
- runtime pm support
- more gfx config details in amdgpu
radeon:
- drop DRIVER_USE_AGP
vmwgfx:
- Disable DMA when SEV encryption in use
- Shader Model 5 support - needed for GL4 support
msm:
- DPU resource manager refactor
- dpu using atomic global state
mediatek:
- MT8183 DPI support
etnaviv:
- out-of-bounds read fix
- expose feature flags for GC400 STM32MP1 SoC
- runtime suspend entry fix
- dma32 zone fix
hisilicon:
- mode selection fixes
meson:
- YUV420 support
lima:
- add support for heap buffers
tinydrm:
- removal of owner field
- explicit DT dependency removal
- YAML schema conversion
tegra:
- misc cleanups
tidss:
- new driver
virtio:
- better batching of notifications to host
- memory handling reworked
- shmem + gpu context fixes
hibmc:
- add gamma_set support
- improve DPMS support
pl111:
- Integrator IM-PD1 support
sun4i:
- LVDS support for A20 + A33
- DSI panel handling improvements"
* tag 'drm-next-2020-04-01' of git://anongit.freedesktop.org/drm/drm: (1537 commits)
drm/i915/display: Fix mode private_flags comparison at atomic_check
drm/i915/gt: Stage the transfer of the virtual breadcrumb
drm/i915/gt: Select the deepest available parking mode for rc6
drm/i915: Avoid live-lock with i915_vma_parked()
drm/i915/gt: Treat idling as a RPS downclock event
drm/i915/gt: Cancel a hung context if already closed
drm/i915: Use explicit flag to mark unreachable intel_context
drm/amdgpu: don't try to reserve training bo for sriov (v2)
drm/amdgpu/smu11: add support for SMU AC/DC interrupts
drm/amdgpu/swSMU: handle manual AC/DC notifications
drm/amdgpu/swSMU: handle DC controlled by GPIO for navi1x
drm/amdgpu/swSMU: set AC/DC mode based on the current system state (v2)
drm/amdgpu/swSMU: correct the bootup power source for Navi1X (v2)
drm/amdgpu/swSMU: use the smu11 power source helper for navi1x
drm/amdgpu/smu11: add a helper to set the power source
drm/amd/swSMU: add callback to set AC/DC power source (v2)
drm/scheduler: fix rare NULL ptr race
drm/amdgpu: fix the coverage issue to clear ArcVPGRs
drm/amd/display: Fix pageflip event race condition for DCN.
drm/[radeon|amdgpu]: Remove HAINAN board from max_sclk override check
...
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 250 |
1 files changed, 246 insertions, 4 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index a97af422575a..95b3327168ac 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -26,7 +26,12 @@ #include "amdgpu_xgmi.h" #include "amdgpu_smu.h" #include "amdgpu_ras.h" +#include "soc15.h" #include "df/df_3_6_offset.h" +#include "xgmi/xgmi_4_0_0_smn.h" +#include "xgmi/xgmi_4_0_0_sh_mask.h" +#include "wafl/wafl2_4_0_0_smn.h" +#include "wafl/wafl2_4_0_0_sh_mask.h" static DEFINE_MUTEX(xgmi_mutex); @@ -36,6 +41,109 @@ static DEFINE_MUTEX(xgmi_mutex); static struct amdgpu_hive_info xgmi_hives[AMDGPU_MAX_XGMI_HIVE]; static unsigned hive_count = 0; +static const int xgmi_pcs_err_status_reg_vg20[] = { + smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS, + smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000, +}; + +static const int wafl_pcs_err_status_reg_vg20[] = { + smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, + smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000, +}; + +static const int xgmi_pcs_err_status_reg_arct[] = { + smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS, + smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000, + smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x500000, + smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x600000, + smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x700000, + smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x800000, +}; + +/* same as vg20*/ +static const int wafl_pcs_err_status_reg_arct[] = { + smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, + smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000, +}; + +static const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields[] = { + {"XGMI PCS DataLossErr", + SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataLossErr)}, + {"XGMI PCS TrainingErr", + SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TrainingErr)}, + {"XGMI PCS CRCErr", + SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, CRCErr)}, + {"XGMI PCS BERExceededErr", + SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, BERExceededErr)}, + {"XGMI PCS TxMetaDataErr", + SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TxMetaDataErr)}, + {"XGMI PCS ReplayBufParityErr", + SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayBufParityErr)}, + {"XGMI PCS DataParityErr", + SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataParityErr)}, + {"XGMI PCS ReplayFifoOverflowErr", + SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoOverflowErr)}, + {"XGMI PCS ReplayFifoUnderflowErr", + SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)}, + {"XGMI PCS ElasticFifoOverflowErr", + SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ElasticFifoOverflowErr)}, + {"XGMI PCS DeskewErr", + SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DeskewErr)}, + {"XGMI PCS DataStartupLimitErr", + SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataStartupLimitErr)}, + {"XGMI PCS FCInitTimeoutErr", + SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, FCInitTimeoutErr)}, + {"XGMI PCS RecoveryTimeoutErr", + SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryTimeoutErr)}, + {"XGMI PCS ReadySerialTimeoutErr", + SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialTimeoutErr)}, + {"XGMI PCS ReadySerialAttemptErr", + SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialAttemptErr)}, + {"XGMI PCS RecoveryAttemptErr", + SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryAttemptErr)}, + {"XGMI PCS RecoveryRelockAttemptErr", + SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)}, +}; + +static const struct amdgpu_pcs_ras_field wafl_pcs_ras_fields[] = { + {"WAFL PCS DataLossErr", + SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataLossErr)}, + {"WAFL PCS TrainingErr", + SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TrainingErr)}, + {"WAFL PCS CRCErr", + SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, CRCErr)}, + {"WAFL PCS BERExceededErr", + SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, BERExceededErr)}, + {"WAFL PCS TxMetaDataErr", + SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TxMetaDataErr)}, + {"WAFL PCS ReplayBufParityErr", + SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayBufParityErr)}, + {"WAFL PCS DataParityErr", + SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataParityErr)}, + {"WAFL PCS ReplayFifoOverflowErr", + SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoOverflowErr)}, + {"WAFL PCS ReplayFifoUnderflowErr", + SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)}, + {"WAFL PCS ElasticFifoOverflowErr", + SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ElasticFifoOverflowErr)}, + {"WAFL PCS DeskewErr", + SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DeskewErr)}, + {"WAFL PCS DataStartupLimitErr", + SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataStartupLimitErr)}, + {"WAFL PCS FCInitTimeoutErr", + SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, FCInitTimeoutErr)}, + {"WAFL PCS RecoveryTimeoutErr", + SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryTimeoutErr)}, + {"WAFL PCS ReadySerialTimeoutErr", + SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialTimeoutErr)}, + {"WAFL PCS ReadySerialAttemptErr", + SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialAttemptErr)}, + {"WAFL PCS RecoveryAttemptErr", + SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryAttemptErr)}, + {"WAFL PCS RecoveryRelockAttemptErr", + SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)}, +}; + void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info *hive) { return &hive->device_list; @@ -365,6 +473,13 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) return 0; if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) { + ret = psp_xgmi_initialize(&adev->psp); + if (ret) { + dev_err(adev->dev, + "XGMI: Failed to initialize xgmi session\n"); + return ret; + } + ret = psp_xgmi_get_hive_id(&adev->psp, &adev->gmc.xgmi.hive_id); if (ret) { dev_err(adev->dev, @@ -451,16 +566,16 @@ exit: return ret; } -void amdgpu_xgmi_remove_device(struct amdgpu_device *adev) +int amdgpu_xgmi_remove_device(struct amdgpu_device *adev) { struct amdgpu_hive_info *hive; if (!adev->gmc.xgmi.supported) - return; + return -EINVAL; hive = amdgpu_get_xgmi_hive(adev, 1); if (!hive) - return; + return -EINVAL; if (!(hive->number_devices--)) { amdgpu_xgmi_sysfs_destroy(adev, hive); @@ -471,6 +586,8 @@ void amdgpu_xgmi_remove_device(struct amdgpu_device *adev) amdgpu_xgmi_sysfs_rem_dev_info(adev, hive); mutex_unlock(&hive->hive_lock); } + + return psp_xgmi_terminate(&adev->psp); } int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev) @@ -481,7 +598,6 @@ int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev) }; struct ras_fs_if fs_info = { .sysfs_name = "xgmi_wafl_err_count", - .debugfs_name = "xgmi_wafl_err_inject", }; if (!adev->gmc.xgmi.supported || @@ -521,3 +637,129 @@ void amdgpu_xgmi_ras_fini(struct amdgpu_device *adev) kfree(ras_if); } } + +uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev, + uint64_t addr) +{ + uint32_t df_inst_id; + uint64_t dram_base_addr = 0; + const struct amdgpu_df_funcs *df_funcs = adev->df.funcs; + + if ((!df_funcs) || + (!df_funcs->get_df_inst_id) || + (!df_funcs->get_dram_base_addr)) { + dev_warn(adev->dev, + "XGMI: relative phy_addr algorithm is not supported\n"); + return addr; + } + + if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) { + dev_warn(adev->dev, + "failed to disable DF-Cstate, DF register may not be accessible\n"); + return addr; + } + + df_inst_id = df_funcs->get_df_inst_id(adev); + dram_base_addr = df_funcs->get_dram_base_addr(adev, df_inst_id); + + if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW)) + dev_warn(adev->dev, "failed to enable DF-Cstate\n"); + + return addr + dram_base_addr; +} + +static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev, + uint32_t value, + uint32_t *ue_count, + uint32_t *ce_count, + bool is_xgmi_pcs) +{ + int i; + int ue_cnt; + + if (is_xgmi_pcs) { + /* query xgmi pcs error status, + * only ue is supported */ + for (i = 0; i < ARRAY_SIZE(xgmi_pcs_ras_fields); i ++) { + ue_cnt = (value & + xgmi_pcs_ras_fields[i].pcs_err_mask) >> + xgmi_pcs_ras_fields[i].pcs_err_shift; + if (ue_cnt) { + dev_info(adev->dev, "%s detected\n", + xgmi_pcs_ras_fields[i].err_name); + *ue_count += ue_cnt; + } + } + } else { + /* query wafl pcs error status, + * only ue is supported */ + for (i = 0; i < ARRAY_SIZE(wafl_pcs_ras_fields); i++) { + ue_cnt = (value & + wafl_pcs_ras_fields[i].pcs_err_mask) >> + wafl_pcs_ras_fields[i].pcs_err_shift; + if (ue_cnt) { + dev_info(adev->dev, "%s detected\n", + wafl_pcs_ras_fields[i].err_name); + *ue_count += ue_cnt; + } + } + } + + return 0; +} + +int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev, + void *ras_error_status) +{ + struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; + int i; + uint32_t data; + uint32_t ue_cnt = 0, ce_cnt = 0; + + if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL)) + return -EINVAL; + + err_data->ue_count = 0; + err_data->ce_count = 0; + + switch (adev->asic_type) { + case CHIP_ARCTURUS: + /* check xgmi pcs error */ + for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++) { + data = RREG32_PCIE(xgmi_pcs_err_status_reg_arct[i]); + if (data) + amdgpu_xgmi_query_pcs_error_status(adev, + data, &ue_cnt, &ce_cnt, true); + } + /* check wafl pcs error */ + for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_arct); i++) { + data = RREG32_PCIE(wafl_pcs_err_status_reg_arct[i]); + if (data) + amdgpu_xgmi_query_pcs_error_status(adev, + data, &ue_cnt, &ce_cnt, false); + } + break; + case CHIP_VEGA20: + default: + /* check xgmi pcs error */ + for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++) { + data = RREG32_PCIE(xgmi_pcs_err_status_reg_vg20[i]); + if (data) + amdgpu_xgmi_query_pcs_error_status(adev, + data, &ue_cnt, &ce_cnt, true); + } + /* check wafl pcs error */ + for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_vg20); i++) { + data = RREG32_PCIE(wafl_pcs_err_status_reg_vg20[i]); + if (data) + amdgpu_xgmi_query_pcs_error_status(adev, + data, &ue_cnt, &ce_cnt, false); + } + break; + } + + err_data->ue_count += ue_cnt; + err_data->ce_count += ce_cnt; + + return 0; +} |