diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-08-05 19:50:06 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-08-05 19:50:06 -0700 |
commit | 8186749621ed6b8fc42644c399e8c755a2b6f630 (patch) | |
tree | 3a1db67415da013e5dd481367c77db21e491edfb /drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm | |
parent | e4a7b2dc35d9582c253cf5e6d6c3605aabc7284d (diff) | |
parent | dc100bc8fae59aafd2ea2e1a1a43ef1f65f8a8bc (diff) |
Merge tag 'drm-next-2020-08-06' of git://anongit.freedesktop.org/drm/drm
Pull drm updates from Dave Airlie:
"New xilinx displayport driver, AMD support for two new GPUs (more
header files), i915 initial support for RocketLake and some work on
their DG1 (discrete chip).
The core also grew some lockdep annotations to try and constrain what
drivers do with dma-fences, and added some documentation on why the
idea of indefinite fences doesn't work.
The long list is below.
I do have some fixes trees outstanding, but I'll follow up with those
later.
core:
- add user def flag to cmd line modes
- dma_fence_wait added might_sleep
- dma-fence lockdep annotations
- indefinite fences are bad documentation
- gem CMA functions used in more drivers
- struct mutex removal
- more drm_ debug macro usage
- set/drop master api fixes
- fix for drm/mm hole size comparison
- drm/mm remove invalid entry optimization
- optimise drm/mm hole handling
- VRR debugfs added
- uncompressed AFBC modifier support
- multiple display id blocks in EDID
- multiple driver sg handling fixes
- __drm_atomic_helper_crtc_reset in all drivers
- managed vram helpers
ttm:
- ttm_mem_reg handling cleanup
- remove bo offset field
- drop CMA memtype flag
- drop mappable flag
xilinx:
- New Xilinx ZynqMP DisplayPort Subsystem driver
nouveau:
- add CRC support
- start using NVIDIA published class header files
- convert all push buffer emission to new macros
- Proper push buffer space management for EVO/NVD channels.
- firmware loading fixes
- 2MiB system memory pages support on Pascal and newer
vkms:
- larger cursor support
i915:
- Rocketlake platform enablement
- Early DG1 enablement
- Numerous GEM refactorings
- DP MST fixes
- FBC, PSR, Cursor, Color, Gamma fixes
- TGL, RKL, EHL workaround updates
- TGL 8K display support fixes
- SDVO/HDMI/DVI fixes
amdgpu:
- Initial support for Sienna Cichlid GPU
- Initial support for Navy Flounder GPU
- SI UVD/VCE support
- expose rotation property
- Add support for unique id on Arcturus
- Enable runtime PM on vega10 boards that support BACO
- Skip BAR resizing if the bios already did id
- Major swSMU code cleanup
- Fixes for DCN bandwidth calculations
amdkfd:
- Track SDMA usage per process
- SMI events interface
radeon:
- Default to on chip GART for AGP boards on all arches
- Runtime PM reference count fixes
msm:
- headers regenerated causing churn
- a650/a640 display and GPU enablement
- dpu dither support for 6bpc panels
- dpu cursor fix
- dsi/mdp5 enablement for sdm630/sdm636/sdm66
tegra:
- video capture prep support
- reflection support
mediatek:
- convert mtk_dsi to bridge API
meson:
- FBC support
sun4i:
- iommu support
rockchip:
- register locking fix
- per-pixel alpha support PX30 VOP
mgag200:
- ported to simple and shmem helpers
- device init cleanups
- use managed pci functions
- dropped hw cursor support
ast:
- use managed pci functions
- use managed VRAM helpers
- rework cursor support
malidp:
- dev_groups support
hibmc:
- refactor hibmc_drv_vdac:
vc4:
- create TXP CRTC
imx:
- error path fixes and cleanups
etnaviv:
- clock handling and error handling cleanups
- use pin_user_pages"
* tag 'drm-next-2020-08-06' of git://anongit.freedesktop.org/drm/drm: (1747 commits)
drm/msm: use kthread_create_worker instead of kthread_run
drm/msm/mdp5: Add MDP5 configuration for SDM636/660
drm/msm/dsi: Add DSI configuration for SDM660
drm/msm/mdp5: Add MDP5 configuration for SDM630
drm/msm/dsi: Add phy configuration for SDM630/636/660
drm/msm/a6xx: add A640/A650 hwcg
drm/msm/a6xx: hwcg tables in gpulist
drm/msm/dpu: add SM8250 to hw catalog
drm/msm/dpu: add SM8150 to hw catalog
drm/msm/dpu: intf timing path for displayport
drm/msm/dpu: set missing flush bits for INTF_2 and INTF_3
drm/msm/dpu: don't use INTF_INPUT_CTRL feature on sdm845
drm/msm/dpu: move some sspp caps to dpu_caps
drm/msm/dpu: update UBWC config for sm8150 and sm8250
drm/msm/dpu: use right setup_blend_config for sm8150 and sm8250
drm/msm/a6xx: set ubwc config for A640 and A650
drm/msm/adreno: un-open-code some packets
drm/msm: sync generated headers
drm/msm/a6xx: add build_bw_table for A640/A650
drm/msm/a6xx: fix crashstate capture for A650
...
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm')
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm | 301 |
1 files changed, 212 insertions, 89 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm index 4433bda2ce25..5b220f2a7501 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm @@ -20,6 +20,21 @@ * OTHER DEALINGS IN THE SOFTWARE. */ +/* To compile this assembly code: + * + * Navi1x: + * cpp -DASIC_TARGET_NAVI1X=1 cwsr_trap_handler_gfx10.asm -P -o nv1x.sp3 + * sp3-nv1x nv1x.sp3 -hex nv1x.hex + * + * Others: + * cpp -DASIC_TARGET_NAVI1X=0 cwsr_trap_handler_gfx10.asm -P -o gfx10.sp3 + * sp3-gfx10 gfx10.sp3 -hex gfx10.hex + */ + +#define NO_SQC_STORE !ASIC_TARGET_NAVI1X + +var SINGLE_STEP_MISSED_WORKAROUND = 1 //workaround for lost MODE.DEBUG_EN exception when SAVECTX raised + var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23 var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000 var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006 @@ -59,6 +74,8 @@ var SQ_WAVE_IB_STS_RCNT_SIZE = 6 var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK = 0x003F8000 var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF +var SQ_WAVE_MODE_DEBUG_EN_MASK = 0x800 + var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24 var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27 @@ -96,17 +113,19 @@ var s_save_pc_hi = ttmp1 var s_save_exec_lo = ttmp2 var s_save_exec_hi = ttmp3 var s_save_status = ttmp12 -var s_save_trapsts = ttmp5 -var s_save_xnack_mask = ttmp6 +var s_save_trapsts = ttmp15 +var s_save_xnack_mask = s_save_trapsts var s_wave_size = ttmp7 var s_save_buf_rsrc0 = ttmp8 var s_save_buf_rsrc1 = ttmp9 var s_save_buf_rsrc2 = ttmp10 var s_save_buf_rsrc3 = ttmp11 -var s_save_mem_offset = ttmp14 +var s_save_mem_offset = ttmp4 var s_save_alloc_size = s_save_trapsts -var s_save_tmp = s_save_buf_rsrc2 -var s_save_m0 = ttmp15 +var s_save_tmp = ttmp14 +var s_save_m0 = ttmp5 +var s_save_ttmps_lo = s_save_tmp +var s_save_ttmps_hi = s_save_trapsts var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC @@ -128,23 +147,25 @@ var s_restore_spi_init_lo = exec_lo var s_restore_spi_init_hi = exec_hi var s_restore_mem_offset = ttmp12 var s_restore_alloc_size = ttmp3 -var s_restore_tmp = ttmp6 +var s_restore_tmp = ttmp2 var s_restore_mem_offset_save = s_restore_tmp var s_restore_m0 = s_restore_alloc_size var s_restore_mode = ttmp7 -var s_restore_flat_scratch = ttmp2 +var s_restore_flat_scratch = s_restore_tmp var s_restore_pc_lo = ttmp0 var s_restore_pc_hi = ttmp1 -var s_restore_exec_lo = ttmp14 -var s_restore_exec_hi = ttmp15 -var s_restore_status = ttmp4 -var s_restore_trapsts = ttmp5 +var s_restore_exec_lo = ttmp4 +var s_restore_exec_hi = ttmp5 +var s_restore_status = ttmp14 +var s_restore_trapsts = ttmp15 var s_restore_xnack_mask = ttmp13 var s_restore_buf_rsrc0 = ttmp8 var s_restore_buf_rsrc1 = ttmp9 var s_restore_buf_rsrc2 = ttmp10 var s_restore_buf_rsrc3 = ttmp11 -var s_restore_size = ttmp7 +var s_restore_size = ttmp6 +var s_restore_ttmps_lo = s_restore_tmp +var s_restore_ttmps_hi = s_restore_alloc_size shader main asic(DEFAULT) @@ -159,6 +180,24 @@ L_JUMP_TO_RESTORE: L_SKIP_RESTORE: s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK + +if SINGLE_STEP_MISSED_WORKAROUND + // No single step exceptions if MODE.DEBUG_EN=0. + s_getreg_b32 ttmp2, hwreg(HW_REG_MODE) + s_and_b32 ttmp2, ttmp2, SQ_WAVE_MODE_DEBUG_EN_MASK + s_cbranch_scc0 L_NO_SINGLE_STEP_WORKAROUND + + // Second-level trap already handled exception if STATUS.HALT=1. + s_and_b32 ttmp2, s_save_status, SQ_WAVE_STATUS_HALT_MASK + + // Prioritize single step exception over context save. + // Second-level trap will halt wave and RFE, re-entering for SAVECTX. + s_cbranch_scc0 L_FETCH_2ND_TRAP + +L_NO_SINGLE_STEP_WORKAROUND: +end + + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save s_cbranch_scc1 L_SAVE @@ -170,6 +209,8 @@ L_SKIP_RESTORE: s_or_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_HALT_MASK L_FETCH_2ND_TRAP: + +#if ASIC_TARGET_NAVI1X // Preserve and clear scalar XNACK state before issuing scalar loads. // Save IB_STS.REPLAY_W64H[25], RCNT[21:16], FIRST_REPLAY[15] into // unused space ttmp11[31:24]. @@ -183,6 +224,7 @@ L_FETCH_2ND_TRAP: s_or_b32 ttmp11, ttmp11, ttmp3 s_andn2_b32 ttmp2, ttmp2, (SQ_WAVE_IB_STS_REPLAY_W64H_MASK | SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK) s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 +#endif // Read second-level TBA/TMA from first-level TMA and jump if available. // ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data) @@ -207,6 +249,7 @@ L_NO_NEXT_TRAP: L_EXCP_CASE: s_and_b32 ttmp1, ttmp1, 0xFFFF +#if ASIC_TARGET_NAVI1X // Restore SQ_WAVE_IB_STS. s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) s_and_b32 ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK @@ -214,6 +257,7 @@ L_EXCP_CASE: s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_REPLAY_W64H_MASK s_or_b32 ttmp2, ttmp2, ttmp3 s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 +#endif // Restore SQ_WAVE_STATUS. s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 @@ -223,23 +267,11 @@ L_EXCP_CASE: s_rfe_b64 [ttmp0, ttmp1] L_SAVE: - //check whether there is mem_viol - s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) - s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK - s_cbranch_scc0 L_NO_PC_REWIND - - //if so, need rewind PC assuming GDS operation gets NACKed - s_mov_b32 s_save_tmp, 0 - s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp //clear mem_viol bit s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] - s_sub_u32 s_save_pc_lo, s_save_pc_lo, 8 //pc[31:0]-8 - s_subb_u32 s_save_pc_hi, s_save_pc_hi, 0x0 - -L_NO_PC_REWIND: s_mov_b32 s_save_tmp, 0 s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit - s_getreg_b32 s_save_xnack_mask, hwreg(HW_REG_SHADER_XNACK_MASK) +#if ASIC_TARGET_NAVI1X s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp @@ -253,6 +285,7 @@ L_NO_PC_REWIND: s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp +#endif /* inform SPI the readiness and wait for SPI's go signal */ s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI @@ -261,12 +294,31 @@ L_NO_PC_REWIND: s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC +#if ASIC_TARGET_NAVI1X L_SLEEP: // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause // SQ hang, since the 7,8th wave could not get arbit to exec inst, while // other waves are stuck into the sleep-loop and waiting for wrexec!=0 s_sleep 0x2 s_cbranch_execz L_SLEEP +#else + s_waitcnt lgkmcnt(0) +#endif + + // Save trap temporaries 4-11, 13 initialized by SPI debug dispatch logic + // ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40 + get_wave_size(s_save_ttmps_hi) + get_vgpr_size_bytes(s_save_ttmps_lo, s_save_ttmps_hi) + s_and_b32 s_save_ttmps_hi, s_save_spi_init_hi, 0xFFFF + s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, get_sgpr_size_bytes() + s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_spi_init_lo + s_addc_u32 s_save_ttmps_hi, s_save_ttmps_hi, 0x0 + +#if ASIC_TARGET_NAVI1X + s_store_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x50 glc:1 + s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x60 glc:1 + s_store_dword ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x74 glc:1 +#endif /* setup Resource Contants */ s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo @@ -285,9 +337,45 @@ L_SLEEP: /* global mem offset */ s_mov_b32 s_save_mem_offset, 0x0 - s_getreg_b32 s_wave_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE) - s_lshl_b32 s_wave_size, s_wave_size, S_WAVE_SIZE - s_or_b32 s_wave_size, s_save_spi_init_hi, s_wave_size //share s_wave_size with exec_hi, it's at bit25 + get_wave_size(s_wave_size) + +#if ASIC_TARGET_NAVI1X + // Save and clear vector XNACK state late to free up SGPRs. + s_getreg_b32 s_save_xnack_mask, hwreg(HW_REG_SHADER_XNACK_MASK) + s_setreg_imm32_b32 hwreg(HW_REG_SHADER_XNACK_MASK), 0x0 +#endif + + /* save first 4 VGPRs, needed for SGPR save */ + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on + s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE + s_and_b32 m0, m0, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_ENABLE_SAVE_4VGPR_EXEC_HI + s_mov_b32 exec_hi, 0x00000000 + s_branch L_SAVE_4VGPR_WAVE32 +L_ENABLE_SAVE_4VGPR_EXEC_HI: + s_mov_b32 exec_hi, 0xFFFFFFFF + s_branch L_SAVE_4VGPR_WAVE64 +L_SAVE_4VGPR_WAVE32: + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + + // VGPR Allocated in 4-GPR granularity + + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*3 + s_branch L_SAVE_HWREG + +L_SAVE_4VGPR_WAVE64: + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + + // VGPR Allocated in 4-GPR granularity + + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 /* save HW registers */ @@ -300,6 +388,13 @@ L_SAVE_HWREG: s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes +#if NO_SQC_STORE + v_mov_b32 v0, 0x0 //Offset[31:0] from buffer resource + v_mov_b32 v1, 0x0 //Offset[63:32] from buffer resource + v_mov_b32 v2, 0x0 //Set of SGPRs for TCP store + s_mov_b32 m0, 0x0 //Next lane of v2 to write to +#endif + write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset) write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset) @@ -307,8 +402,10 @@ L_SAVE_HWREG: write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset) write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset) - s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) - write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset) + s_getreg_b32 s_save_tmp, hwreg(HW_REG_TRAPSTS) + write_hwreg_to_mem(s_save_tmp, s_save_buf_rsrc0, s_save_mem_offset) + + // Not used on Sienna_Cichlid but keep layout same for debugger. write_hwreg_to_mem(s_save_xnack_mask, s_save_buf_rsrc0, s_save_mem_offset) s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) @@ -320,10 +417,11 @@ L_SAVE_HWREG: s_getreg_b32 s_save_m0, hwreg(HW_REG_SHADER_FLAT_SCRATCH_HI) write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) - /* the first wave in the threadgroup */ - s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK - s_mov_b32 s_save_exec_hi, 0x0 - s_or_b32 s_save_exec_hi, s_save_tmp, s_save_exec_hi // save first wave bit to s_save_exec_hi.bits[26] +#if NO_SQC_STORE + // Write HWREG/SGPRs with 32 VGPR lanes, wave32 is common case. + s_mov_b32 exec_hi, 0x0 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 +#endif /* save SGPRs */ // Save SGPR before LDS save, then the s0 to s4 can be used during LDS save... @@ -334,10 +432,14 @@ L_SAVE_HWREG: s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes +#if NO_SQC_STORE + s_mov_b32 ttmp13, 0x0 //next VGPR lane to copy SGPR into +#else // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0 s_mov_b32 s_save_xnack_mask, s_save_buf_rsrc0 s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset s_addc_u32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0 +#endif s_mov_b32 m0, 0x0 //SGPR initial index value =0 s_nop 0x0 //Manually inserted wait states @@ -353,6 +455,18 @@ L_SAVE_SGPR_LOOP: s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0] write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) + +#if NO_SQC_STORE + s_cmp_eq_u32 ttmp13, 0x20 //have 32 VGPR lanes filled? + s_cbranch_scc0 L_SAVE_SGPR_SKIP_TCP_STORE + + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + s_add_u32 s_save_mem_offset, s_save_mem_offset, 0x80 + s_mov_b32 ttmp13, 0x0 + v_mov_b32 v2, 0x0 +L_SAVE_SGPR_SKIP_TCP_STORE: +#endif + s_add_u32 m0, m0, 16 //next sgpr index s_cmp_lt_u32 m0, 96 //scc = (m0 < first 96 SGPR) ? 1 : 0 s_cbranch_scc1 L_SAVE_SGPR_LOOP //first 96 SGPR save is complete? @@ -366,43 +480,12 @@ L_SAVE_SGPR_LOOP: s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0] write_12sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) +#if NO_SQC_STORE + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 +#else // restore s_save_buf_rsrc0,1 s_mov_b32 s_save_buf_rsrc0, s_save_xnack_mask - - /* save first 4 VGPR, then LDS save could use */ - // each wave will alloc 4 vgprs at least... - - s_mov_b32 s_save_mem_offset, 0 - s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on - s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE - s_and_b32 m0, m0, 1 - s_cmp_eq_u32 m0, 1 - s_cbranch_scc1 L_ENABLE_SAVE_4VGPR_EXEC_HI - s_mov_b32 exec_hi, 0x00000000 - s_branch L_SAVE_4VGPR_WAVE32 -L_ENABLE_SAVE_4VGPR_EXEC_HI: - s_mov_b32 exec_hi, 0xFFFFFFFF - s_branch L_SAVE_4VGPR_WAVE64 -L_SAVE_4VGPR_WAVE32: - s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - - // VGPR Allocated in 4-GPR granularity - - buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 - buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128 - buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*2 - buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*3 - s_branch L_SAVE_LDS - -L_SAVE_4VGPR_WAVE64: - s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - - // VGPR Allocated in 4-GPR granularity - - buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 - buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 - buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 - buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 +#endif /* save LDS */ @@ -423,7 +506,7 @@ L_SAVE_LDS_NORMAL: s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_DONE s_barrier //LDS is used? wait for other waves in the same TG - s_and_b32 s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK + s_and_b32 s_save_tmp, s_wave_size, S_SAVE_SPI_INIT_FIRST_WAVE_MASK s_cbranch_scc0 L_SAVE_LDS_DONE // first wave do LDS save; @@ -598,9 +681,7 @@ L_RESTORE: s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE //determine it is wave32 or wave64 - s_getreg_b32 s_restore_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE) - s_lshl_b32 s_restore_size, s_restore_size, S_WAVE_SIZE - s_or_b32 s_restore_size, s_restore_spi_init_hi, s_restore_size + get_wave_size(s_restore_size) s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK s_cbranch_scc0 L_RESTORE_VGPR @@ -634,7 +715,7 @@ L_RESTORE_LDS_NORMAL: s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE + s_lshr_b32 m0, s_restore_size, S_WAVE_SIZE s_and_b32 m0, m0, 1 s_cmp_eq_u32 m0, 1 s_mov_b32 m0, 0x0 @@ -842,38 +923,55 @@ L_RESTORE_HWREG: s_setreg_b32 hwreg(HW_REG_SHADER_FLAT_SCRATCH_HI), s_restore_flat_scratch - s_mov_b32 s_restore_tmp, s_restore_pc_hi - s_and_b32 s_restore_pc_hi, s_restore_tmp, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS - s_mov_b32 m0, s_restore_m0 s_mov_b32 exec_lo, s_restore_exec_lo s_mov_b32 exec_hi, s_restore_exec_hi s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0 + +#if ASIC_TARGET_NAVI1X s_setreg_b32 hwreg(HW_REG_SHADER_XNACK_MASK), s_restore_xnack_mask +#endif + s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0 s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode - s_and_b32 s_restore_m0, s_restore_tmp, S_SAVE_PC_HI_RCNT_MASK + + // Restore trap temporaries 4-11, 13 initialized by SPI debug dispatch logic + // ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40 + get_vgpr_size_bytes(s_restore_ttmps_lo, s_restore_size) + s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, get_sgpr_size_bytes() + s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_buf_rsrc0 + s_addc_u32 s_restore_ttmps_hi, s_restore_buf_rsrc1, 0x0 + s_and_b32 s_restore_ttmps_hi, s_restore_ttmps_hi, 0xFFFF + s_load_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x50 glc:1 + s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x60 glc:1 + s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x74 glc:1 + s_waitcnt lgkmcnt(0) + +#if ASIC_TARGET_NAVI1X + s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT - s_mov_b32 s_restore_mode, 0x0 - s_or_b32 s_restore_mode, s_restore_mode, s_restore_m0 - s_and_b32 s_restore_m0, s_restore_tmp, S_SAVE_PC_HI_FIRST_REPLAY_MASK + s_mov_b32 s_restore_tmp, 0x0 + s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 + s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT - s_or_b32 s_restore_mode, s_restore_mode, s_restore_m0 - s_and_b32 s_restore_m0, s_restore_tmp, S_SAVE_PC_HI_REPLAY_W64H_MASK + s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 + s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_REPLAY_W64H_MASK s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_REPLAY_W64H_SHIFT s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT - s_or_b32 s_restore_mode, s_restore_mode, s_restore_m0 + s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT - s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_mode + s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp +#endif + s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu @@ -887,32 +985,53 @@ L_END_PGM: end function write_hwreg_to_mem(s, s_rsrc, s_mem_offset) +#if NO_SQC_STORE + // Copy into VGPR for later TCP store. + v_writelane_b32 v2, s, m0 + s_add_u32 m0, m0, 0x1 +#else s_mov_b32 exec_lo, m0 s_mov_b32 m0, s_mem_offset s_buffer_store_dword s, s_rsrc, m0 glc:1 s_add_u32 s_mem_offset, s_mem_offset, 4 s_mov_b32 m0, exec_lo +#endif end function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset) +#if NO_SQC_STORE + // Copy into VGPR for later TCP store. + for var sgpr_idx = 0; sgpr_idx < 16; sgpr_idx ++ + v_writelane_b32 v2, s[sgpr_idx], ttmp13 + s_add_u32 ttmp13, ttmp13, 0x1 + end +#else s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1 s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1 s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1 s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1 s_add_u32 s_rsrc[0], s_rsrc[0], 4*16 s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 +#endif end function write_12sgpr_to_mem(s, s_rsrc, s_mem_offset) +#if NO_SQC_STORE + // Copy into VGPR for later TCP store. + for var sgpr_idx = 0; sgpr_idx < 12; sgpr_idx ++ + v_writelane_b32 v2, s[sgpr_idx], ttmp13 + s_add_u32 ttmp13, ttmp13, 0x1 + end +#else s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1 s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1 s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1 s_add_u32 s_rsrc[0], s_rsrc[0], 4*12 s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 +#endif end - function read_hwreg_from_mem(s, s_rsrc, s_mem_offset) s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1 s_add_u32 s_mem_offset, s_mem_offset, 4 @@ -942,9 +1061,7 @@ end function get_vgpr_size_bytes(s_vgpr_size_byte, s_size) s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1 - s_lshr_b32 m0, s_size, S_WAVE_SIZE - s_and_b32 m0, m0, 1 - s_cmp_eq_u32 m0, 1 + s_bitcmp1_b32 s_size, S_WAVE_SIZE s_cbranch_scc1 L_ENABLE_SHIFT_W64 s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+7) //Number of VGPRs = (vgpr_size + 1) * 4 * 32 * 4 (non-zero value) s_branch L_SHIFT_DONE @@ -965,3 +1082,9 @@ end function get_hwreg_size_bytes return 128 end + +function get_wave_size(s_reg) + s_getreg_b32 s_reg, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE) + s_lshl_b32 s_reg, s_reg, S_WAVE_SIZE + s_or_b32 s_reg, s_save_spi_init_hi, s_reg //share with exec_hi, it's at bit25 +end |