diff options
author | Jay Cornwall <Jay.Cornwall@amd.com> | 2019-07-01 15:46:56 -0500 |
---|---|---|
committer | Alex Deucher <alexander.deucher@amd.com> | 2019-07-18 14:18:06 -0500 |
commit | 37f86a9b3617d55ad8189e1b7e6468b85dba4b88 (patch) | |
tree | a2525b2957285fbb0ecf091c41455148ec44c953 /drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm | |
parent | 5ddd4a9a7c25a6a23a79f973e7a87b1403503719 (diff) |
drm/amdkfd: Merge gfx9/arcturus trap handlers, add ACC VGPR save
ACC VGPRs are a secondary VGPR set of same size as the primary VGPRs.
Save them as a block immediately following VGPRs.
Signed-off-by: Jay Cornwall <Jay.Cornwall@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm')
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm | 83 |
1 files changed, 81 insertions, 2 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm index 6bae2e022c6e..871f2d431a44 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm @@ -197,13 +197,15 @@ var s_restore_spi_init_lo = exec_lo var s_restore_spi_init_hi = exec_hi var s_restore_mem_offset = ttmp12 +var s_restore_accvgpr_offset = ttmp13 var s_restore_alloc_size = ttmp3 var s_restore_tmp = ttmp2 var s_restore_mem_offset_save = s_restore_tmp //no conflict +var s_restore_accvgpr_offset_save = ttmp7 var s_restore_m0 = s_restore_alloc_size //no conflict -var s_restore_mode = ttmp7 +var s_restore_mode = s_restore_accvgpr_offset_save var s_restore_pc_lo = ttmp0 var s_restore_pc_hi = ttmp1 @@ -226,7 +228,7 @@ var s_restore_ttmps_hi = s_restore_alloc_size //no conflict /* Shader Main*/ shader main - asic(GFX9) + asic(DEFAULT) type(CS) @@ -791,10 +793,48 @@ end L_SAVE_VGPR_END: +if ASIC_TARGET_ARCTURUS + // Save ACC VGPRs + s_mov_b32 m0, 0x0 //VGPR initial index value =0 + s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1 +if SAVE_AFTER_XNACK_ERROR + check_if_tcp_store_ok() + s_cbranch_scc1 L_SAVE_ACCVGPR_LOOP +L_SAVE_ACCVGPR_LOOP_SQC: + for var vgpr = 0; vgpr < 4; ++ vgpr + v_accvgpr_read v[vgpr], acc[vgpr] // v[N] = acc[N+m0] + end + + write_vgprs_to_mem_with_sqc(v0, 4, s_save_buf_rsrc0, s_save_mem_offset) + + s_add_u32 m0, m0, 4 + s_cmp_lt_u32 m0, s_save_alloc_size + s_cbranch_scc1 L_SAVE_ACCVGPR_LOOP_SQC + s_set_gpr_idx_off + s_branch L_SAVE_ACCVGPR_END +end +L_SAVE_ACCVGPR_LOOP: + for var vgpr = 0; vgpr < 4; ++ vgpr + v_accvgpr_read v[vgpr], acc[vgpr] // v[N] = acc[N+m0] + end + + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 + + s_add_u32 m0, m0, 4 + s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 + s_cmp_lt_u32 m0, s_save_alloc_size + s_cbranch_scc1 L_SAVE_ACCVGPR_LOOP + s_set_gpr_idx_off + +L_SAVE_ACCVGPR_END: +end /* S_PGM_END_SAVED */ //FIXME graphics ONLY if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT)) @@ -921,6 +961,11 @@ end s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) + +if ASIC_TARGET_ARCTURUS + s_mov_b32 s_restore_accvgpr_offset, s_restore_buf_rsrc2 //ACC VGPRs at end of VGPRs +end + if (SWIZZLE_EN) s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? else @@ -958,6 +1003,10 @@ else // VGPR load using dw burst s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 +if ASIC_TARGET_ARCTURUS + s_mov_b32 s_restore_accvgpr_offset_save, s_restore_accvgpr_offset + s_add_u32 s_restore_accvgpr_offset, s_restore_accvgpr_offset, 256*4 +end s_mov_b32 m0, 4 //VGPR initial index value = 1 s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8 s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later @@ -966,6 +1015,20 @@ else if(USE_MTBUF_INSTEAD_OF_MUBUF) tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 else + +if ASIC_TARGET_ARCTURUS + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset slc:1 glc:1 + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset slc:1 glc:1 offset:256 + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset slc:1 glc:1 offset:256*2 + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset slc:1 glc:1 offset:256*3 + s_add_u32 s_restore_accvgpr_offset, s_restore_accvgpr_offset, 256*4 + s_waitcnt vmcnt(0) + + for var vgpr = 0; vgpr < 4; ++ vgpr + v_accvgpr_write acc[vgpr], v[vgpr] + end +end + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256 buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2 @@ -982,6 +1045,18 @@ else s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete? s_set_gpr_idx_off /* VGPR restore on v0 */ +if ASIC_TARGET_ARCTURUS + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset_save slc:1 glc:1 + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset_save slc:1 glc:1 offset:256 + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset_save slc:1 glc:1 offset:256*2 + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset_save slc:1 glc:1 offset:256*3 + s_waitcnt vmcnt(0) + + for var vgpr = 0; vgpr < 4; ++ vgpr + v_accvgpr_write acc[vgpr], v[vgpr] + end +end + if(USE_MTBUF_INSTEAD_OF_MUBUF) tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 else @@ -1202,6 +1277,10 @@ function get_vgpr_size_bytes(s_vgpr_size_byte) s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1 s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4 (non-zero value) //FIXME for GFX, zero is possible + +if ASIC_TARGET_ARCTURUS + s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, 1 // Double size for ACC VGPRs +end end function get_sgpr_size_bytes(s_sgpr_size_byte) |