diff options
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h | 365 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm | 81 |
2 files changed, 320 insertions, 126 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h index c45ba0013ca5..7274baff5c16 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h @@ -274,7 +274,7 @@ static const uint32_t cwsr_trap_gfx8_hex[] = { static const uint32_t cwsr_trap_gfx9_hex[] = { - 0xbf820001, 0xbf82015e, + 0xbf820001, 0xbf820241, 0xb8f8f802, 0x89788678, 0xb8fbf803, 0x866eff7b, 0x00000400, 0xbf85003b, @@ -404,15 +404,57 @@ static const uint32_t cwsr_trap_gfx9_hex[] = { 0xbefe00c1, 0xbeff00c1, 0xbee80080, 0xbee90080, 0xbef600ff, 0x01000000, + 0x867aff78, 0x00400000, + 0xbf850003, 0xb8faf803, + 0x897a7aff, 0x10000000, + 0xbf85004d, 0xbe840080, + 0xd2890000, 0x00000900, + 0x80048104, 0xd2890001, + 0x00000900, 0x80048104, + 0xd2890002, 0x00000900, + 0x80048104, 0xd2890003, + 0x00000900, 0x80048104, + 0xc069003a, 0x00000070, + 0xbf8cc07f, 0x80709070, + 0xbf06c004, 0xbf84ffee, + 0xbe840080, 0xd2890000, + 0x00000901, 0x80048104, + 0xd2890001, 0x00000901, + 0x80048104, 0xd2890002, + 0x00000901, 0x80048104, + 0xd2890003, 0x00000901, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0xbe840080, + 0xd2890000, 0x00000902, + 0x80048104, 0xd2890001, + 0x00000902, 0x80048104, + 0xd2890002, 0x00000902, + 0x80048104, 0xd2890003, + 0x00000902, 0x80048104, + 0xc069003a, 0x00000070, + 0xbf8cc07f, 0x80709070, + 0xbf06c004, 0xbf84ffee, + 0xbe840080, 0xd2890000, + 0x00000903, 0x80048104, + 0xd2890001, 0x00000903, + 0x80048104, 0xd2890002, + 0x00000903, 0x80048104, + 0xd2890003, 0x00000903, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0xbf820008, 0xe0724000, 0x701d0000, 0xe0724100, 0x701d0100, 0xe0724200, 0x701d0200, 0xe0724300, 0x701d0300, 0xbefe00c1, 0xbeff00c1, 0xb8fb4306, 0x867bc17b, - 0xbf84002c, 0xbf8a0000, + 0xbf840063, 0xbf8a0000, 0x867aff6f, 0x04000000, - 0xbf840028, 0x8e7b867b, + 0xbf84005f, 0x8e7b867b, 0x8e7b827b, 0xbef6007b, 0xb8f02a05, 0x80708170, 0x8e708a70, 0xb8fa1605, @@ -422,144 +464,215 @@ static const uint32_t cwsr_trap_gfx9_hex[] = { 0x01000000, 0xbefc0080, 0xd28c0002, 0x000100c1, 0xd28d0003, 0x000204c1, - 0xd1060002, 0x00011103, - 0x7e0602ff, 0x00000200, - 0xbefc00ff, 0x00010000, - 0xbe800077, 0x8677ff77, - 0xff7fffff, 0x8777ff77, - 0x00058000, 0xd8ec0000, - 0x00000002, 0xbf8cc07f, - 0xe0765000, 0x701d0002, - 0x68040702, 0xd0c9006a, - 0x0000f702, 0xbf87fff7, - 0xbef70000, 0xbef000ff, - 0x00000400, 0xbefe00c1, - 0xbeff00c1, 0xb8fb2a05, - 0x807b817b, 0x8e7b827b, - 0x8e76887b, 0xbef600ff, - 0x01000000, 0xbefc0084, - 0xbf0a7b7c, 0xbf840015, - 0xbf11017c, 0x807bff7b, - 0x00001000, 0x7e000300, - 0x7e020301, 0x7e040302, - 0x7e060303, 0xe0724000, - 0x701d0000, 0xe0724100, - 0x701d0100, 0xe0724200, - 0x701d0200, 0xe0724300, - 0x701d0300, 0x807c847c, - 0x8070ff70, 0x00000400, - 0xbf0a7b7c, 0xbf85ffef, - 0xbf9c0000, 0xbf8200da, - 0xbef4007e, 0x8675ff7f, - 0x0000ffff, 0x8775ff75, - 0x00040000, 0xbef60080, - 0xbef700ff, 0x00807fac, - 0x866eff7f, 0x08000000, - 0x8f6e836e, 0x87776e77, - 0x866eff7f, 0x70000000, - 0x8f6e816e, 0x87776e77, - 0x866eff7f, 0x04000000, - 0xbf84001e, 0xbefe00c1, - 0xbeff00c1, 0xb8ef4306, - 0x866fc16f, 0xbf840019, - 0x8e6f866f, 0x8e6f826f, - 0xbef6006f, 0xb8f82a05, - 0x80788178, 0x8e788a78, - 0xb8ee1605, 0x806e816e, - 0x8e6e866e, 0x80786e78, - 0x8078ff78, 0x00000080, - 0xbef600ff, 0x01000000, - 0xbefc0080, 0xe0510000, - 0x781d0000, 0xe0510100, - 0x781d0000, 0x807cff7c, - 0x00000200, 0x8078ff78, - 0x00000200, 0xbf0a6f7c, - 0xbf85fff6, 0xbef80080, + 0x867aff78, 0x00400000, + 0xbf850003, 0xb8faf803, + 0x897a7aff, 0x10000000, + 0xbf850030, 0x24040682, + 0xd86e4000, 0x00000002, + 0xbf8cc07f, 0xbe840080, + 0xd2890000, 0x00000900, + 0x80048104, 0xd2890001, + 0x00000900, 0x80048104, + 0xd2890002, 0x00000900, + 0x80048104, 0xd2890003, + 0x00000900, 0x80048104, + 0xc069003a, 0x00000070, + 0xbf8cc07f, 0x80709070, + 0xbf06c004, 0xbf84ffee, + 0xbe840080, 0xd2890000, + 0x00000901, 0x80048104, + 0xd2890001, 0x00000901, + 0x80048104, 0xd2890002, + 0x00000901, 0x80048104, + 0xd2890003, 0x00000901, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0x680404ff, + 0x00000200, 0xd0c9006a, + 0x0000f702, 0xbf87ffd2, + 0xbf820015, 0xd1060002, + 0x00011103, 0x7e0602ff, + 0x00000200, 0xbefc00ff, + 0x00010000, 0xbe800077, + 0x8677ff77, 0xff7fffff, + 0x8777ff77, 0x00058000, + 0xd8ec0000, 0x00000002, + 0xbf8cc07f, 0xe0765000, + 0x701d0002, 0x68040702, + 0xd0c9006a, 0x0000f702, + 0xbf87fff7, 0xbef70000, + 0xbef000ff, 0x00000400, 0xbefe00c1, 0xbeff00c1, - 0xb8ef2a05, 0x806f816f, - 0x8e6f826f, 0x8e76886f, + 0xb8fb2a05, 0x807b817b, + 0x8e7b827b, 0x8e76887b, 0xbef600ff, 0x01000000, - 0xbeee0078, 0x8078ff78, - 0x00000400, 0xbefc0084, - 0xbf11087c, 0x806fff6f, - 0x00008000, 0xe0524000, - 0x781d0000, 0xe0524100, - 0x781d0100, 0xe0524200, - 0x781d0200, 0xe0524300, - 0x781d0300, 0xbf8c0f70, + 0xbefc0084, 0xbf0a7b7c, + 0xbf84006d, 0xbf11017c, + 0x807bff7b, 0x00001000, + 0x867aff78, 0x00400000, + 0xbf850003, 0xb8faf803, + 0x897a7aff, 0x10000000, + 0xbf850051, 0xbe840080, + 0xd2890000, 0x00000900, + 0x80048104, 0xd2890001, + 0x00000900, 0x80048104, + 0xd2890002, 0x00000900, + 0x80048104, 0xd2890003, + 0x00000900, 0x80048104, + 0xc069003a, 0x00000070, + 0xbf8cc07f, 0x80709070, + 0xbf06c004, 0xbf84ffee, + 0xbe840080, 0xd2890000, + 0x00000901, 0x80048104, + 0xd2890001, 0x00000901, + 0x80048104, 0xd2890002, + 0x00000901, 0x80048104, + 0xd2890003, 0x00000901, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0xbe840080, + 0xd2890000, 0x00000902, + 0x80048104, 0xd2890001, + 0x00000902, 0x80048104, + 0xd2890002, 0x00000902, + 0x80048104, 0xd2890003, + 0x00000902, 0x80048104, + 0xc069003a, 0x00000070, + 0xbf8cc07f, 0x80709070, + 0xbf06c004, 0xbf84ffee, + 0xbe840080, 0xd2890000, + 0x00000903, 0x80048104, + 0xd2890001, 0x00000903, + 0x80048104, 0xd2890002, + 0x00000903, 0x80048104, + 0xd2890003, 0x00000903, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0x807c847c, + 0xbf0a7b7c, 0xbf85ffb1, + 0xbf9c0000, 0xbf820012, 0x7e000300, 0x7e020301, 0x7e040302, 0x7e060303, - 0x807c847c, 0x8078ff78, - 0x00000400, 0xbf0a6f7c, - 0xbf85ffee, 0xbf9c0000, - 0xe0524000, 0x6e1d0000, - 0xe0524100, 0x6e1d0100, - 0xe0524200, 0x6e1d0200, - 0xe0524300, 0x6e1d0300, + 0xe0724000, 0x701d0000, + 0xe0724100, 0x701d0100, + 0xe0724200, 0x701d0200, + 0xe0724300, 0x701d0300, + 0x807c847c, 0x8070ff70, + 0x00000400, 0xbf0a7b7c, + 0xbf85ffef, 0xbf9c0000, + 0xbf8200da, 0xbef4007e, + 0x8675ff7f, 0x0000ffff, + 0x8775ff75, 0x00040000, + 0xbef60080, 0xbef700ff, + 0x00807fac, 0x866eff7f, + 0x08000000, 0x8f6e836e, + 0x87776e77, 0x866eff7f, + 0x70000000, 0x8f6e816e, + 0x87776e77, 0x866eff7f, + 0x04000000, 0xbf84001e, + 0xbefe00c1, 0xbeff00c1, + 0xb8ef4306, 0x866fc16f, + 0xbf840019, 0x8e6f866f, + 0x8e6f826f, 0xbef6006f, 0xb8f82a05, 0x80788178, 0x8e788a78, 0xb8ee1605, 0x806e816e, 0x8e6e866e, - 0x80786e78, 0x80f8c078, - 0xb8ef1605, 0x806f816f, - 0x8e6f846f, 0x8e76826f, - 0xbef600ff, 0x01000000, - 0xbefc006f, 0xc031003a, - 0x00000078, 0x80f8c078, - 0xbf8cc07f, 0x80fc907c, - 0xbf800000, 0xbe802d00, - 0xbe822d02, 0xbe842d04, - 0xbe862d06, 0xbe882d08, - 0xbe8a2d0a, 0xbe8c2d0c, - 0xbe8e2d0e, 0xbf06807c, - 0xbf84fff0, 0xb8f82a05, + 0x80786e78, 0x8078ff78, + 0x00000080, 0xbef600ff, + 0x01000000, 0xbefc0080, + 0xe0510000, 0x781d0000, + 0xe0510100, 0x781d0000, + 0x807cff7c, 0x00000200, + 0x8078ff78, 0x00000200, + 0xbf0a6f7c, 0xbf85fff6, + 0xbef80080, 0xbefe00c1, + 0xbeff00c1, 0xb8ef2a05, + 0x806f816f, 0x8e6f826f, + 0x8e76886f, 0xbef600ff, + 0x01000000, 0xbeee0078, + 0x8078ff78, 0x00000400, + 0xbefc0084, 0xbf11087c, + 0x806fff6f, 0x00008000, + 0xe0524000, 0x781d0000, + 0xe0524100, 0x781d0100, + 0xe0524200, 0x781d0200, + 0xe0524300, 0x781d0300, + 0xbf8c0f70, 0x7e000300, + 0x7e020301, 0x7e040302, + 0x7e060303, 0x807c847c, + 0x8078ff78, 0x00000400, + 0xbf0a6f7c, 0xbf85ffee, + 0xbf9c0000, 0xe0524000, + 0x6e1d0000, 0xe0524100, + 0x6e1d0100, 0xe0524200, + 0x6e1d0200, 0xe0524300, + 0x6e1d0300, 0xb8f82a05, 0x80788178, 0x8e788a78, 0xb8ee1605, 0x806e816e, 0x8e6e866e, 0x80786e78, - 0xbef60084, 0xbef600ff, - 0x01000000, 0xc0211bfa, + 0x80f8c078, 0xb8ef1605, + 0x806f816f, 0x8e6f846f, + 0x8e76826f, 0xbef600ff, + 0x01000000, 0xbefc006f, + 0xc031003a, 0x00000078, + 0x80f8c078, 0xbf8cc07f, + 0x80fc907c, 0xbf800000, + 0xbe802d00, 0xbe822d02, + 0xbe842d04, 0xbe862d06, + 0xbe882d08, 0xbe8a2d0a, + 0xbe8c2d0c, 0xbe8e2d0e, + 0xbf06807c, 0xbf84fff0, + 0xb8f82a05, 0x80788178, + 0x8e788a78, 0xb8ee1605, + 0x806e816e, 0x8e6e866e, + 0x80786e78, 0xbef60084, + 0xbef600ff, 0x01000000, + 0xc0211bfa, 0x00000078, + 0x80788478, 0xc0211b3a, 0x00000078, 0x80788478, - 0xc0211b3a, 0x00000078, - 0x80788478, 0xc0211b7a, + 0xc0211b7a, 0x00000078, + 0x80788478, 0xc0211c3a, 0x00000078, 0x80788478, - 0xc0211c3a, 0x00000078, - 0x80788478, 0xc0211c7a, + 0xc0211c7a, 0x00000078, + 0x80788478, 0xc0211eba, 0x00000078, 0x80788478, - 0xc0211eba, 0x00000078, - 0x80788478, 0xc0211efa, + 0xc0211efa, 0x00000078, + 0x80788478, 0xc0211a3a, 0x00000078, 0x80788478, - 0xc0211a3a, 0x00000078, - 0x80788478, 0xc0211a7a, + 0xc0211a7a, 0x00000078, + 0x80788478, 0xc0211cfa, 0x00000078, 0x80788478, - 0xc0211cfa, 0x00000078, - 0x80788478, 0xbf8cc07f, - 0xbefc006f, 0xbefe0070, - 0xbeff0071, 0x866f7bff, - 0x000003ff, 0xb96f4803, - 0x866f7bff, 0xfffff800, - 0x8f6f8b6f, 0xb96fa2c3, - 0xb973f801, 0xb8ee2a05, - 0x806e816e, 0x8e6e8a6e, - 0xb8ef1605, 0x806f816f, - 0x8e6f866f, 0x806e6f6e, - 0x806e746e, 0x826f8075, - 0x866fff6f, 0x0000ffff, - 0xc00b1c37, 0x00000050, - 0xc00b1d37, 0x00000060, - 0xc0031e77, 0x00000074, - 0xbf8cc07f, 0x866fff6d, - 0xf8000000, 0x8f6f9b6f, - 0x8e6f906f, 0xbeee0080, - 0x876e6f6e, 0x866fff6d, - 0x04000000, 0x8f6f9a6f, - 0x8e6f8f6f, 0x876e6f6e, - 0x866fff7a, 0x00800000, - 0x8f6f976f, 0xb96ef807, - 0x866dff6d, 0x0000ffff, - 0x86fe7e7e, 0x86ea6a6a, - 0x8f6e837a, 0xb96ee0c2, - 0xbf800002, 0xb97a0002, - 0xbf8a0000, 0x95806f6c, - 0xbf810000, 0x00000000, + 0xbf8cc07f, 0xbefc006f, + 0xbefe0070, 0xbeff0071, + 0x866f7bff, 0x000003ff, + 0xb96f4803, 0x866f7bff, + 0xfffff800, 0x8f6f8b6f, + 0xb96fa2c3, 0xb973f801, + 0xb8ee2a05, 0x806e816e, + 0x8e6e8a6e, 0xb8ef1605, + 0x806f816f, 0x8e6f866f, + 0x806e6f6e, 0x806e746e, + 0x826f8075, 0x866fff6f, + 0x0000ffff, 0xc00b1c37, + 0x00000050, 0xc00b1d37, + 0x00000060, 0xc0031e77, + 0x00000074, 0xbf8cc07f, + 0x866fff6d, 0xf8000000, + 0x8f6f9b6f, 0x8e6f906f, + 0xbeee0080, 0x876e6f6e, + 0x866fff6d, 0x04000000, + 0x8f6f9a6f, 0x8e6f8f6f, + 0x876e6f6e, 0x866fff7a, + 0x00800000, 0x8f6f976f, + 0xb96ef807, 0x866dff6d, + 0x0000ffff, 0x86fe7e7e, + 0x86ea6a6a, 0x8f6e837a, + 0xb96ee0c2, 0xbf800002, + 0xb97a0002, 0xbf8a0000, + 0x95806f6c, 0xbf810000, }; static const uint32_t cwsr_trap_gfx10_hex[] = { diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm index 871f2d431a44..4d146bca0b05 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm @@ -94,6 +94,7 @@ var SGPR_SAVE_USE_SQC = 1 //use SQC D$ to do the write var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //because TC EMU currently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing var ACK_SQC_STORE = 1 //workaround for suspected SQC store bug causing incorrect stores under concurrency +var SAVE_AFTER_XNACK_ERROR = 1 //workaround for TCP store failure after XNACK error when ALLOW_REPLAY=0, for debugger /**************************************************************************/ /* variables */ @@ -107,6 +108,7 @@ var SQ_WAVE_STATUS_PRE_SPI_PRIO_SHIFT = 0 var SQ_WAVE_STATUS_PRE_SPI_PRIO_SIZE = 1 var SQ_WAVE_STATUS_POST_SPI_PRIO_SHIFT = 3 var SQ_WAVE_STATUS_POST_SPI_PRIO_SIZE = 29 +var SQ_WAVE_STATUS_ALLOW_REPLAY_MASK = 0x400000 var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9 @@ -127,6 +129,7 @@ var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800 var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11 var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21 var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK = 0x800 +var SQ_WAVE_TRAPSTS_XNACK_ERROR_MASK = 0x10000000 var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME @@ -584,6 +587,16 @@ if G8SR_VGPR_SR_IN_DWX4 s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes else +if SAVE_AFTER_XNACK_ERROR + check_if_tcp_store_ok() + s_cbranch_scc1 L_SAVE_FIRST_VGPRS_WITH_TCP + + write_vgprs_to_mem_with_sqc(v0, 4, s_save_buf_rsrc0, s_save_mem_offset) + s_branch L_SAVE_LDS + +L_SAVE_FIRST_VGPRS_WITH_TCP: +end + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 @@ -683,6 +696,27 @@ elsif LDS_DMA_ENABLE==1 && UNROLL==1 // UNROOL , has ichace miss else // BUFFER_STORE v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0 v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2 // tid + +if SAVE_AFTER_XNACK_ERROR + check_if_tcp_store_ok() + s_cbranch_scc1 L_SAVE_LDS_WITH_TCP + + v_lshlrev_b32 v2, 2, v3 +L_SAVE_LDS_LOOP_SQC: + ds_read2_b32 v[0:1], v2 offset0:0 offset1:0x40 + s_waitcnt lgkmcnt(0) + + write_vgprs_to_mem_with_sqc(v0, 2, s_save_buf_rsrc0, s_save_mem_offset) + + v_add_u32 v2, 0x200, v2 + v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size + s_cbranch_vccnz L_SAVE_LDS_LOOP_SQC + + s_branch L_SAVE_LDS_DONE + +L_SAVE_LDS_WITH_TCP: +end + v_mul_i32_i24 v2, v3, 8 // tid*8 v_mov_b32 v3, 256*2 s_mov_b32 m0, 0x10000 @@ -769,6 +803,21 @@ else s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1 s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later +if SAVE_AFTER_XNACK_ERROR + check_if_tcp_store_ok() + s_cbranch_scc1 L_SAVE_VGPR_LOOP + +L_SAVE_VGPR_LOOP_SQC: + write_vgprs_to_mem_with_sqc(v0, 4, s_save_buf_rsrc0, s_save_mem_offset) + + s_add_u32 m0, m0, 4 + s_cmp_lt_u32 m0, s_save_alloc_size + s_cbranch_scc1 L_SAVE_VGPR_LOOP_SQC + + s_set_gpr_idx_off + s_branch L_SAVE_VGPR_END +end + L_SAVE_VGPR_LOOP: v_mov_b32 v0, v0 //v0 = v[0+m0] v_mov_b32 v1, v1 //v0 = v[0+m0] @@ -1265,7 +1314,39 @@ function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset) s_sub_u32 s_mem_offset, s_mem_offset, 4*16 end +function check_if_tcp_store_ok + // If STATUS.ALLOW_REPLAY=0 and TRAPSTS.XNACK_ERROR=1 then TCP stores will fail. + s_and_b32 s_save_tmp, s_save_status, SQ_WAVE_STATUS_ALLOW_REPLAY_MASK + s_cbranch_scc1 L_TCP_STORE_CHECK_DONE + + s_getreg_b32 s_save_tmp, hwreg(HW_REG_TRAPSTS) + s_andn2_b32 s_save_tmp, SQ_WAVE_TRAPSTS_XNACK_ERROR_MASK, s_save_tmp + +L_TCP_STORE_CHECK_DONE: +end + +function write_vgpr_to_mem_with_sqc(v, s_rsrc, s_mem_offset) + s_mov_b32 s4, 0 +L_WRITE_VGPR_LANE_LOOP: + for var lane = 0; lane < 4; ++ lane + v_readlane_b32 s[lane], v, s4 + s_add_u32 s4, s4, 1 + end + + s_buffer_store_dwordx4 s[0:3], s_rsrc, s_mem_offset glc:1 + ack_sqc_store_workaround() + + s_add_u32 s_mem_offset, s_mem_offset, 0x10 + s_cmp_eq_u32 s4, 0x40 + s_cbranch_scc0 L_WRITE_VGPR_LANE_LOOP +end + +function write_vgprs_to_mem_with_sqc(v, n_vgprs, s_rsrc, s_mem_offset) + for var vgpr = 0; vgpr < n_vgprs; ++ vgpr + write_vgpr_to_mem_with_sqc(v[vgpr], s_rsrc, s_mem_offset) + end +end function get_lds_size_bytes(s_lds_size_byte) // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity 128DW |