diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd')
47 files changed, 7380 insertions, 2483 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile index a5ae7bcf44eb..0d3d8972240d 100644 --- a/drivers/gpu/drm/amd/amdkfd/Makefile +++ b/drivers/gpu/drm/amd/amdkfd/Makefile @@ -38,6 +38,7 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \ $(AMDKFD_PATH)/kfd_mqd_manager_v9.o \ $(AMDKFD_PATH)/kfd_mqd_manager_v10.o \ $(AMDKFD_PATH)/kfd_mqd_manager_v11.o \ + $(AMDKFD_PATH)/kfd_mqd_manager_v12.o \ $(AMDKFD_PATH)/kfd_kernel_queue.o \ $(AMDKFD_PATH)/kfd_packet_manager.o \ $(AMDKFD_PATH)/kfd_packet_manager_vi.o \ @@ -49,6 +50,7 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \ $(AMDKFD_PATH)/kfd_device_queue_manager_v9.o \ $(AMDKFD_PATH)/kfd_device_queue_manager_v10.o \ $(AMDKFD_PATH)/kfd_device_queue_manager_v11.o \ + $(AMDKFD_PATH)/kfd_device_queue_manager_v12.o \ $(AMDKFD_PATH)/kfd_interrupt.o \ $(AMDKFD_PATH)/kfd_events.o \ $(AMDKFD_PATH)/cik_event_interrupt.o \ diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h index d1caaf0e6a7c..0320163b6e74 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h @@ -274,7 +274,7 @@ static const uint32_t cwsr_trap_gfx8_hex[] = { static const uint32_t cwsr_trap_gfx9_hex[] = { - 0xbf820001, 0xbf820258, + 0xbf820001, 0xbf820259, 0xb8f8f802, 0x8978ff78, 0x00020006, 0xb8fbf803, 0x866eff78, 0x00002000, @@ -390,141 +390,98 @@ static const uint32_t cwsr_trap_gfx9_hex[] = { 0xbefe007c, 0xbefc0070, 0xc0611c7a, 0x0000007c, 0xbf8cc07f, 0x80708470, - 0xbefc007e, 0x867aff7f, - 0x04000000, 0xbeef0080, - 0x876f6f7a, 0xb8f02a05, - 0x80708170, 0x8e708a70, - 0xb8fb1605, 0x807b817b, - 0x8e7b847b, 0x8e76827b, - 0xbef600ff, 0x01000000, - 0xbef20174, 0x80747074, - 0x82758075, 0xbefc0080, - 0xbf800000, 0xbe802b00, - 0xbe822b02, 0xbe842b04, - 0xbe862b06, 0xbe882b08, - 0xbe8a2b0a, 0xbe8c2b0c, - 0xbe8e2b0e, 0xc06b003a, - 0x00000000, 0xbf8cc07f, - 0xc06b013a, 0x00000010, - 0xbf8cc07f, 0xc06b023a, - 0x00000020, 0xbf8cc07f, - 0xc06b033a, 0x00000030, - 0xbf8cc07f, 0x8074c074, - 0x82758075, 0x807c907c, - 0xbf0a7b7c, 0xbf85ffe7, - 0xbef40172, 0xbef00080, - 0xbefe00c1, 0xbeff00c1, - 0xbee80080, 0xbee90080, - 0xbef600ff, 0x01000000, - 0x867aff78, 0x00400000, - 0xbf850003, 0xb8faf803, - 0x897a7aff, 0x10000000, - 0xbf85004d, 0xbe840080, - 0xd2890000, 0x00000900, - 0x80048104, 0xd2890001, - 0x00000900, 0x80048104, - 0xd2890002, 0x00000900, - 0x80048104, 0xd2890003, - 0x00000900, 0x80048104, - 0xc069003a, 0x00000070, - 0xbf8cc07f, 0x80709070, - 0xbf06c004, 0xbf84ffee, + 0xbefc007e, 0xbf108080, + 0x867aff7f, 0x04000000, + 0xbeef0080, 0x876f6f7a, + 0xb8f02a05, 0x80708170, + 0x8e708a70, 0xb8fb1605, + 0x807b817b, 0x8e7b847b, + 0x8e76827b, 0xbef600ff, + 0x01000000, 0xbef20174, + 0x80747074, 0x82758075, + 0xbefc0080, 0xbf800000, + 0xbe802b00, 0xbe822b02, + 0xbe842b04, 0xbe862b06, + 0xbe882b08, 0xbe8a2b0a, + 0xbe8c2b0c, 0xbe8e2b0e, + 0xc06b003a, 0x00000000, + 0xbf8cc07f, 0xc06b013a, + 0x00000010, 0xbf8cc07f, + 0xc06b023a, 0x00000020, + 0xbf8cc07f, 0xc06b033a, + 0x00000030, 0xbf8cc07f, + 0x8074c074, 0x82758075, + 0x807c907c, 0xbf0a7b7c, + 0xbf85ffe7, 0xbef40172, + 0xbef00080, 0xbefe00c1, + 0xbeff00c1, 0xbee80080, + 0xbee90080, 0xbef600ff, + 0x01000000, 0x867aff78, + 0x00400000, 0xbf850003, + 0xb8faf803, 0x897a7aff, + 0x10000000, 0xbf85004d, 0xbe840080, 0xd2890000, - 0x00000901, 0x80048104, - 0xd2890001, 0x00000901, + 0x00000900, 0x80048104, + 0xd2890001, 0x00000900, 0x80048104, 0xd2890002, - 0x00000901, 0x80048104, - 0xd2890003, 0x00000901, + 0x00000900, 0x80048104, + 0xd2890003, 0x00000900, 0x80048104, 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, 0xbf84ffee, 0xbe840080, - 0xd2890000, 0x00000902, + 0xd2890000, 0x00000901, 0x80048104, 0xd2890001, - 0x00000902, 0x80048104, - 0xd2890002, 0x00000902, + 0x00000901, 0x80048104, + 0xd2890002, 0x00000901, 0x80048104, 0xd2890003, - 0x00000902, 0x80048104, + 0x00000901, 0x80048104, 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, 0xbf84ffee, 0xbe840080, 0xd2890000, - 0x00000903, 0x80048104, - 0xd2890001, 0x00000903, + 0x00000902, 0x80048104, + 0xd2890001, 0x00000902, 0x80048104, 0xd2890002, - 0x00000903, 0x80048104, - 0xd2890003, 0x00000903, + 0x00000902, 0x80048104, + 0xd2890003, 0x00000902, 0x80048104, 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, - 0xbf84ffee, 0xbf820008, - 0xe0724000, 0x701d0000, - 0xe0724100, 0x701d0100, - 0xe0724200, 0x701d0200, - 0xe0724300, 0x701d0300, - 0xbefe00c1, 0xbeff00c1, - 0xb8fb4306, 0x867bc17b, - 0xbf840063, 0xbf8a0000, - 0x867aff6f, 0x04000000, - 0xbf84005f, 0x8e7b867b, - 0x8e7b827b, 0xbef6007b, - 0xb8f02a05, 0x80708170, - 0x8e708a70, 0xb8fa1605, - 0x807a817a, 0x8e7a867a, - 0x80707a70, 0x8070ff70, - 0x00000080, 0xbef600ff, - 0x01000000, 0xbefc0080, - 0xd28c0002, 0x000100c1, - 0xd28d0003, 0x000204c1, - 0x867aff78, 0x00400000, - 0xbf850003, 0xb8faf803, - 0x897a7aff, 0x10000000, - 0xbf850030, 0x24040682, - 0xd86e4000, 0x00000002, - 0xbf8cc07f, 0xbe840080, - 0xd2890000, 0x00000900, + 0xbf84ffee, 0xbe840080, + 0xd2890000, 0x00000903, 0x80048104, 0xd2890001, - 0x00000900, 0x80048104, - 0xd2890002, 0x00000900, + 0x00000903, 0x80048104, + 0xd2890002, 0x00000903, 0x80048104, 0xd2890003, - 0x00000900, 0x80048104, + 0x00000903, 0x80048104, 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, 0xbf84ffee, - 0xbe840080, 0xd2890000, - 0x00000901, 0x80048104, - 0xd2890001, 0x00000901, - 0x80048104, 0xd2890002, - 0x00000901, 0x80048104, - 0xd2890003, 0x00000901, - 0x80048104, 0xc069003a, - 0x00000070, 0xbf8cc07f, - 0x80709070, 0xbf06c004, - 0xbf84ffee, 0x680404ff, - 0x00000200, 0xd0c9006a, - 0x0000f702, 0xbf87ffd2, - 0xbf820015, 0xd1060002, - 0x00011103, 0x7e0602ff, - 0x00000200, 0xbefc00ff, - 0x00010000, 0xbe800077, - 0x8677ff77, 0xff7fffff, - 0x8777ff77, 0x00058000, - 0xd8ec0000, 0x00000002, - 0xbf8cc07f, 0xe0765000, - 0x701d0002, 0x68040702, - 0xd0c9006a, 0x0000f702, - 0xbf87fff7, 0xbef70000, - 0xbef000ff, 0x00000400, - 0xbefe00c1, 0xbeff00c1, - 0xb8fb2a05, 0x807b817b, - 0x8e7b827b, 0xbef600ff, - 0x01000000, 0xbefc0084, - 0xbf0a7b7c, 0xbf84006d, - 0xbf11017c, 0x807bff7b, - 0x00001000, 0x867aff78, + 0xbf820008, 0xe0724000, + 0x701d0000, 0xe0724100, + 0x701d0100, 0xe0724200, + 0x701d0200, 0xe0724300, + 0x701d0300, 0xbefe00c1, + 0xbeff00c1, 0xb8fb4306, + 0x867bc17b, 0xbf840063, + 0xbf8a0000, 0x867aff6f, + 0x04000000, 0xbf84005f, + 0x8e7b867b, 0x8e7b827b, + 0xbef6007b, 0xb8f02a05, + 0x80708170, 0x8e708a70, + 0xb8fa1605, 0x807a817a, + 0x8e7a867a, 0x80707a70, + 0x8070ff70, 0x00000080, + 0xbef600ff, 0x01000000, + 0xbefc0080, 0xd28c0002, + 0x000100c1, 0xd28d0003, + 0x000204c1, 0x867aff78, 0x00400000, 0xbf850003, 0xb8faf803, 0x897a7aff, - 0x10000000, 0xbf850051, + 0x10000000, 0xbf850030, + 0x24040682, 0xd86e4000, + 0x00000002, 0xbf8cc07f, 0xbe840080, 0xd2890000, 0x00000900, 0x80048104, 0xd2890001, 0x00000900, @@ -544,141 +501,185 @@ static const uint32_t cwsr_trap_gfx9_hex[] = { 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, 0xbf84ffee, + 0x680404ff, 0x00000200, + 0xd0c9006a, 0x0000f702, + 0xbf87ffd2, 0xbf820015, + 0xd1060002, 0x00011103, + 0x7e0602ff, 0x00000200, + 0xbefc00ff, 0x00010000, + 0xbe800077, 0x8677ff77, + 0xff7fffff, 0x8777ff77, + 0x00058000, 0xd8ec0000, + 0x00000002, 0xbf8cc07f, + 0xe0765000, 0x701d0002, + 0x68040702, 0xd0c9006a, + 0x0000f702, 0xbf87fff7, + 0xbef70000, 0xbef000ff, + 0x00000400, 0xbefe00c1, + 0xbeff00c1, 0xb8fb2a05, + 0x807b817b, 0x8e7b827b, + 0xbef600ff, 0x01000000, + 0xbefc0084, 0xbf0a7b7c, + 0xbf84006d, 0xbf11017c, + 0x807bff7b, 0x00001000, + 0x867aff78, 0x00400000, + 0xbf850003, 0xb8faf803, + 0x897a7aff, 0x10000000, + 0xbf850051, 0xbe840080, + 0xd2890000, 0x00000900, + 0x80048104, 0xd2890001, + 0x00000900, 0x80048104, + 0xd2890002, 0x00000900, + 0x80048104, 0xd2890003, + 0x00000900, 0x80048104, + 0xc069003a, 0x00000070, + 0xbf8cc07f, 0x80709070, + 0xbf06c004, 0xbf84ffee, 0xbe840080, 0xd2890000, - 0x00000902, 0x80048104, - 0xd2890001, 0x00000902, + 0x00000901, 0x80048104, + 0xd2890001, 0x00000901, 0x80048104, 0xd2890002, - 0x00000902, 0x80048104, - 0xd2890003, 0x00000902, + 0x00000901, 0x80048104, + 0xd2890003, 0x00000901, 0x80048104, 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, 0xbf84ffee, 0xbe840080, - 0xd2890000, 0x00000903, + 0xd2890000, 0x00000902, 0x80048104, 0xd2890001, - 0x00000903, 0x80048104, - 0xd2890002, 0x00000903, + 0x00000902, 0x80048104, + 0xd2890002, 0x00000902, 0x80048104, 0xd2890003, - 0x00000903, 0x80048104, + 0x00000902, 0x80048104, 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, 0xbf84ffee, - 0x807c847c, 0xbf0a7b7c, - 0xbf85ffb1, 0xbf9c0000, - 0xbf820012, 0x7e000300, - 0x7e020301, 0x7e040302, - 0x7e060303, 0xe0724000, - 0x701d0000, 0xe0724100, - 0x701d0100, 0xe0724200, - 0x701d0200, 0xe0724300, - 0x701d0300, 0x807c847c, - 0x8070ff70, 0x00000400, - 0xbf0a7b7c, 0xbf85ffef, - 0xbf9c0000, 0xbf8200c7, - 0xbef4007e, 0x8675ff7f, - 0x0000ffff, 0x8775ff75, - 0x00040000, 0xbef60080, - 0xbef700ff, 0x00807fac, - 0x866eff7f, 0x04000000, - 0xbf84001e, 0xbefe00c1, - 0xbeff00c1, 0xb8ef4306, - 0x866fc16f, 0xbf840019, - 0x8e6f866f, 0x8e6f826f, - 0xbef6006f, 0xb8f82a05, - 0x80788178, 0x8e788a78, - 0xb8ee1605, 0x806e816e, - 0x8e6e866e, 0x80786e78, - 0x8078ff78, 0x00000080, - 0xbef600ff, 0x01000000, - 0xbefc0080, 0xe0510000, - 0x781d0000, 0xe0510100, - 0x781d0000, 0x807cff7c, - 0x00000200, 0x8078ff78, - 0x00000200, 0xbf0a6f7c, - 0xbf85fff6, 0xbefe00c1, - 0xbeff00c1, 0xbef600ff, - 0x01000000, 0xb8ef2a05, - 0x806f816f, 0x8e6f826f, - 0x806fff6f, 0x00008000, - 0xbef80080, 0xbeee0078, - 0x8078ff78, 0x00000400, - 0xbefc0084, 0xbf11087c, - 0xe0524000, 0x781d0000, - 0xe0524100, 0x781d0100, - 0xe0524200, 0x781d0200, - 0xe0524300, 0x781d0300, - 0xbf8c0f70, 0x7e000300, - 0x7e020301, 0x7e040302, - 0x7e060303, 0x807c847c, - 0x8078ff78, 0x00000400, - 0xbf0a6f7c, 0xbf85ffee, - 0xbf9c0000, 0xe0524000, - 0x6e1d0000, 0xe0524100, - 0x6e1d0100, 0xe0524200, - 0x6e1d0200, 0xe0524300, - 0x6e1d0300, 0xbf8c0f70, + 0xbe840080, 0xd2890000, + 0x00000903, 0x80048104, + 0xd2890001, 0x00000903, + 0x80048104, 0xd2890002, + 0x00000903, 0x80048104, + 0xd2890003, 0x00000903, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0x807c847c, + 0xbf0a7b7c, 0xbf85ffb1, + 0xbf9c0000, 0xbf820012, + 0x7e000300, 0x7e020301, + 0x7e040302, 0x7e060303, + 0xe0724000, 0x701d0000, + 0xe0724100, 0x701d0100, + 0xe0724200, 0x701d0200, + 0xe0724300, 0x701d0300, + 0x807c847c, 0x8070ff70, + 0x00000400, 0xbf0a7b7c, + 0xbf85ffef, 0xbf9c0000, + 0xbf8200c7, 0xbef4007e, + 0x8675ff7f, 0x0000ffff, + 0x8775ff75, 0x00040000, + 0xbef60080, 0xbef700ff, + 0x00807fac, 0x866eff7f, + 0x04000000, 0xbf84001e, + 0xbefe00c1, 0xbeff00c1, + 0xb8ef4306, 0x866fc16f, + 0xbf840019, 0x8e6f866f, + 0x8e6f826f, 0xbef6006f, 0xb8f82a05, 0x80788178, 0x8e788a78, 0xb8ee1605, 0x806e816e, 0x8e6e866e, - 0x80786e78, 0x80f8c078, - 0xb8ef1605, 0x806f816f, - 0x8e6f846f, 0x8e76826f, + 0x80786e78, 0x8078ff78, + 0x00000080, 0xbef600ff, + 0x01000000, 0xbefc0080, + 0xe0510000, 0x781d0000, + 0xe0510100, 0x781d0000, + 0x807cff7c, 0x00000200, + 0x8078ff78, 0x00000200, + 0xbf0a6f7c, 0xbf85fff6, + 0xbefe00c1, 0xbeff00c1, 0xbef600ff, 0x01000000, - 0xbefc006f, 0xc031003a, - 0x00000078, 0x80f8c078, - 0xbf8cc07f, 0x80fc907c, - 0xbf800000, 0xbe802d00, - 0xbe822d02, 0xbe842d04, - 0xbe862d06, 0xbe882d08, - 0xbe8a2d0a, 0xbe8c2d0c, - 0xbe8e2d0e, 0xbf06807c, - 0xbf84fff0, 0xb8f82a05, + 0xb8ef2a05, 0x806f816f, + 0x8e6f826f, 0x806fff6f, + 0x00008000, 0xbef80080, + 0xbeee0078, 0x8078ff78, + 0x00000400, 0xbefc0084, + 0xbf11087c, 0xe0524000, + 0x781d0000, 0xe0524100, + 0x781d0100, 0xe0524200, + 0x781d0200, 0xe0524300, + 0x781d0300, 0xbf8c0f70, + 0x7e000300, 0x7e020301, + 0x7e040302, 0x7e060303, + 0x807c847c, 0x8078ff78, + 0x00000400, 0xbf0a6f7c, + 0xbf85ffee, 0xbf9c0000, + 0xe0524000, 0x6e1d0000, + 0xe0524100, 0x6e1d0100, + 0xe0524200, 0x6e1d0200, + 0xe0524300, 0x6e1d0300, + 0xbf8c0f70, 0xb8f82a05, 0x80788178, 0x8e788a78, 0xb8ee1605, 0x806e816e, 0x8e6e866e, 0x80786e78, - 0xbef60084, 0xbef600ff, - 0x01000000, 0xc0211bfa, + 0x80f8c078, 0xb8ef1605, + 0x806f816f, 0x8e6f846f, + 0x8e76826f, 0xbef600ff, + 0x01000000, 0xbefc006f, + 0xc031003a, 0x00000078, + 0x80f8c078, 0xbf8cc07f, + 0x80fc907c, 0xbf800000, + 0xbe802d00, 0xbe822d02, + 0xbe842d04, 0xbe862d06, + 0xbe882d08, 0xbe8a2d0a, + 0xbe8c2d0c, 0xbe8e2d0e, + 0xbf06807c, 0xbf84fff0, + 0xb8f82a05, 0x80788178, + 0x8e788a78, 0xb8ee1605, + 0x806e816e, 0x8e6e866e, + 0x80786e78, 0xbef60084, + 0xbef600ff, 0x01000000, + 0xc0211bfa, 0x00000078, + 0x80788478, 0xc0211b3a, 0x00000078, 0x80788478, - 0xc0211b3a, 0x00000078, - 0x80788478, 0xc0211b7a, + 0xc0211b7a, 0x00000078, + 0x80788478, 0xc0211c3a, 0x00000078, 0x80788478, - 0xc0211c3a, 0x00000078, - 0x80788478, 0xc0211c7a, + 0xc0211c7a, 0x00000078, + 0x80788478, 0xc0211eba, 0x00000078, 0x80788478, - 0xc0211eba, 0x00000078, - 0x80788478, 0xc0211efa, + 0xc0211efa, 0x00000078, + 0x80788478, 0xc0211a3a, 0x00000078, 0x80788478, - 0xc0211a3a, 0x00000078, - 0x80788478, 0xc0211a7a, + 0xc0211a7a, 0x00000078, + 0x80788478, 0xc0211cfa, 0x00000078, 0x80788478, - 0xc0211cfa, 0x00000078, - 0x80788478, 0xbf8cc07f, - 0xbefc006f, 0xbefe0070, - 0xbeff0071, 0x866f7bff, - 0x000003ff, 0xb96f4803, - 0x866f7bff, 0xfffff800, - 0x8f6f8b6f, 0xb96fa2c3, - 0xb973f801, 0xb8ee2a05, - 0x806e816e, 0x8e6e8a6e, - 0xb8ef1605, 0x806f816f, - 0x8e6f866f, 0x806e6f6e, - 0x806e746e, 0x826f8075, - 0x866fff6f, 0x0000ffff, - 0xc00b1c37, 0x00000050, - 0xc00b1d37, 0x00000060, - 0xc0031e77, 0x00000074, - 0xbf8cc07f, 0x8f6e8b77, - 0x866eff6e, 0x001f8000, - 0xb96ef807, 0x866dff6d, - 0x0000ffff, 0x86fe7e7e, - 0x86ea6a6a, 0x8f6e837a, - 0xb96ee0c2, 0xbf800002, - 0xb97a0002, 0xbf8a0000, - 0xbe801f6c, 0xbf9b0000, + 0xbf8cc07f, 0xbefc006f, + 0xbefe0070, 0xbeff0071, + 0x866f7bff, 0x000003ff, + 0xb96f4803, 0x866f7bff, + 0xfffff800, 0x8f6f8b6f, + 0xb96fa2c3, 0xb973f801, + 0xb8ee2a05, 0x806e816e, + 0x8e6e8a6e, 0xb8ef1605, + 0x806f816f, 0x8e6f866f, + 0x806e6f6e, 0x806e746e, + 0x826f8075, 0x866fff6f, + 0x0000ffff, 0xc00b1c37, + 0x00000050, 0xc00b1d37, + 0x00000060, 0xc0031e77, + 0x00000074, 0xbf8cc07f, + 0x8f6e8b77, 0x866eff6e, + 0x001f8000, 0xb96ef807, + 0x866dff6d, 0x0000ffff, + 0x86fe7e7e, 0x86ea6a6a, + 0x8f6e837a, 0xb96ee0c2, + 0xbf800002, 0xb97a0002, + 0xbf8a0000, 0xbe801f6c, + 0xbf9b0000, 0x00000000, }; static const uint32_t cwsr_trap_nv1x_hex[] = { - 0xbf820001, 0xbf8201f5, + 0xbf820001, 0xbf820393, 0xb0804004, 0xb978f802, 0x8a78ff78, 0x00020006, 0xb97bf803, 0x876eff78, @@ -711,19 +712,19 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0xbf0d8f7b, 0xbf840002, 0x887bff7b, 0xffff0000, 0xf4011bbd, 0xfa000010, - 0xbf8cc07f, 0x8f6e976e, + 0xbf8c0000, 0x8f6e976e, 0x8a77ff77, 0x00800000, 0x88776e77, 0xf4051bbd, - 0xfa000000, 0xbf8cc07f, + 0xfa000000, 0xbf8c0000, 0xf4051ebd, 0xfa000008, - 0xbf8cc07f, 0x87ee6e6e, + 0xbf8c0000, 0x87ee6e6e, 0xbf840001, 0xbe80206e, - 0x876eff6d, 0x01ff0000, - 0xbf850005, 0x8878ff78, - 0x00002000, 0x80ec886c, - 0x82ed806d, 0xbf820005, - 0x876eff6d, 0x01000000, - 0xbf850002, 0x806c846c, + 0x876eff6d, 0x00ff0000, + 0xbf850008, 0x876eff6d, + 0x01000000, 0xbf850007, + 0x8878ff78, 0x00002000, + 0x80ec886c, 0x82ed806d, + 0xbf820002, 0x806c846c, 0x826d806d, 0x876dff6d, 0x0000ffff, 0x907a8977, 0x877bff7a, 0x003f8000, @@ -769,13 +770,90 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0x877c817c, 0xbf06817c, 0xbf850002, 0xbeff0380, 0xbf820002, 0xbeff03c1, - 0xbf82000b, 0xbef603ff, - 0x01000000, 0xe0704000, - 0x705d0000, 0xe0704080, - 0x705d0100, 0xe0704100, - 0x705d0200, 0xe0704180, - 0x705d0300, 0xbf82000a, - 0xbef603ff, 0x01000000, + 0xbf820058, 0xbef603ff, + 0x01000000, 0xb97af803, + 0x8a7a7aff, 0x10000000, + 0xbf850049, 0xbe840380, + 0xd7600000, 0x00000900, + 0x80048104, 0xd7600001, + 0x00000900, 0x80048104, + 0xd7600002, 0x00000900, + 0x80048104, 0xd7600003, + 0x00000900, 0x80048104, + 0xf469003a, 0xe0000000, + 0x80709070, 0xbf06a004, + 0xbf84ffef, 0xbe840380, + 0xd7600000, 0x00000901, + 0x80048104, 0xd7600001, + 0x00000901, 0x80048104, + 0xd7600002, 0x00000901, + 0x80048104, 0xd7600003, + 0x00000901, 0x80048104, + 0xf469003a, 0xe0000000, + 0x80709070, 0xbf06a004, + 0xbf84ffef, 0xbe840380, + 0xd7600000, 0x00000902, + 0x80048104, 0xd7600001, + 0x00000902, 0x80048104, + 0xd7600002, 0x00000902, + 0x80048104, 0xd7600003, + 0x00000902, 0x80048104, + 0xf469003a, 0xe0000000, + 0x80709070, 0xbf06a004, + 0xbf84ffef, 0xbe840380, + 0xd7600000, 0x00000903, + 0x80048104, 0xd7600001, + 0x00000903, 0x80048104, + 0xd7600002, 0x00000903, + 0x80048104, 0xd7600003, + 0x00000903, 0x80048104, + 0xf469003a, 0xe0000000, + 0x80709070, 0xbf06a004, + 0xbf84ffef, 0xbf820060, + 0xe0704000, 0x705d0000, + 0xe0704080, 0x705d0100, + 0xe0704100, 0x705d0200, + 0xe0704180, 0x705d0300, + 0xbf820057, 0xbef603ff, + 0x01000000, 0xb97af803, + 0x8a7a7aff, 0x10000000, + 0xbf850049, 0xbe840380, + 0xd7600000, 0x00000900, + 0x80048104, 0xd7600001, + 0x00000900, 0x80048104, + 0xd7600002, 0x00000900, + 0x80048104, 0xd7600003, + 0x00000900, 0x80048104, + 0xf469003a, 0xe0000000, + 0x80709070, 0xbf06c004, + 0xbf84ffef, 0xbe840380, + 0xd7600000, 0x00000901, + 0x80048104, 0xd7600001, + 0x00000901, 0x80048104, + 0xd7600002, 0x00000901, + 0x80048104, 0xd7600003, + 0x00000901, 0x80048104, + 0xf469003a, 0xe0000000, + 0x80709070, 0xbf06c004, + 0xbf84ffef, 0xbe840380, + 0xd7600000, 0x00000902, + 0x80048104, 0xd7600001, + 0x00000902, 0x80048104, + 0xd7600002, 0x00000902, + 0x80048104, 0xd7600003, + 0x00000902, 0x80048104, + 0xf469003a, 0xe0000000, + 0x80709070, 0xbf06c004, + 0xbf84ffef, 0xbe840380, + 0xd7600000, 0x00000903, + 0x80048104, 0xd7600001, + 0x00000903, 0x80048104, + 0xd7600002, 0x00000903, + 0x80048104, 0xd7600003, + 0x00000903, 0x80048104, + 0xf469003a, 0xe0000000, + 0x80709070, 0xbf06c004, + 0xbf84ffef, 0xbf820008, 0xe0704000, 0x705d0000, 0xe0704100, 0x705d0100, 0xe0704200, 0x705d0200, @@ -855,250 +933,377 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0xbf850002, 0xbeff0380, 0xbf820001, 0xbeff03c1, 0xb97b4306, 0x877bc17b, - 0xbf840044, 0xbf8a0000, + 0xbf840085, 0xbf8a0000, 0x877aff6d, 0x80000000, - 0xbf840040, 0x8f7b867b, - 0x8f7b827b, 0xbef6037b, - 0xb9703a05, 0x80708170, - 0xbf0d9973, 0xbf850002, - 0x8f708970, 0xbf820001, - 0x8f708a70, 0xb97a1e06, - 0x8f7a8a7a, 0x80707a70, - 0x8070ff70, 0x00000200, - 0x8070ff70, 0x00000080, - 0xbef603ff, 0x01000000, - 0xd7650000, 0x000100c1, - 0xd7660000, 0x000200c1, - 0x16000084, 0x907c9973, - 0x877c817c, 0xbf06817c, - 0xbefc0380, 0xbf850012, - 0xbe8303ff, 0x00000080, - 0xbf800000, 0xbf800000, - 0xbf800000, 0xd8d80000, + 0xbf840081, 0x8f7b887b, + 0xbef6037b, 0xb9703a05, + 0x80708170, 0xbf0d9973, + 0xbf850002, 0x8f708970, + 0xbf820001, 0x8f708a70, + 0xb97a1e06, 0x8f7a8a7a, + 0x80707a70, 0x8070ff70, + 0x00000200, 0x8070ff70, + 0x00000080, 0xbef603ff, + 0x01000000, 0xd7650000, + 0x000100c1, 0xd7660000, + 0x000200c1, 0x16000084, + 0x907c9973, 0x877c817c, + 0xbf06817c, 0xbefc0380, + 0xbf850033, 0xb97af803, + 0x8a7a7aff, 0x10000000, + 0xbf85001d, 0xd8d80000, 0x01000000, 0xbf8c0000, - 0xe0704000, 0x705d0100, - 0x807c037c, 0x80700370, + 0xbe840380, 0xd7600000, + 0x00000901, 0x80048104, + 0xd7600001, 0x00000901, + 0x80048104, 0xd7600002, + 0x00000901, 0x80048104, + 0xd7600003, 0x00000901, + 0x80048104, 0xf469003a, + 0xe0000000, 0x80709070, + 0xbf06a004, 0xbf84ffef, + 0x807cff7c, 0x00000080, 0xd5250000, 0x0001ff00, 0x00000080, 0xbf0a7b7c, - 0xbf85fff4, 0xbf820011, - 0xbe8303ff, 0x00000100, + 0xbf85ffe4, 0xbf820044, + 0xbe8303ff, 0x00000080, 0xbf800000, 0xbf800000, 0xbf800000, 0xd8d80000, 0x01000000, 0xbf8c0000, 0xe0704000, 0x705d0100, 0x807c037c, 0x80700370, 0xd5250000, 0x0001ff00, - 0x00000100, 0xbf0a7b7c, - 0xbf85fff4, 0xbefe03c1, - 0x907c9973, 0x877c817c, - 0xbf06817c, 0xbf850004, - 0xbef003ff, 0x00000200, - 0xbeff0380, 0xbf820003, - 0xbef003ff, 0x00000400, - 0xbeff03c1, 0xb97b3a05, - 0x807b817b, 0x8f7b827b, - 0x907c9973, 0x877c817c, - 0xbf06817c, 0xbf850017, + 0x00000080, 0xbf0a7b7c, + 0xbf85fff4, 0xbf820032, + 0xb97af803, 0x8a7a7aff, + 0x10000000, 0xbf85001d, + 0xd8d80000, 0x01000000, + 0xbf8c0000, 0xbe840380, + 0xd7600000, 0x00000901, + 0x80048104, 0xd7600001, + 0x00000901, 0x80048104, + 0xd7600002, 0x00000901, + 0x80048104, 0xd7600003, + 0x00000901, 0x80048104, + 0xf469003a, 0xe0000000, + 0x80709070, 0xbf06c004, + 0xbf84ffef, 0x807cff7c, + 0x00000100, 0xd5250000, + 0x0001ff00, 0x00000100, + 0xbf0a7b7c, 0xbf85ffe4, + 0xbf820011, 0xbe8303ff, + 0x00000100, 0xbf800000, + 0xbf800000, 0xbf800000, + 0xd8d80000, 0x01000000, + 0xbf8c0000, 0xe0704000, + 0x705d0100, 0x807c037c, + 0x80700370, 0xd5250000, + 0x0001ff00, 0x00000100, + 0xbf0a7b7c, 0xbf85fff4, + 0xbefe03c1, 0x907c9973, + 0x877c817c, 0xbf06817c, + 0xbf850004, 0xbef003ff, + 0x00000200, 0xbeff0380, + 0xbf820003, 0xbef003ff, + 0x00000400, 0xbeff03c1, + 0xb97b3a05, 0x807b817b, + 0x8f7b827b, 0x907c9973, + 0x877c817c, 0xbf06817c, + 0xbf85006b, 0xbef603ff, + 0x01000000, 0xbefc0384, + 0xbf0a7b7c, 0xbf8400fa, + 0xb97af803, 0x8a7a7aff, + 0x10000000, 0xbf850050, + 0x7e008700, 0x7e028701, + 0x7e048702, 0x7e068703, + 0xbe840380, 0xd7600000, + 0x00000900, 0x80048104, + 0xd7600001, 0x00000900, + 0x80048104, 0xd7600002, + 0x00000900, 0x80048104, + 0xd7600003, 0x00000900, + 0x80048104, 0xf469003a, + 0xe0000000, 0x80709070, + 0xbf06a004, 0xbf84ffef, + 0xbe840380, 0xd7600000, + 0x00000901, 0x80048104, + 0xd7600001, 0x00000901, + 0x80048104, 0xd7600002, + 0x00000901, 0x80048104, + 0xd7600003, 0x00000901, + 0x80048104, 0xf469003a, + 0xe0000000, 0x80709070, + 0xbf06a004, 0xbf84ffef, + 0xbe840380, 0xd7600000, + 0x00000902, 0x80048104, + 0xd7600001, 0x00000902, + 0x80048104, 0xd7600002, + 0x00000902, 0x80048104, + 0xd7600003, 0x00000902, + 0x80048104, 0xf469003a, + 0xe0000000, 0x80709070, + 0xbf06a004, 0xbf84ffef, + 0xbe840380, 0xd7600000, + 0x00000903, 0x80048104, + 0xd7600001, 0x00000903, + 0x80048104, 0xd7600002, + 0x00000903, 0x80048104, + 0xd7600003, 0x00000903, + 0x80048104, 0xf469003a, + 0xe0000000, 0x80709070, + 0xbf06a004, 0xbf84ffef, + 0x807c847c, 0xbf0a7b7c, + 0xbf85ffb1, 0xbf8200a6, + 0x7e008700, 0x7e028701, + 0x7e048702, 0x7e068703, + 0xe0704000, 0x705d0000, + 0xe0704080, 0x705d0100, + 0xe0704100, 0x705d0200, + 0xe0704180, 0x705d0300, + 0x807c847c, 0x8070ff70, + 0x00000200, 0xbf0a7b7c, + 0xbf85ffef, 0xbf820094, 0xbef603ff, 0x01000000, 0xbefc0384, 0xbf0a7b7c, - 0xbf840037, 0x7e008700, + 0xbf840065, 0xb97af803, + 0x8a7a7aff, 0x10000000, + 0xbf850050, 0x7e008700, + 0x7e028701, 0x7e048702, + 0x7e068703, 0xbe840380, + 0xd7600000, 0x00000900, + 0x80048104, 0xd7600001, + 0x00000900, 0x80048104, + 0xd7600002, 0x00000900, + 0x80048104, 0xd7600003, + 0x00000900, 0x80048104, + 0xf469003a, 0xe0000000, + 0x80709070, 0xbf06c004, + 0xbf84ffef, 0xbe840380, + 0xd7600000, 0x00000901, + 0x80048104, 0xd7600001, + 0x00000901, 0x80048104, + 0xd7600002, 0x00000901, + 0x80048104, 0xd7600003, + 0x00000901, 0x80048104, + 0xf469003a, 0xe0000000, + 0x80709070, 0xbf06c004, + 0xbf84ffef, 0xbe840380, + 0xd7600000, 0x00000902, + 0x80048104, 0xd7600001, + 0x00000902, 0x80048104, + 0xd7600002, 0x00000902, + 0x80048104, 0xd7600003, + 0x00000902, 0x80048104, + 0xf469003a, 0xe0000000, + 0x80709070, 0xbf06c004, + 0xbf84ffef, 0xbe840380, + 0xd7600000, 0x00000903, + 0x80048104, 0xd7600001, + 0x00000903, 0x80048104, + 0xd7600002, 0x00000903, + 0x80048104, 0xd7600003, + 0x00000903, 0x80048104, + 0xf469003a, 0xe0000000, + 0x80709070, 0xbf06c004, + 0xbf84ffef, 0x807c847c, + 0xbf0a7b7c, 0xbf85ffb1, + 0xbf82003b, 0x7e008700, 0x7e028701, 0x7e048702, 0x7e068703, 0xe0704000, - 0x705d0000, 0xe0704080, - 0x705d0100, 0xe0704100, - 0x705d0200, 0xe0704180, + 0x705d0000, 0xe0704100, + 0x705d0100, 0xe0704200, + 0x705d0200, 0xe0704300, 0x705d0300, 0x807c847c, - 0x8070ff70, 0x00000200, + 0x8070ff70, 0x00000400, 0xbf0a7b7c, 0xbf85ffef, - 0xbf820025, 0xbef603ff, - 0x01000000, 0xbefc0384, - 0xbf0a7b7c, 0xbf840011, - 0x7e008700, 0x7e028701, - 0x7e048702, 0x7e068703, - 0xe0704000, 0x705d0000, - 0xe0704100, 0x705d0100, - 0xe0704200, 0x705d0200, - 0xe0704300, 0x705d0300, - 0x807c847c, 0x8070ff70, - 0x00000400, 0xbf0a7b7c, - 0xbf85ffef, 0xb97b1e06, - 0x877bc17b, 0xbf84000c, - 0x8f7b837b, 0x807b7c7b, - 0xbefe03c1, 0xbeff0380, + 0xb97b1e06, 0x877bc17b, + 0xbf840027, 0x8f7b837b, + 0x807b7c7b, 0xbefe03c1, + 0xbeff0380, 0xb97af803, + 0x8a7a7aff, 0x10000000, + 0xbf850017, 0x7e008700, + 0xbe840380, 0xd7600000, + 0x00000900, 0x80048104, + 0xd7600001, 0x00000900, + 0x80048104, 0xd7600002, + 0x00000900, 0x80048104, + 0xd7600003, 0x00000900, + 0x80048104, 0xf469003a, + 0xe0000000, 0x80709070, + 0xbf06c004, 0xbf84ffef, + 0x807c817c, 0xbf0a7b7c, + 0xbf85ffea, 0xbf820008, 0x7e008700, 0xe0704000, 0x705d0000, 0x807c817c, 0x8070ff70, 0x00000080, 0xbf0a7b7c, 0xbf85fff8, - 0xbf820144, 0xbef4037e, + 0xbf82013f, 0xbef4037e, 0x8775ff7f, 0x0000ffff, 0x8875ff75, 0x00040000, 0xbef60380, 0xbef703ff, 0x10807fac, 0xb97202dc, 0x8f729972, 0x876eff7f, - 0x04000000, 0xbf840034, + 0x04000000, 0xbf840033, 0xbefe03c1, 0x907c9972, 0x877c817c, 0xbf06817c, 0xbf850002, 0xbeff0380, 0xbf820001, 0xbeff03c1, 0xb96f4306, 0x876fc16f, - 0xbf840029, 0x8f6f866f, - 0x8f6f826f, 0xbef6036f, - 0xb9783a05, 0x80788178, - 0xbf0d9972, 0xbf850002, - 0x8f788978, 0xbf820001, - 0x8f788a78, 0xb96e1e06, - 0x8f6e8a6e, 0x80786e78, - 0x8078ff78, 0x00000200, - 0x8078ff78, 0x00000080, - 0xbef603ff, 0x01000000, - 0x907c9972, 0x877c817c, - 0xbf06817c, 0xbefc0380, - 0xbf850009, 0xe0310000, - 0x781d0000, 0x807cff7c, - 0x00000080, 0x8078ff78, - 0x00000080, 0xbf0a6f7c, - 0xbf85fff8, 0xbf820008, + 0xbf840028, 0x8f6f886f, + 0xbef6036f, 0xb9783a05, + 0x80788178, 0xbf0d9972, + 0xbf850002, 0x8f788978, + 0xbf820001, 0x8f788a78, + 0xb96e1e06, 0x8f6e8a6e, + 0x80786e78, 0x8078ff78, + 0x00000200, 0x8078ff78, + 0x00000080, 0xbef603ff, + 0x01000000, 0x907c9972, + 0x877c817c, 0xbf06817c, + 0xbefc0380, 0xbf850009, 0xe0310000, 0x781d0000, - 0x807cff7c, 0x00000100, - 0x8078ff78, 0x00000100, + 0x807cff7c, 0x00000080, + 0x8078ff78, 0x00000080, 0xbf0a6f7c, 0xbf85fff8, - 0xbef80380, 0xbefe03c1, - 0x907c9972, 0x877c817c, - 0xbf06817c, 0xbf850002, - 0xbeff0380, 0xbf820001, - 0xbeff03c1, 0xb96f3a05, - 0x806f816f, 0x8f6f826f, - 0x907c9972, 0x877c817c, - 0xbf06817c, 0xbf850024, - 0xbef603ff, 0x01000000, - 0xbeee0378, 0x8078ff78, - 0x00000200, 0xbefc0384, - 0xbf0a6f7c, 0xbf840050, - 0xe0304000, 0x785d0000, - 0xe0304080, 0x785d0100, - 0xe0304100, 0x785d0200, - 0xe0304180, 0x785d0300, - 0xbf8c3f70, 0x7e008500, - 0x7e028501, 0x7e048502, - 0x7e068503, 0x807c847c, + 0xbf820008, 0xe0310000, + 0x781d0000, 0x807cff7c, + 0x00000100, 0x8078ff78, + 0x00000100, 0xbf0a6f7c, + 0xbf85fff8, 0xbef80380, + 0xbefe03c1, 0x907c9972, + 0x877c817c, 0xbf06817c, + 0xbf850002, 0xbeff0380, + 0xbf820001, 0xbeff03c1, + 0xb96f3a05, 0x806f816f, + 0x8f6f826f, 0x907c9972, + 0x877c817c, 0xbf06817c, + 0xbf850024, 0xbef603ff, + 0x01000000, 0xbeee0378, 0x8078ff78, 0x00000200, - 0xbf0a6f7c, 0xbf85ffee, - 0xe0304000, 0x6e5d0000, - 0xe0304080, 0x6e5d0100, - 0xe0304100, 0x6e5d0200, - 0xe0304180, 0x6e5d0300, - 0xbf8c3f70, 0xbf820034, - 0xbef603ff, 0x01000000, - 0xbeee0378, 0x8078ff78, - 0x00000400, 0xbefc0384, - 0xbf0a6f7c, 0xbf840012, - 0xe0304000, 0x785d0000, - 0xe0304100, 0x785d0100, - 0xe0304200, 0x785d0200, - 0xe0304300, 0x785d0300, - 0xbf8c3f70, 0x7e008500, - 0x7e028501, 0x7e048502, - 0x7e068503, 0x807c847c, + 0xbefc0384, 0xbf0a6f7c, + 0xbf840050, 0xe0304000, + 0x785d0000, 0xe0304080, + 0x785d0100, 0xe0304100, + 0x785d0200, 0xe0304180, + 0x785d0300, 0xbf8c0000, + 0x7e008500, 0x7e028501, + 0x7e048502, 0x7e068503, + 0x807c847c, 0x8078ff78, + 0x00000200, 0xbf0a6f7c, + 0xbf85ffee, 0xe0304000, + 0x6e5d0000, 0xe0304080, + 0x6e5d0100, 0xe0304100, + 0x6e5d0200, 0xe0304180, + 0x6e5d0300, 0xbf8c0000, + 0xbf820034, 0xbef603ff, + 0x01000000, 0xbeee0378, 0x8078ff78, 0x00000400, - 0xbf0a6f7c, 0xbf85ffee, - 0xb96f1e06, 0x876fc16f, - 0xbf84000e, 0x8f6f836f, - 0x806f7c6f, 0xbefe03c1, - 0xbeff0380, 0xe0304000, - 0x785d0000, 0xbf8c3f70, - 0x7e008500, 0x807c817c, - 0x8078ff78, 0x00000080, - 0xbf0a6f7c, 0xbf85fff7, - 0xbeff03c1, 0xe0304000, - 0x6e5d0000, 0xe0304100, - 0x6e5d0100, 0xe0304200, - 0x6e5d0200, 0xe0304300, - 0x6e5d0300, 0xbf8c3f70, + 0xbefc0384, 0xbf0a6f7c, + 0xbf840012, 0xe0304000, + 0x785d0000, 0xe0304100, + 0x785d0100, 0xe0304200, + 0x785d0200, 0xe0304300, + 0x785d0300, 0xbf8c0000, + 0x7e008500, 0x7e028501, + 0x7e048502, 0x7e068503, + 0x807c847c, 0x8078ff78, + 0x00000400, 0xbf0a6f7c, + 0xbf85ffee, 0xb96f1e06, + 0x876fc16f, 0xbf84000e, + 0x8f6f836f, 0x806f7c6f, + 0xbefe03c1, 0xbeff0380, + 0xe0304000, 0x785d0000, + 0xbf8c0000, 0x7e008500, + 0x807c817c, 0x8078ff78, + 0x00000080, 0xbf0a6f7c, + 0xbf85fff7, 0xbeff03c1, + 0xe0304000, 0x6e5d0000, + 0xe0304100, 0x6e5d0100, + 0xe0304200, 0x6e5d0200, + 0xe0304300, 0x6e5d0300, + 0xbf8c0000, 0xb9783a05, + 0x80788178, 0xbf0d9972, + 0xbf850002, 0x8f788978, + 0xbf820001, 0x8f788a78, + 0xb96e1e06, 0x8f6e8a6e, + 0x80786e78, 0x8078ff78, + 0x00000200, 0x80f8ff78, + 0x00000050, 0xbef603ff, + 0x01000000, 0xbefc03ff, + 0x0000006c, 0x80f89078, + 0xf429003a, 0xf0000000, + 0xbf8c0000, 0x80fc847c, + 0xbf800000, 0xbe803100, + 0xbe823102, 0x80f8a078, + 0xf42d003a, 0xf0000000, + 0xbf8c0000, 0x80fc887c, + 0xbf800000, 0xbe803100, + 0xbe823102, 0xbe843104, + 0xbe863106, 0x80f8c078, + 0xf431003a, 0xf0000000, + 0xbf8c0000, 0x80fc907c, + 0xbf800000, 0xbe803100, + 0xbe823102, 0xbe843104, + 0xbe863106, 0xbe883108, + 0xbe8a310a, 0xbe8c310c, + 0xbe8e310e, 0xbf06807c, + 0xbf84fff0, 0xba80f801, + 0x00000000, 0xbf8a0000, 0xb9783a05, 0x80788178, 0xbf0d9972, 0xbf850002, 0x8f788978, 0xbf820001, 0x8f788a78, 0xb96e1e06, 0x8f6e8a6e, 0x80786e78, 0x8078ff78, 0x00000200, - 0x80f8ff78, 0x00000050, 0xbef603ff, 0x01000000, - 0xbefc03ff, 0x0000006c, - 0x80f89078, 0xf429003a, - 0xf0000000, 0xbf8cc07f, - 0x80fc847c, 0xbf800000, - 0xbe803100, 0xbe823102, - 0x80f8a078, 0xf42d003a, - 0xf0000000, 0xbf8cc07f, - 0x80fc887c, 0xbf800000, - 0xbe803100, 0xbe823102, - 0xbe843104, 0xbe863106, - 0x80f8c078, 0xf431003a, - 0xf0000000, 0xbf8cc07f, - 0x80fc907c, 0xbf800000, - 0xbe803100, 0xbe823102, - 0xbe843104, 0xbe863106, - 0xbe883108, 0xbe8a310a, - 0xbe8c310c, 0xbe8e310e, - 0xbf06807c, 0xbf84fff0, - 0xba80f801, 0x00000000, - 0xbf8a0000, 0xb9783a05, - 0x80788178, 0xbf0d9972, - 0xbf850002, 0x8f788978, - 0xbf820001, 0x8f788a78, - 0xb96e1e06, 0x8f6e8a6e, - 0x80786e78, 0x8078ff78, - 0x00000200, 0xbef603ff, - 0x01000000, 0xf4211bfa, + 0xf4211bfa, 0xf0000000, + 0x80788478, 0xf4211b3a, 0xf0000000, 0x80788478, - 0xf4211b3a, 0xf0000000, - 0x80788478, 0xf4211b7a, + 0xf4211b7a, 0xf0000000, + 0x80788478, 0xf4211c3a, 0xf0000000, 0x80788478, - 0xf4211c3a, 0xf0000000, - 0x80788478, 0xf4211c7a, + 0xf4211c7a, 0xf0000000, + 0x80788478, 0xf4211eba, 0xf0000000, 0x80788478, - 0xf4211eba, 0xf0000000, - 0x80788478, 0xf4211efa, + 0xf4211efa, 0xf0000000, + 0x80788478, 0xf4211e7a, 0xf0000000, 0x80788478, - 0xf4211e7a, 0xf0000000, - 0x80788478, 0xf4211cfa, + 0xf4211cfa, 0xf0000000, + 0x80788478, 0xf4211bba, 0xf0000000, 0x80788478, + 0xbf8c0000, 0xb9eef814, 0xf4211bba, 0xf0000000, - 0x80788478, 0xbf8cc07f, - 0xb9eef814, 0xf4211bba, - 0xf0000000, 0x80788478, - 0xbf8cc07f, 0xb9eef815, - 0xbefc036f, 0xbefe0370, - 0xbeff0371, 0x876f7bff, - 0x000003ff, 0xb9ef4803, - 0xb9f9f816, 0x876f7bff, - 0xfffff800, 0x906f8b6f, - 0xb9efa2c3, 0xb9f3f801, - 0xb96e3a05, 0x806e816e, - 0xbf0d9972, 0xbf850002, - 0x8f6e896e, 0xbf820001, - 0x8f6e8a6e, 0xb96f1e06, - 0x8f6f8a6f, 0x806e6f6e, - 0x806eff6e, 0x00000200, - 0x806e746e, 0x826f8075, - 0x876fff6f, 0x0000ffff, - 0xf4091c37, 0xfa000050, - 0xf4091d37, 0xfa000060, - 0xf4011e77, 0xfa000074, - 0xbf8cc07f, 0x906e8977, - 0x876fff6e, 0x003f8000, - 0x906e8677, 0x876eff6e, - 0x02000000, 0x886e6f6e, - 0xb9eef807, 0x876dff6d, - 0x0000ffff, 0x87fe7e7e, - 0x87ea6a6a, 0xb9faf802, - 0xbe80226c, 0xbf9b0000, + 0x80788478, 0xbf8c0000, + 0xb9eef815, 0xbefc036f, + 0xbefe0370, 0xbeff0371, + 0xb9f9f816, 0xb9fb4803, + 0x907b8b7b, 0xb9fba2c3, + 0xb9f3f801, 0xb96e3a05, + 0x806e816e, 0xbf0d9972, + 0xbf850002, 0x8f6e896e, + 0xbf820001, 0x8f6e8a6e, + 0xb96f1e06, 0x8f6f8a6f, + 0x806e6f6e, 0x806eff6e, + 0x00000200, 0x806e746e, + 0x826f8075, 0x876fff6f, + 0x0000ffff, 0xf4091c37, + 0xfa000050, 0xf4091d37, + 0xfa000060, 0xf4011e77, + 0xfa000074, 0xbf8c0000, + 0x906e8977, 0x876fff6e, + 0x003f8000, 0x906e8677, + 0x876eff6e, 0x02000000, + 0x886e6f6e, 0xb9eef807, + 0x876dff6d, 0x0000ffff, + 0x87fe7e7e, 0x87ea6a6a, + 0xb9faf802, 0xbe80226c, + 0xbf9b0000, 0xbf9f0000, 0xbf9f0000, 0xbf9f0000, 0xbf9f0000, 0xbf9f0000, - 0xbf9f0000, 0x00000000, }; static const uint32_t cwsr_trap_arcturus_hex[] = { - 0xbf820001, 0xbf8202d4, + 0xbf820001, 0xbf8202d5, 0xb8f8f802, 0x8978ff78, 0x00020006, 0xb8fbf803, 0x866eff78, 0x00002000, @@ -1215,99 +1420,37 @@ static const uint32_t cwsr_trap_arcturus_hex[] = { 0xbefe007c, 0xbefc0070, 0xc0611c7a, 0x0000007c, 0xbf8cc07f, 0x80708470, - 0xbefc007e, 0x867aff7f, - 0x04000000, 0xbeef0080, - 0x876f6f7a, 0xb8f02a05, - 0x80708170, 0x8e708a70, - 0x8e708170, 0xb8fb1605, - 0x807b817b, 0x8e7b847b, - 0x8e76827b, 0xbef600ff, - 0x01000000, 0xbef20174, - 0x80747074, 0x82758075, - 0xbefc0080, 0xbf800000, - 0xbe802b00, 0xbe822b02, - 0xbe842b04, 0xbe862b06, - 0xbe882b08, 0xbe8a2b0a, - 0xbe8c2b0c, 0xbe8e2b0e, - 0xc06b003a, 0x00000000, - 0xbf8cc07f, 0xc06b013a, - 0x00000010, 0xbf8cc07f, - 0xc06b023a, 0x00000020, - 0xbf8cc07f, 0xc06b033a, - 0x00000030, 0xbf8cc07f, - 0x8074c074, 0x82758075, - 0x807c907c, 0xbf0a7b7c, - 0xbf85ffe7, 0xbef40172, - 0xbef00080, 0xbefe00c1, - 0xbeff00c1, 0xbee80080, - 0xbee90080, 0xbef600ff, - 0x01000000, 0x867aff78, - 0x00400000, 0xbf850003, - 0xb8faf803, 0x897a7aff, - 0x10000000, 0xbf85004d, - 0xbe840080, 0xd2890000, - 0x00000900, 0x80048104, - 0xd2890001, 0x00000900, - 0x80048104, 0xd2890002, - 0x00000900, 0x80048104, - 0xd2890003, 0x00000900, - 0x80048104, 0xc069003a, - 0x00000070, 0xbf8cc07f, - 0x80709070, 0xbf06c004, - 0xbf84ffee, 0xbe840080, - 0xd2890000, 0x00000901, - 0x80048104, 0xd2890001, - 0x00000901, 0x80048104, - 0xd2890002, 0x00000901, - 0x80048104, 0xd2890003, - 0x00000901, 0x80048104, - 0xc069003a, 0x00000070, - 0xbf8cc07f, 0x80709070, - 0xbf06c004, 0xbf84ffee, - 0xbe840080, 0xd2890000, - 0x00000902, 0x80048104, - 0xd2890001, 0x00000902, - 0x80048104, 0xd2890002, - 0x00000902, 0x80048104, - 0xd2890003, 0x00000902, - 0x80048104, 0xc069003a, - 0x00000070, 0xbf8cc07f, - 0x80709070, 0xbf06c004, - 0xbf84ffee, 0xbe840080, - 0xd2890000, 0x00000903, - 0x80048104, 0xd2890001, - 0x00000903, 0x80048104, - 0xd2890002, 0x00000903, - 0x80048104, 0xd2890003, - 0x00000903, 0x80048104, - 0xc069003a, 0x00000070, - 0xbf8cc07f, 0x80709070, - 0xbf06c004, 0xbf84ffee, - 0xbf820008, 0xe0724000, - 0x701d0000, 0xe0724100, - 0x701d0100, 0xe0724200, - 0x701d0200, 0xe0724300, - 0x701d0300, 0xbefe00c1, - 0xbeff00c1, 0xb8fb4306, - 0x867bc17b, 0xbf840064, - 0xbf8a0000, 0x867aff6f, - 0x04000000, 0xbf840060, - 0x8e7b867b, 0x8e7b827b, - 0xbef6007b, 0xb8f02a05, - 0x80708170, 0x8e708a70, - 0x8e708170, 0xb8fa1605, - 0x807a817a, 0x8e7a867a, - 0x80707a70, 0x8070ff70, - 0x00000080, 0xbef600ff, - 0x01000000, 0xbefc0080, - 0xd28c0002, 0x000100c1, - 0xd28d0003, 0x000204c1, + 0xbefc007e, 0xbf108080, + 0x867aff7f, 0x04000000, + 0xbeef0080, 0x876f6f7a, + 0xb8f02a05, 0x80708170, + 0x8e708a70, 0x8e708170, + 0xb8fb1605, 0x807b817b, + 0x8e7b847b, 0x8e76827b, + 0xbef600ff, 0x01000000, + 0xbef20174, 0x80747074, + 0x82758075, 0xbefc0080, + 0xbf800000, 0xbe802b00, + 0xbe822b02, 0xbe842b04, + 0xbe862b06, 0xbe882b08, + 0xbe8a2b0a, 0xbe8c2b0c, + 0xbe8e2b0e, 0xc06b003a, + 0x00000000, 0xbf8cc07f, + 0xc06b013a, 0x00000010, + 0xbf8cc07f, 0xc06b023a, + 0x00000020, 0xbf8cc07f, + 0xc06b033a, 0x00000030, + 0xbf8cc07f, 0x8074c074, + 0x82758075, 0x807c907c, + 0xbf0a7b7c, 0xbf85ffe7, + 0xbef40172, 0xbef00080, + 0xbefe00c1, 0xbeff00c1, + 0xbee80080, 0xbee90080, + 0xbef600ff, 0x01000000, 0x867aff78, 0x00400000, 0xbf850003, 0xb8faf803, 0x897a7aff, 0x10000000, - 0xbf850030, 0x24040682, - 0xd86e4000, 0x00000002, - 0xbf8cc07f, 0xbe840080, + 0xbf85004d, 0xbe840080, 0xd2890000, 0x00000900, 0x80048104, 0xd2890001, 0x00000900, 0x80048104, @@ -1326,31 +1469,50 @@ static const uint32_t cwsr_trap_arcturus_hex[] = { 0x80048104, 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, - 0xbf84ffee, 0x680404ff, - 0x00000200, 0xd0c9006a, - 0x0000f702, 0xbf87ffd2, - 0xbf820015, 0xd1060002, - 0x00011103, 0x7e0602ff, - 0x00000200, 0xbefc00ff, - 0x00010000, 0xbe800077, - 0x8677ff77, 0xff7fffff, - 0x8777ff77, 0x00058000, - 0xd8ec0000, 0x00000002, - 0xbf8cc07f, 0xe0765000, - 0x701d0002, 0x68040702, - 0xd0c9006a, 0x0000f702, - 0xbf87fff7, 0xbef70000, - 0xbef000ff, 0x00000400, + 0xbf84ffee, 0xbe840080, + 0xd2890000, 0x00000902, + 0x80048104, 0xd2890001, + 0x00000902, 0x80048104, + 0xd2890002, 0x00000902, + 0x80048104, 0xd2890003, + 0x00000902, 0x80048104, + 0xc069003a, 0x00000070, + 0xbf8cc07f, 0x80709070, + 0xbf06c004, 0xbf84ffee, + 0xbe840080, 0xd2890000, + 0x00000903, 0x80048104, + 0xd2890001, 0x00000903, + 0x80048104, 0xd2890002, + 0x00000903, 0x80048104, + 0xd2890003, 0x00000903, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0xbf820008, + 0xe0724000, 0x701d0000, + 0xe0724100, 0x701d0100, + 0xe0724200, 0x701d0200, + 0xe0724300, 0x701d0300, 0xbefe00c1, 0xbeff00c1, - 0xb8fb2a05, 0x807b817b, - 0x8e7b827b, 0xbef600ff, - 0x01000000, 0xbefc0084, - 0xbf0a7b7c, 0xbf84006d, - 0xbf11017c, 0x807bff7b, - 0x00001000, 0x867aff78, + 0xb8fb4306, 0x867bc17b, + 0xbf840064, 0xbf8a0000, + 0x867aff6f, 0x04000000, + 0xbf840060, 0x8e7b867b, + 0x8e7b827b, 0xbef6007b, + 0xb8f02a05, 0x80708170, + 0x8e708a70, 0x8e708170, + 0xb8fa1605, 0x807a817a, + 0x8e7a867a, 0x80707a70, + 0x8070ff70, 0x00000080, + 0xbef600ff, 0x01000000, + 0xbefc0080, 0xd28c0002, + 0x000100c1, 0xd28d0003, + 0x000204c1, 0x867aff78, 0x00400000, 0xbf850003, 0xb8faf803, 0x897a7aff, - 0x10000000, 0xbf850051, + 0x10000000, 0xbf850030, + 0x24040682, 0xd86e4000, + 0x00000002, 0xbf8cc07f, 0xbe840080, 0xd2890000, 0x00000900, 0x80048104, 0xd2890001, 0x00000900, @@ -1370,215 +1532,259 @@ static const uint32_t cwsr_trap_arcturus_hex[] = { 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, 0xbf84ffee, + 0x680404ff, 0x00000200, + 0xd0c9006a, 0x0000f702, + 0xbf87ffd2, 0xbf820015, + 0xd1060002, 0x00011103, + 0x7e0602ff, 0x00000200, + 0xbefc00ff, 0x00010000, + 0xbe800077, 0x8677ff77, + 0xff7fffff, 0x8777ff77, + 0x00058000, 0xd8ec0000, + 0x00000002, 0xbf8cc07f, + 0xe0765000, 0x701d0002, + 0x68040702, 0xd0c9006a, + 0x0000f702, 0xbf87fff7, + 0xbef70000, 0xbef000ff, + 0x00000400, 0xbefe00c1, + 0xbeff00c1, 0xb8fb2a05, + 0x807b817b, 0x8e7b827b, + 0xbef600ff, 0x01000000, + 0xbefc0084, 0xbf0a7b7c, + 0xbf84006d, 0xbf11017c, + 0x807bff7b, 0x00001000, + 0x867aff78, 0x00400000, + 0xbf850003, 0xb8faf803, + 0x897a7aff, 0x10000000, + 0xbf850051, 0xbe840080, + 0xd2890000, 0x00000900, + 0x80048104, 0xd2890001, + 0x00000900, 0x80048104, + 0xd2890002, 0x00000900, + 0x80048104, 0xd2890003, + 0x00000900, 0x80048104, + 0xc069003a, 0x00000070, + 0xbf8cc07f, 0x80709070, + 0xbf06c004, 0xbf84ffee, 0xbe840080, 0xd2890000, - 0x00000902, 0x80048104, - 0xd2890001, 0x00000902, + 0x00000901, 0x80048104, + 0xd2890001, 0x00000901, 0x80048104, 0xd2890002, - 0x00000902, 0x80048104, - 0xd2890003, 0x00000902, + 0x00000901, 0x80048104, + 0xd2890003, 0x00000901, 0x80048104, 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, 0xbf84ffee, 0xbe840080, - 0xd2890000, 0x00000903, + 0xd2890000, 0x00000902, 0x80048104, 0xd2890001, - 0x00000903, 0x80048104, - 0xd2890002, 0x00000903, + 0x00000902, 0x80048104, + 0xd2890002, 0x00000902, 0x80048104, 0xd2890003, - 0x00000903, 0x80048104, + 0x00000902, 0x80048104, 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, 0xbf84ffee, - 0x807c847c, 0xbf0a7b7c, - 0xbf85ffb1, 0xbf9c0000, - 0xbf820012, 0x7e000300, - 0x7e020301, 0x7e040302, - 0x7e060303, 0xe0724000, - 0x701d0000, 0xe0724100, - 0x701d0100, 0xe0724200, - 0x701d0200, 0xe0724300, - 0x701d0300, 0x807c847c, - 0x8070ff70, 0x00000400, - 0xbf0a7b7c, 0xbf85ffef, - 0xbf9c0000, 0xbefc0080, - 0xbf11017c, 0x867aff78, - 0x00400000, 0xbf850003, - 0xb8faf803, 0x897a7aff, - 0x10000000, 0xbf850059, - 0xd3d84000, 0x18000100, - 0xd3d84001, 0x18000101, - 0xd3d84002, 0x18000102, - 0xd3d84003, 0x18000103, 0xbe840080, 0xd2890000, - 0x00000900, 0x80048104, - 0xd2890001, 0x00000900, + 0x00000903, 0x80048104, + 0xd2890001, 0x00000903, 0x80048104, 0xd2890002, - 0x00000900, 0x80048104, - 0xd2890003, 0x00000900, + 0x00000903, 0x80048104, + 0xd2890003, 0x00000903, 0x80048104, 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, - 0xbf84ffee, 0xbe840080, - 0xd2890000, 0x00000901, + 0xbf84ffee, 0x807c847c, + 0xbf0a7b7c, 0xbf85ffb1, + 0xbf9c0000, 0xbf820012, + 0x7e000300, 0x7e020301, + 0x7e040302, 0x7e060303, + 0xe0724000, 0x701d0000, + 0xe0724100, 0x701d0100, + 0xe0724200, 0x701d0200, + 0xe0724300, 0x701d0300, + 0x807c847c, 0x8070ff70, + 0x00000400, 0xbf0a7b7c, + 0xbf85ffef, 0xbf9c0000, + 0xbefc0080, 0xbf11017c, + 0x867aff78, 0x00400000, + 0xbf850003, 0xb8faf803, + 0x897a7aff, 0x10000000, + 0xbf850059, 0xd3d84000, + 0x18000100, 0xd3d84001, + 0x18000101, 0xd3d84002, + 0x18000102, 0xd3d84003, + 0x18000103, 0xbe840080, + 0xd2890000, 0x00000900, 0x80048104, 0xd2890001, - 0x00000901, 0x80048104, - 0xd2890002, 0x00000901, + 0x00000900, 0x80048104, + 0xd2890002, 0x00000900, 0x80048104, 0xd2890003, - 0x00000901, 0x80048104, + 0x00000900, 0x80048104, 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, 0xbf84ffee, 0xbe840080, 0xd2890000, - 0x00000902, 0x80048104, - 0xd2890001, 0x00000902, + 0x00000901, 0x80048104, + 0xd2890001, 0x00000901, 0x80048104, 0xd2890002, - 0x00000902, 0x80048104, - 0xd2890003, 0x00000902, + 0x00000901, 0x80048104, + 0xd2890003, 0x00000901, 0x80048104, 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, 0xbf84ffee, 0xbe840080, - 0xd2890000, 0x00000903, + 0xd2890000, 0x00000902, 0x80048104, 0xd2890001, - 0x00000903, 0x80048104, - 0xd2890002, 0x00000903, + 0x00000902, 0x80048104, + 0xd2890002, 0x00000902, 0x80048104, 0xd2890003, - 0x00000903, 0x80048104, + 0x00000902, 0x80048104, 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, 0xbf84ffee, - 0x807c847c, 0xbf0a7b7c, - 0xbf85ffa9, 0xbf9c0000, - 0xbf820016, 0xd3d84000, - 0x18000100, 0xd3d84001, - 0x18000101, 0xd3d84002, - 0x18000102, 0xd3d84003, - 0x18000103, 0xe0724000, - 0x701d0000, 0xe0724100, - 0x701d0100, 0xe0724200, - 0x701d0200, 0xe0724300, - 0x701d0300, 0x807c847c, - 0x8070ff70, 0x00000400, - 0xbf0a7b7c, 0xbf85ffeb, - 0xbf9c0000, 0xbf8200e3, - 0xbef4007e, 0x8675ff7f, - 0x0000ffff, 0x8775ff75, - 0x00040000, 0xbef60080, - 0xbef700ff, 0x00807fac, - 0x866eff7f, 0x04000000, - 0xbf84001f, 0xbefe00c1, - 0xbeff00c1, 0xb8ef4306, - 0x866fc16f, 0xbf84001a, - 0x8e6f866f, 0x8e6f826f, - 0xbef6006f, 0xb8f82a05, - 0x80788178, 0x8e788a78, - 0x8e788178, 0xb8ee1605, - 0x806e816e, 0x8e6e866e, - 0x80786e78, 0x8078ff78, - 0x00000080, 0xbef600ff, - 0x01000000, 0xbefc0080, - 0xe0510000, 0x781d0000, - 0xe0510100, 0x781d0000, - 0x807cff7c, 0x00000200, - 0x8078ff78, 0x00000200, - 0xbf0a6f7c, 0xbf85fff6, + 0xbe840080, 0xd2890000, + 0x00000903, 0x80048104, + 0xd2890001, 0x00000903, + 0x80048104, 0xd2890002, + 0x00000903, 0x80048104, + 0xd2890003, 0x00000903, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0x807c847c, + 0xbf0a7b7c, 0xbf85ffa9, + 0xbf9c0000, 0xbf820016, + 0xd3d84000, 0x18000100, + 0xd3d84001, 0x18000101, + 0xd3d84002, 0x18000102, + 0xd3d84003, 0x18000103, + 0xe0724000, 0x701d0000, + 0xe0724100, 0x701d0100, + 0xe0724200, 0x701d0200, + 0xe0724300, 0x701d0300, + 0x807c847c, 0x8070ff70, + 0x00000400, 0xbf0a7b7c, + 0xbf85ffeb, 0xbf9c0000, + 0xbf8200e3, 0xbef4007e, + 0x8675ff7f, 0x0000ffff, + 0x8775ff75, 0x00040000, + 0xbef60080, 0xbef700ff, + 0x00807fac, 0x866eff7f, + 0x04000000, 0xbf84001f, 0xbefe00c1, 0xbeff00c1, + 0xb8ef4306, 0x866fc16f, + 0xbf84001a, 0x8e6f866f, + 0x8e6f826f, 0xbef6006f, + 0xb8f82a05, 0x80788178, + 0x8e788a78, 0x8e788178, + 0xb8ee1605, 0x806e816e, + 0x8e6e866e, 0x80786e78, + 0x8078ff78, 0x00000080, 0xbef600ff, 0x01000000, - 0xb8ef2a05, 0x806f816f, - 0x8e6f826f, 0x806fff6f, - 0x00008000, 0xbef80080, - 0xbeee0078, 0x8078ff78, - 0x00000400, 0xbefc0084, - 0xbf11087c, 0xe0524000, - 0x781d0000, 0xe0524100, - 0x781d0100, 0xe0524200, - 0x781d0200, 0xe0524300, - 0x781d0300, 0xbf8c0f70, - 0x7e000300, 0x7e020301, - 0x7e040302, 0x7e060303, - 0x807c847c, 0x8078ff78, - 0x00000400, 0xbf0a6f7c, - 0xbf85ffee, 0xbefc0080, - 0xbf11087c, 0xe0524000, - 0x781d0000, 0xe0524100, - 0x781d0100, 0xe0524200, - 0x781d0200, 0xe0524300, - 0x781d0300, 0xbf8c0f70, - 0xd3d94000, 0x18000100, - 0xd3d94001, 0x18000101, - 0xd3d94002, 0x18000102, - 0xd3d94003, 0x18000103, - 0x807c847c, 0x8078ff78, - 0x00000400, 0xbf0a6f7c, - 0xbf85ffea, 0xbf9c0000, - 0xe0524000, 0x6e1d0000, - 0xe0524100, 0x6e1d0100, - 0xe0524200, 0x6e1d0200, - 0xe0524300, 0x6e1d0300, - 0xbf8c0f70, 0xb8f82a05, - 0x80788178, 0x8e788a78, - 0x8e788178, 0xb8ee1605, - 0x806e816e, 0x8e6e866e, - 0x80786e78, 0x80f8c078, - 0xb8ef1605, 0x806f816f, - 0x8e6f846f, 0x8e76826f, - 0xbef600ff, 0x01000000, - 0xbefc006f, 0xc031003a, - 0x00000078, 0x80f8c078, - 0xbf8cc07f, 0x80fc907c, - 0xbf800000, 0xbe802d00, - 0xbe822d02, 0xbe842d04, - 0xbe862d06, 0xbe882d08, - 0xbe8a2d0a, 0xbe8c2d0c, - 0xbe8e2d0e, 0xbf06807c, - 0xbf84fff0, 0xb8f82a05, - 0x80788178, 0x8e788a78, - 0x8e788178, 0xb8ee1605, - 0x806e816e, 0x8e6e866e, - 0x80786e78, 0xbef60084, - 0xbef600ff, 0x01000000, - 0xc0211bfa, 0x00000078, - 0x80788478, 0xc0211b3a, + 0xbefc0080, 0xe0510000, + 0x781d0000, 0xe0510100, + 0x781d0000, 0x807cff7c, + 0x00000200, 0x8078ff78, + 0x00000200, 0xbf0a6f7c, + 0xbf85fff6, 0xbefe00c1, + 0xbeff00c1, 0xbef600ff, + 0x01000000, 0xb8ef2a05, + 0x806f816f, 0x8e6f826f, + 0x806fff6f, 0x00008000, + 0xbef80080, 0xbeee0078, + 0x8078ff78, 0x00000400, + 0xbefc0084, 0xbf11087c, + 0xe0524000, 0x781d0000, + 0xe0524100, 0x781d0100, + 0xe0524200, 0x781d0200, + 0xe0524300, 0x781d0300, + 0xbf8c0f70, 0x7e000300, + 0x7e020301, 0x7e040302, + 0x7e060303, 0x807c847c, + 0x8078ff78, 0x00000400, + 0xbf0a6f7c, 0xbf85ffee, + 0xbefc0080, 0xbf11087c, + 0xe0524000, 0x781d0000, + 0xe0524100, 0x781d0100, + 0xe0524200, 0x781d0200, + 0xe0524300, 0x781d0300, + 0xbf8c0f70, 0xd3d94000, + 0x18000100, 0xd3d94001, + 0x18000101, 0xd3d94002, + 0x18000102, 0xd3d94003, + 0x18000103, 0x807c847c, + 0x8078ff78, 0x00000400, + 0xbf0a6f7c, 0xbf85ffea, + 0xbf9c0000, 0xe0524000, + 0x6e1d0000, 0xe0524100, + 0x6e1d0100, 0xe0524200, + 0x6e1d0200, 0xe0524300, + 0x6e1d0300, 0xbf8c0f70, + 0xb8f82a05, 0x80788178, + 0x8e788a78, 0x8e788178, + 0xb8ee1605, 0x806e816e, + 0x8e6e866e, 0x80786e78, + 0x80f8c078, 0xb8ef1605, + 0x806f816f, 0x8e6f846f, + 0x8e76826f, 0xbef600ff, + 0x01000000, 0xbefc006f, + 0xc031003a, 0x00000078, + 0x80f8c078, 0xbf8cc07f, + 0x80fc907c, 0xbf800000, + 0xbe802d00, 0xbe822d02, + 0xbe842d04, 0xbe862d06, + 0xbe882d08, 0xbe8a2d0a, + 0xbe8c2d0c, 0xbe8e2d0e, + 0xbf06807c, 0xbf84fff0, + 0xb8f82a05, 0x80788178, + 0x8e788a78, 0x8e788178, + 0xb8ee1605, 0x806e816e, + 0x8e6e866e, 0x80786e78, + 0xbef60084, 0xbef600ff, + 0x01000000, 0xc0211bfa, 0x00000078, 0x80788478, - 0xc0211b7a, 0x00000078, - 0x80788478, 0xc0211c3a, + 0xc0211b3a, 0x00000078, + 0x80788478, 0xc0211b7a, 0x00000078, 0x80788478, - 0xc0211c7a, 0x00000078, - 0x80788478, 0xc0211eba, + 0xc0211c3a, 0x00000078, + 0x80788478, 0xc0211c7a, 0x00000078, 0x80788478, - 0xc0211efa, 0x00000078, - 0x80788478, 0xc0211a3a, + 0xc0211eba, 0x00000078, + 0x80788478, 0xc0211efa, 0x00000078, 0x80788478, - 0xc0211a7a, 0x00000078, - 0x80788478, 0xc0211cfa, + 0xc0211a3a, 0x00000078, + 0x80788478, 0xc0211a7a, 0x00000078, 0x80788478, - 0xbf8cc07f, 0xbefc006f, - 0xbefe0070, 0xbeff0071, - 0x866f7bff, 0x000003ff, - 0xb96f4803, 0x866f7bff, - 0xfffff800, 0x8f6f8b6f, - 0xb96fa2c3, 0xb973f801, - 0xb8ee2a05, 0x806e816e, - 0x8e6e8a6e, 0x8e6e816e, - 0xb8ef1605, 0x806f816f, - 0x8e6f866f, 0x806e6f6e, - 0x806e746e, 0x826f8075, - 0x866fff6f, 0x0000ffff, - 0xc00b1c37, 0x00000050, - 0xc00b1d37, 0x00000060, - 0xc0031e77, 0x00000074, - 0xbf8cc07f, 0x8f6e8b77, - 0x866eff6e, 0x001f8000, - 0xb96ef807, 0x866dff6d, - 0x0000ffff, 0x86fe7e7e, - 0x86ea6a6a, 0x8f6e837a, - 0xb96ee0c2, 0xbf800002, - 0xb97a0002, 0xbf8a0000, - 0xbe801f6c, 0xbf9b0000, + 0xc0211cfa, 0x00000078, + 0x80788478, 0xbf8cc07f, + 0xbefc006f, 0xbefe0070, + 0xbeff0071, 0x866f7bff, + 0x000003ff, 0xb96f4803, + 0x866f7bff, 0xfffff800, + 0x8f6f8b6f, 0xb96fa2c3, + 0xb973f801, 0xb8ee2a05, + 0x806e816e, 0x8e6e8a6e, + 0x8e6e816e, 0xb8ef1605, + 0x806f816f, 0x8e6f866f, + 0x806e6f6e, 0x806e746e, + 0x826f8075, 0x866fff6f, + 0x0000ffff, 0xc00b1c37, + 0x00000050, 0xc00b1d37, + 0x00000060, 0xc0031e77, + 0x00000074, 0xbf8cc07f, + 0x8f6e8b77, 0x866eff6e, + 0x001f8000, 0xb96ef807, + 0x866dff6d, 0x0000ffff, + 0x86fe7e7e, 0x86ea6a6a, + 0x8f6e837a, 0xb96ee0c2, + 0xbf800002, 0xb97a0002, + 0xbf8a0000, 0xbe801f6c, + 0xbf9b0000, 0x00000000, }; static const uint32_t cwsr_trap_aldebaran_hex[] = { - 0xbf820001, 0xbf8202df, + 0xbf820001, 0xbf8202e0, 0xb8f8f802, 0x8978ff78, 0x00020006, 0xb8fbf803, 0x866eff78, 0x00002000, @@ -1695,99 +1901,37 @@ static const uint32_t cwsr_trap_aldebaran_hex[] = { 0xbefe007c, 0xbefc0070, 0xc0611c7a, 0x0000007c, 0xbf8cc07f, 0x80708470, - 0xbefc007e, 0x867aff7f, - 0x04000000, 0xbeef0080, - 0x876f6f7a, 0xb8f02985, - 0x80708170, 0x8e708a70, - 0x8e708170, 0xb8fb1605, - 0x807b817b, 0x8e7b847b, - 0x8e76827b, 0xbef600ff, - 0x01000000, 0xbef20174, - 0x80747074, 0x82758075, - 0xbefc0080, 0xbf800000, - 0xbe802b00, 0xbe822b02, - 0xbe842b04, 0xbe862b06, - 0xbe882b08, 0xbe8a2b0a, - 0xbe8c2b0c, 0xbe8e2b0e, - 0xc06b003a, 0x00000000, - 0xbf8cc07f, 0xc06b013a, - 0x00000010, 0xbf8cc07f, - 0xc06b023a, 0x00000020, - 0xbf8cc07f, 0xc06b033a, - 0x00000030, 0xbf8cc07f, - 0x8074c074, 0x82758075, - 0x807c907c, 0xbf0a7b7c, - 0xbf85ffe7, 0xbef40172, - 0xbef00080, 0xbefe00c1, - 0xbeff00c1, 0xbee80080, - 0xbee90080, 0xbef600ff, - 0x01000000, 0x867aff78, - 0x00400000, 0xbf850003, - 0xb8faf803, 0x897a7aff, - 0x10000000, 0xbf85004d, - 0xbe840080, 0xd2890000, - 0x00000900, 0x80048104, - 0xd2890001, 0x00000900, - 0x80048104, 0xd2890002, - 0x00000900, 0x80048104, - 0xd2890003, 0x00000900, - 0x80048104, 0xc069003a, - 0x00000070, 0xbf8cc07f, - 0x80709070, 0xbf06c004, - 0xbf84ffee, 0xbe840080, - 0xd2890000, 0x00000901, - 0x80048104, 0xd2890001, - 0x00000901, 0x80048104, - 0xd2890002, 0x00000901, - 0x80048104, 0xd2890003, - 0x00000901, 0x80048104, - 0xc069003a, 0x00000070, - 0xbf8cc07f, 0x80709070, - 0xbf06c004, 0xbf84ffee, - 0xbe840080, 0xd2890000, - 0x00000902, 0x80048104, - 0xd2890001, 0x00000902, - 0x80048104, 0xd2890002, - 0x00000902, 0x80048104, - 0xd2890003, 0x00000902, - 0x80048104, 0xc069003a, - 0x00000070, 0xbf8cc07f, - 0x80709070, 0xbf06c004, - 0xbf84ffee, 0xbe840080, - 0xd2890000, 0x00000903, - 0x80048104, 0xd2890001, - 0x00000903, 0x80048104, - 0xd2890002, 0x00000903, - 0x80048104, 0xd2890003, - 0x00000903, 0x80048104, - 0xc069003a, 0x00000070, - 0xbf8cc07f, 0x80709070, - 0xbf06c004, 0xbf84ffee, - 0xbf820008, 0xe0724000, - 0x701d0000, 0xe0724100, - 0x701d0100, 0xe0724200, - 0x701d0200, 0xe0724300, - 0x701d0300, 0xbefe00c1, - 0xbeff00c1, 0xb8fb4306, - 0x867bc17b, 0xbf840064, - 0xbf8a0000, 0x867aff6f, - 0x04000000, 0xbf840060, - 0x8e7b867b, 0x8e7b827b, - 0xbef6007b, 0xb8f02985, - 0x80708170, 0x8e708a70, - 0x8e708170, 0xb8fa1605, - 0x807a817a, 0x8e7a867a, - 0x80707a70, 0x8070ff70, - 0x00000080, 0xbef600ff, - 0x01000000, 0xbefc0080, - 0xd28c0002, 0x000100c1, - 0xd28d0003, 0x000204c1, + 0xbefc007e, 0xbf108080, + 0x867aff7f, 0x04000000, + 0xbeef0080, 0x876f6f7a, + 0xb8f02985, 0x80708170, + 0x8e708a70, 0x8e708170, + 0xb8fb1605, 0x807b817b, + 0x8e7b847b, 0x8e76827b, + 0xbef600ff, 0x01000000, + 0xbef20174, 0x80747074, + 0x82758075, 0xbefc0080, + 0xbf800000, 0xbe802b00, + 0xbe822b02, 0xbe842b04, + 0xbe862b06, 0xbe882b08, + 0xbe8a2b0a, 0xbe8c2b0c, + 0xbe8e2b0e, 0xc06b003a, + 0x00000000, 0xbf8cc07f, + 0xc06b013a, 0x00000010, + 0xbf8cc07f, 0xc06b023a, + 0x00000020, 0xbf8cc07f, + 0xc06b033a, 0x00000030, + 0xbf8cc07f, 0x8074c074, + 0x82758075, 0x807c907c, + 0xbf0a7b7c, 0xbf85ffe7, + 0xbef40172, 0xbef00080, + 0xbefe00c1, 0xbeff00c1, + 0xbee80080, 0xbee90080, + 0xbef600ff, 0x01000000, 0x867aff78, 0x00400000, 0xbf850003, 0xb8faf803, 0x897a7aff, 0x10000000, - 0xbf850030, 0x24040682, - 0xd86e4000, 0x00000002, - 0xbf8cc07f, 0xbe840080, + 0xbf85004d, 0xbe840080, 0xd2890000, 0x00000900, 0x80048104, 0xd2890001, 0x00000900, 0x80048104, @@ -1806,31 +1950,50 @@ static const uint32_t cwsr_trap_aldebaran_hex[] = { 0x80048104, 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, - 0xbf84ffee, 0x680404ff, - 0x00000200, 0xd0c9006a, - 0x0000f702, 0xbf87ffd2, - 0xbf820015, 0xd1060002, - 0x00011103, 0x7e0602ff, - 0x00000200, 0xbefc00ff, - 0x00010000, 0xbe800077, - 0x8677ff77, 0xff7fffff, - 0x8777ff77, 0x00058000, - 0xd8ec0000, 0x00000002, - 0xbf8cc07f, 0xe0765000, - 0x701d0002, 0x68040702, - 0xd0c9006a, 0x0000f702, - 0xbf87fff7, 0xbef70000, - 0xbef000ff, 0x00000400, + 0xbf84ffee, 0xbe840080, + 0xd2890000, 0x00000902, + 0x80048104, 0xd2890001, + 0x00000902, 0x80048104, + 0xd2890002, 0x00000902, + 0x80048104, 0xd2890003, + 0x00000902, 0x80048104, + 0xc069003a, 0x00000070, + 0xbf8cc07f, 0x80709070, + 0xbf06c004, 0xbf84ffee, + 0xbe840080, 0xd2890000, + 0x00000903, 0x80048104, + 0xd2890001, 0x00000903, + 0x80048104, 0xd2890002, + 0x00000903, 0x80048104, + 0xd2890003, 0x00000903, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0xbf820008, + 0xe0724000, 0x701d0000, + 0xe0724100, 0x701d0100, + 0xe0724200, 0x701d0200, + 0xe0724300, 0x701d0300, 0xbefe00c1, 0xbeff00c1, - 0xb8fb2b05, 0x807b817b, - 0x8e7b827b, 0xbef600ff, - 0x01000000, 0xbefc0084, - 0xbf0a7b7c, 0xbf84006d, - 0xbf11017c, 0x807bff7b, - 0x00001000, 0x867aff78, + 0xb8fb4306, 0x867bc17b, + 0xbf840064, 0xbf8a0000, + 0x867aff6f, 0x04000000, + 0xbf840060, 0x8e7b867b, + 0x8e7b827b, 0xbef6007b, + 0xb8f02985, 0x80708170, + 0x8e708a70, 0x8e708170, + 0xb8fa1605, 0x807a817a, + 0x8e7a867a, 0x80707a70, + 0x8070ff70, 0x00000080, + 0xbef600ff, 0x01000000, + 0xbefc0080, 0xd28c0002, + 0x000100c1, 0xd28d0003, + 0x000204c1, 0x867aff78, 0x00400000, 0xbf850003, 0xb8faf803, 0x897a7aff, - 0x10000000, 0xbf850051, + 0x10000000, 0xbf850030, + 0x24040682, 0xd86e4000, + 0x00000002, 0xbf8cc07f, 0xbe840080, 0xd2890000, 0x00000900, 0x80048104, 0xd2890001, 0x00000900, @@ -1850,51 +2013,31 @@ static const uint32_t cwsr_trap_aldebaran_hex[] = { 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, 0xbf84ffee, - 0xbe840080, 0xd2890000, - 0x00000902, 0x80048104, - 0xd2890001, 0x00000902, - 0x80048104, 0xd2890002, - 0x00000902, 0x80048104, - 0xd2890003, 0x00000902, - 0x80048104, 0xc069003a, - 0x00000070, 0xbf8cc07f, - 0x80709070, 0xbf06c004, - 0xbf84ffee, 0xbe840080, - 0xd2890000, 0x00000903, - 0x80048104, 0xd2890001, - 0x00000903, 0x80048104, - 0xd2890002, 0x00000903, - 0x80048104, 0xd2890003, - 0x00000903, 0x80048104, - 0xc069003a, 0x00000070, - 0xbf8cc07f, 0x80709070, - 0xbf06c004, 0xbf84ffee, - 0x807c847c, 0xbf0a7b7c, - 0xbf85ffb1, 0xbf9c0000, - 0xbf820012, 0x7e000300, - 0x7e020301, 0x7e040302, - 0x7e060303, 0xe0724000, - 0x701d0000, 0xe0724100, - 0x701d0100, 0xe0724200, - 0x701d0200, 0xe0724300, - 0x701d0300, 0x807c847c, - 0x8070ff70, 0x00000400, - 0xbf0a7b7c, 0xbf85ffef, - 0xbf9c0000, 0xb8fb2985, - 0x807b817b, 0x8e7b837b, - 0xb8fa2b05, 0x807a817a, - 0x8e7a827a, 0x80fb7a7b, - 0x867b7b7b, 0xbf84007a, + 0x680404ff, 0x00000200, + 0xd0c9006a, 0x0000f702, + 0xbf87ffd2, 0xbf820015, + 0xd1060002, 0x00011103, + 0x7e0602ff, 0x00000200, + 0xbefc00ff, 0x00010000, + 0xbe800077, 0x8677ff77, + 0xff7fffff, 0x8777ff77, + 0x00058000, 0xd8ec0000, + 0x00000002, 0xbf8cc07f, + 0xe0765000, 0x701d0002, + 0x68040702, 0xd0c9006a, + 0x0000f702, 0xbf87fff7, + 0xbef70000, 0xbef000ff, + 0x00000400, 0xbefe00c1, + 0xbeff00c1, 0xb8fb2b05, + 0x807b817b, 0x8e7b827b, + 0xbef600ff, 0x01000000, + 0xbefc0084, 0xbf0a7b7c, + 0xbf84006d, 0xbf11017c, 0x807bff7b, 0x00001000, - 0xbefc0080, 0xbf11017c, 0x867aff78, 0x00400000, 0xbf850003, 0xb8faf803, 0x897a7aff, 0x10000000, - 0xbf850059, 0xd3d84000, - 0x18000100, 0xd3d84001, - 0x18000101, 0xd3d84002, - 0x18000102, 0xd3d84003, - 0x18000103, 0xbe840080, + 0xbf850051, 0xbe840080, 0xd2890000, 0x00000900, 0x80048104, 0xd2890001, 0x00000900, 0x80048104, @@ -1933,143 +2076,207 @@ static const uint32_t cwsr_trap_aldebaran_hex[] = { 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, 0xbf84ffee, 0x807c847c, - 0xbf0a7b7c, 0xbf85ffa9, - 0xbf9c0000, 0xbf820016, - 0xd3d84000, 0x18000100, - 0xd3d84001, 0x18000101, - 0xd3d84002, 0x18000102, - 0xd3d84003, 0x18000103, + 0xbf0a7b7c, 0xbf85ffb1, + 0xbf9c0000, 0xbf820012, + 0x7e000300, 0x7e020301, + 0x7e040302, 0x7e060303, 0xe0724000, 0x701d0000, 0xe0724100, 0x701d0100, 0xe0724200, 0x701d0200, 0xe0724300, 0x701d0300, 0x807c847c, 0x8070ff70, 0x00000400, 0xbf0a7b7c, - 0xbf85ffeb, 0xbf9c0000, - 0xbf8200ee, 0xbef4007e, - 0x8675ff7f, 0x0000ffff, - 0x8775ff75, 0x00040000, - 0xbef60080, 0xbef700ff, - 0x00807fac, 0x866eff7f, - 0x04000000, 0xbf84001f, + 0xbf85ffef, 0xbf9c0000, + 0xb8fb2985, 0x807b817b, + 0x8e7b837b, 0xb8fa2b05, + 0x807a817a, 0x8e7a827a, + 0x80fb7a7b, 0x867b7b7b, + 0xbf84007a, 0x807bff7b, + 0x00001000, 0xbefc0080, + 0xbf11017c, 0x867aff78, + 0x00400000, 0xbf850003, + 0xb8faf803, 0x897a7aff, + 0x10000000, 0xbf850059, + 0xd3d84000, 0x18000100, + 0xd3d84001, 0x18000101, + 0xd3d84002, 0x18000102, + 0xd3d84003, 0x18000103, + 0xbe840080, 0xd2890000, + 0x00000900, 0x80048104, + 0xd2890001, 0x00000900, + 0x80048104, 0xd2890002, + 0x00000900, 0x80048104, + 0xd2890003, 0x00000900, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0xbe840080, + 0xd2890000, 0x00000901, + 0x80048104, 0xd2890001, + 0x00000901, 0x80048104, + 0xd2890002, 0x00000901, + 0x80048104, 0xd2890003, + 0x00000901, 0x80048104, + 0xc069003a, 0x00000070, + 0xbf8cc07f, 0x80709070, + 0xbf06c004, 0xbf84ffee, + 0xbe840080, 0xd2890000, + 0x00000902, 0x80048104, + 0xd2890001, 0x00000902, + 0x80048104, 0xd2890002, + 0x00000902, 0x80048104, + 0xd2890003, 0x00000902, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0xbe840080, + 0xd2890000, 0x00000903, + 0x80048104, 0xd2890001, + 0x00000903, 0x80048104, + 0xd2890002, 0x00000903, + 0x80048104, 0xd2890003, + 0x00000903, 0x80048104, + 0xc069003a, 0x00000070, + 0xbf8cc07f, 0x80709070, + 0xbf06c004, 0xbf84ffee, + 0x807c847c, 0xbf0a7b7c, + 0xbf85ffa9, 0xbf9c0000, + 0xbf820016, 0xd3d84000, + 0x18000100, 0xd3d84001, + 0x18000101, 0xd3d84002, + 0x18000102, 0xd3d84003, + 0x18000103, 0xe0724000, + 0x701d0000, 0xe0724100, + 0x701d0100, 0xe0724200, + 0x701d0200, 0xe0724300, + 0x701d0300, 0x807c847c, + 0x8070ff70, 0x00000400, + 0xbf0a7b7c, 0xbf85ffeb, + 0xbf9c0000, 0xbf8200ee, + 0xbef4007e, 0x8675ff7f, + 0x0000ffff, 0x8775ff75, + 0x00040000, 0xbef60080, + 0xbef700ff, 0x00807fac, + 0x866eff7f, 0x04000000, + 0xbf84001f, 0xbefe00c1, + 0xbeff00c1, 0xb8ef4306, + 0x866fc16f, 0xbf84001a, + 0x8e6f866f, 0x8e6f826f, + 0xbef6006f, 0xb8f82985, + 0x80788178, 0x8e788a78, + 0x8e788178, 0xb8ee1605, + 0x806e816e, 0x8e6e866e, + 0x80786e78, 0x8078ff78, + 0x00000080, 0xbef600ff, + 0x01000000, 0xbefc0080, + 0xe0510000, 0x781d0000, + 0xe0510100, 0x781d0000, + 0x807cff7c, 0x00000200, + 0x8078ff78, 0x00000200, + 0xbf0a6f7c, 0xbf85fff6, 0xbefe00c1, 0xbeff00c1, - 0xb8ef4306, 0x866fc16f, - 0xbf84001a, 0x8e6f866f, - 0x8e6f826f, 0xbef6006f, - 0xb8f82985, 0x80788178, - 0x8e788a78, 0x8e788178, - 0xb8ee1605, 0x806e816e, - 0x8e6e866e, 0x80786e78, - 0x8078ff78, 0x00000080, 0xbef600ff, 0x01000000, - 0xbefc0080, 0xe0510000, - 0x781d0000, 0xe0510100, - 0x781d0000, 0x807cff7c, - 0x00000200, 0x8078ff78, - 0x00000200, 0xbf0a6f7c, - 0xbf85fff6, 0xbefe00c1, - 0xbeff00c1, 0xbef600ff, - 0x01000000, 0xb8ef2b05, - 0x806f816f, 0x8e6f826f, - 0x806fff6f, 0x00008000, - 0xbef80080, 0xbeee0078, - 0x8078ff78, 0x00000400, - 0xbefc0084, 0xbf11087c, - 0xe0524000, 0x781d0000, - 0xe0524100, 0x781d0100, - 0xe0524200, 0x781d0200, - 0xe0524300, 0x781d0300, - 0xbf8c0f70, 0x7e000300, - 0x7e020301, 0x7e040302, - 0x7e060303, 0x807c847c, - 0x8078ff78, 0x00000400, - 0xbf0a6f7c, 0xbf85ffee, - 0xb8ef2985, 0x806f816f, - 0x8e6f836f, 0xb8f92b05, - 0x80798179, 0x8e798279, - 0x80ef796f, 0x866f6f6f, - 0xbf84001a, 0x806fff6f, - 0x00008000, 0xbefc0080, + 0xb8ef2b05, 0x806f816f, + 0x8e6f826f, 0x806fff6f, + 0x00008000, 0xbef80080, + 0xbeee0078, 0x8078ff78, + 0x00000400, 0xbefc0084, 0xbf11087c, 0xe0524000, 0x781d0000, 0xe0524100, 0x781d0100, 0xe0524200, 0x781d0200, 0xe0524300, 0x781d0300, 0xbf8c0f70, - 0xd3d94000, 0x18000100, - 0xd3d94001, 0x18000101, - 0xd3d94002, 0x18000102, - 0xd3d94003, 0x18000103, + 0x7e000300, 0x7e020301, + 0x7e040302, 0x7e060303, 0x807c847c, 0x8078ff78, 0x00000400, 0xbf0a6f7c, - 0xbf85ffea, 0xbf9c0000, - 0xe0524000, 0x6e1d0000, - 0xe0524100, 0x6e1d0100, - 0xe0524200, 0x6e1d0200, - 0xe0524300, 0x6e1d0300, - 0xbf8c0f70, 0xb8f82985, - 0x80788178, 0x8e788a78, - 0x8e788178, 0xb8ee1605, - 0x806e816e, 0x8e6e866e, - 0x80786e78, 0x80f8c078, - 0xb8ef1605, 0x806f816f, - 0x8e6f846f, 0x8e76826f, - 0xbef600ff, 0x01000000, - 0xbefc006f, 0xc031003a, - 0x00000078, 0x80f8c078, - 0xbf8cc07f, 0x80fc907c, - 0xbf800000, 0xbe802d00, - 0xbe822d02, 0xbe842d04, - 0xbe862d06, 0xbe882d08, - 0xbe8a2d0a, 0xbe8c2d0c, - 0xbe8e2d0e, 0xbf06807c, - 0xbf84fff0, 0xb8f82985, - 0x80788178, 0x8e788a78, - 0x8e788178, 0xb8ee1605, - 0x806e816e, 0x8e6e866e, - 0x80786e78, 0xbef60084, - 0xbef600ff, 0x01000000, - 0xc0211bfa, 0x00000078, - 0x80788478, 0xc0211b3a, + 0xbf85ffee, 0xb8ef2985, + 0x806f816f, 0x8e6f836f, + 0xb8f92b05, 0x80798179, + 0x8e798279, 0x80ef796f, + 0x866f6f6f, 0xbf84001a, + 0x806fff6f, 0x00008000, + 0xbefc0080, 0xbf11087c, + 0xe0524000, 0x781d0000, + 0xe0524100, 0x781d0100, + 0xe0524200, 0x781d0200, + 0xe0524300, 0x781d0300, + 0xbf8c0f70, 0xd3d94000, + 0x18000100, 0xd3d94001, + 0x18000101, 0xd3d94002, + 0x18000102, 0xd3d94003, + 0x18000103, 0x807c847c, + 0x8078ff78, 0x00000400, + 0xbf0a6f7c, 0xbf85ffea, + 0xbf9c0000, 0xe0524000, + 0x6e1d0000, 0xe0524100, + 0x6e1d0100, 0xe0524200, + 0x6e1d0200, 0xe0524300, + 0x6e1d0300, 0xbf8c0f70, + 0xb8f82985, 0x80788178, + 0x8e788a78, 0x8e788178, + 0xb8ee1605, 0x806e816e, + 0x8e6e866e, 0x80786e78, + 0x80f8c078, 0xb8ef1605, + 0x806f816f, 0x8e6f846f, + 0x8e76826f, 0xbef600ff, + 0x01000000, 0xbefc006f, + 0xc031003a, 0x00000078, + 0x80f8c078, 0xbf8cc07f, + 0x80fc907c, 0xbf800000, + 0xbe802d00, 0xbe822d02, + 0xbe842d04, 0xbe862d06, + 0xbe882d08, 0xbe8a2d0a, + 0xbe8c2d0c, 0xbe8e2d0e, + 0xbf06807c, 0xbf84fff0, + 0xb8f82985, 0x80788178, + 0x8e788a78, 0x8e788178, + 0xb8ee1605, 0x806e816e, + 0x8e6e866e, 0x80786e78, + 0xbef60084, 0xbef600ff, + 0x01000000, 0xc0211bfa, 0x00000078, 0x80788478, - 0xc0211b7a, 0x00000078, - 0x80788478, 0xc0211c3a, + 0xc0211b3a, 0x00000078, + 0x80788478, 0xc0211b7a, 0x00000078, 0x80788478, - 0xc0211c7a, 0x00000078, - 0x80788478, 0xc0211eba, + 0xc0211c3a, 0x00000078, + 0x80788478, 0xc0211c7a, 0x00000078, 0x80788478, - 0xc0211efa, 0x00000078, - 0x80788478, 0xc0211a3a, + 0xc0211eba, 0x00000078, + 0x80788478, 0xc0211efa, 0x00000078, 0x80788478, - 0xc0211a7a, 0x00000078, - 0x80788478, 0xc0211cfa, + 0xc0211a3a, 0x00000078, + 0x80788478, 0xc0211a7a, 0x00000078, 0x80788478, - 0xbf8cc07f, 0xbefc006f, - 0xbefe0070, 0xbeff0071, - 0x866f7bff, 0x000003ff, - 0xb96f4803, 0x866f7bff, - 0xfffff800, 0x8f6f8b6f, - 0xb96fa2c3, 0xb973f801, - 0xb8ee2985, 0x806e816e, - 0x8e6e8a6e, 0x8e6e816e, - 0xb8ef1605, 0x806f816f, - 0x8e6f866f, 0x806e6f6e, - 0x806e746e, 0x826f8075, - 0x866fff6f, 0x0000ffff, - 0xc00b1c37, 0x00000050, - 0xc00b1d37, 0x00000060, - 0xc0031e77, 0x00000074, - 0xbf8cc07f, 0x8f6e8b77, - 0x866eff6e, 0x001f8000, - 0xb96ef807, 0x866dff6d, - 0x0000ffff, 0x86fe7e7e, - 0x86ea6a6a, 0x8f6e837a, - 0xb96ee0c2, 0xbf800002, - 0xb97a0002, 0xbf8a0000, - 0xbe801f6c, 0xbf9b0000, + 0xc0211cfa, 0x00000078, + 0x80788478, 0xbf8cc07f, + 0xbefc006f, 0xbefe0070, + 0xbeff0071, 0x866f7bff, + 0x000003ff, 0xb96f4803, + 0x866f7bff, 0xfffff800, + 0x8f6f8b6f, 0xb96fa2c3, + 0xb973f801, 0xb8ee2985, + 0x806e816e, 0x8e6e8a6e, + 0x8e6e816e, 0xb8ef1605, + 0x806f816f, 0x8e6f866f, + 0x806e6f6e, 0x806e746e, + 0x826f8075, 0x866fff6f, + 0x0000ffff, 0xc00b1c37, + 0x00000050, 0xc00b1d37, + 0x00000060, 0xc0031e77, + 0x00000074, 0xbf8cc07f, + 0x8f6e8b77, 0x866eff6e, + 0x001f8000, 0xb96ef807, + 0x866dff6d, 0x0000ffff, + 0x86fe7e7e, 0x86ea6a6a, + 0x8f6e837a, 0xb96ee0c2, + 0xbf800002, 0xb97a0002, + 0xbf8a0000, 0xbe801f6c, + 0xbf9b0000, 0x00000000, }; static const uint32_t cwsr_trap_gfx10_hex[] = { - 0xbf820001, 0xbf820221, + 0xbf820001, 0xbf820220, 0xb0804004, 0xb978f802, 0x8a78ff78, 0x00020006, 0xb97bf803, 0x876eff78, @@ -2095,19 +2302,19 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0xbf0d8f7b, 0xbf840002, 0x887bff7b, 0xffff0000, 0xf4011bbd, 0xfa000010, - 0xbf8cc07f, 0x8f6e976e, + 0xbf8c0000, 0x8f6e976e, 0x8a77ff77, 0x00800000, 0x88776e77, 0xf4051bbd, - 0xfa000000, 0xbf8cc07f, + 0xfa000000, 0xbf8c0000, 0xf4051ebd, 0xfa000008, - 0xbf8cc07f, 0x87ee6e6e, + 0xbf8c0000, 0x87ee6e6e, 0xbf840001, 0xbe80206e, - 0x876eff6d, 0x01ff0000, - 0xbf850005, 0x8878ff78, - 0x00002000, 0x80ec886c, - 0x82ed806d, 0xbf820005, - 0x876eff6d, 0x01000000, - 0xbf850002, 0x806c846c, + 0x876eff6d, 0x00ff0000, + 0xbf850008, 0x876eff6d, + 0x01000000, 0xbf850007, + 0x8878ff78, 0x00002000, + 0x80ec886c, 0x82ed806d, + 0xbf820002, 0x806c846c, 0x826d806d, 0x876dff6d, 0x0000ffff, 0x87fe7e7e, 0x87ea6a6a, 0xb9f8f802, @@ -2115,7 +2322,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0x0000ffff, 0xbefa0380, 0xb9fa0283, 0xbeee037e, 0xbeef037f, 0xbefe0480, - 0xbf900004, 0xbf8cc07f, + 0xbf900004, 0xbf8c0000, 0x877aff7f, 0x04000000, 0x8f7a857a, 0x886d7a6d, 0x7e008200, 0xbefa037e, @@ -2268,94 +2475,93 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0xbf850002, 0xbeff0380, 0xbf820001, 0xbeff03c1, 0xb97b4306, 0x877bc17b, - 0xbf840044, 0xbf8a0000, + 0xbf840043, 0xbf8a0000, 0x877aff6d, 0x80000000, - 0xbf840040, 0x8f7b867b, - 0x8f7b827b, 0xbef6037b, - 0xb9703a05, 0x80708170, - 0xbf0d9973, 0xbf850002, - 0x8f708970, 0xbf820001, - 0x8f708a70, 0xb97a1e06, - 0x8f7a8a7a, 0x80707a70, - 0x8070ff70, 0x00000200, - 0x8070ff70, 0x00000080, - 0xbef603ff, 0x01000000, - 0xd7650000, 0x000100c1, - 0xd7660000, 0x000200c1, - 0x16000084, 0x907c9973, - 0x877c817c, 0xbf06817c, - 0xbefc0380, 0xbf850012, - 0xbe8303ff, 0x00000080, + 0xbf84003f, 0x8f7b887b, + 0xbef6037b, 0xb9703a05, + 0x80708170, 0xbf0d9973, + 0xbf850002, 0x8f708970, + 0xbf820001, 0x8f708a70, + 0xb97a1e06, 0x8f7a8a7a, + 0x80707a70, 0x8070ff70, + 0x00000200, 0x8070ff70, + 0x00000080, 0xbef603ff, + 0x01000000, 0xd7650000, + 0x000100c1, 0xd7660000, + 0x000200c1, 0x16000084, + 0x907c9973, 0x877c817c, + 0xbf06817c, 0xbefc0380, + 0xbf850012, 0xbe8303ff, + 0x00000080, 0xbf800000, 0xbf800000, 0xbf800000, - 0xbf800000, 0xd8d80000, - 0x01000000, 0xbf8c0000, - 0xe0704000, 0x705d0100, - 0x807c037c, 0x80700370, - 0xd5250000, 0x0001ff00, - 0x00000080, 0xbf0a7b7c, - 0xbf85fff4, 0xbf820011, - 0xbe8303ff, 0x00000100, + 0xd8d80000, 0x01000000, + 0xbf8c0000, 0xe0704000, + 0x705d0100, 0x807c037c, + 0x80700370, 0xd5250000, + 0x0001ff00, 0x00000080, + 0xbf0a7b7c, 0xbf85fff4, + 0xbf820011, 0xbe8303ff, + 0x00000100, 0xbf800000, 0xbf800000, 0xbf800000, - 0xbf800000, 0xd8d80000, - 0x01000000, 0xbf8c0000, - 0xe0704000, 0x705d0100, - 0x807c037c, 0x80700370, - 0xd5250000, 0x0001ff00, - 0x00000100, 0xbf0a7b7c, - 0xbf85fff4, 0xbefe03c1, - 0x907c9973, 0x877c817c, - 0xbf06817c, 0xbf850004, - 0xbef003ff, 0x00000200, - 0xbeff0380, 0xbf820003, - 0xbef003ff, 0x00000400, - 0xbeff03c1, 0xb97b3a05, - 0x807b817b, 0x8f7b827b, - 0x907c9973, 0x877c817c, - 0xbf06817c, 0xbf850017, + 0xd8d80000, 0x01000000, + 0xbf8c0000, 0xe0704000, + 0x705d0100, 0x807c037c, + 0x80700370, 0xd5250000, + 0x0001ff00, 0x00000100, + 0xbf0a7b7c, 0xbf85fff4, + 0xbefe03c1, 0x907c9973, + 0x877c817c, 0xbf06817c, + 0xbf850004, 0xbef003ff, + 0x00000200, 0xbeff0380, + 0xbf820003, 0xbef003ff, + 0x00000400, 0xbeff03c1, + 0xb97b3a05, 0x807b817b, + 0x8f7b827b, 0x907c9973, + 0x877c817c, 0xbf06817c, + 0xbf850017, 0xbef603ff, + 0x01000000, 0xbefc0384, + 0xbf0a7b7c, 0xbf840037, + 0x7e008700, 0x7e028701, + 0x7e048702, 0x7e068703, + 0xe0704000, 0x705d0000, + 0xe0704080, 0x705d0100, + 0xe0704100, 0x705d0200, + 0xe0704180, 0x705d0300, + 0x807c847c, 0x8070ff70, + 0x00000200, 0xbf0a7b7c, + 0xbf85ffef, 0xbf820025, 0xbef603ff, 0x01000000, 0xbefc0384, 0xbf0a7b7c, - 0xbf840037, 0x7e008700, + 0xbf840011, 0x7e008700, 0x7e028701, 0x7e048702, 0x7e068703, 0xe0704000, - 0x705d0000, 0xe0704080, - 0x705d0100, 0xe0704100, - 0x705d0200, 0xe0704180, + 0x705d0000, 0xe0704100, + 0x705d0100, 0xe0704200, + 0x705d0200, 0xe0704300, 0x705d0300, 0x807c847c, - 0x8070ff70, 0x00000200, + 0x8070ff70, 0x00000400, 0xbf0a7b7c, 0xbf85ffef, - 0xbf820025, 0xbef603ff, - 0x01000000, 0xbefc0384, - 0xbf0a7b7c, 0xbf840011, - 0x7e008700, 0x7e028701, - 0x7e048702, 0x7e068703, + 0xb97b1e06, 0x877bc17b, + 0xbf84000c, 0x8f7b837b, + 0x807b7c7b, 0xbefe03c1, + 0xbeff0380, 0x7e008700, 0xe0704000, 0x705d0000, - 0xe0704100, 0x705d0100, - 0xe0704200, 0x705d0200, - 0xe0704300, 0x705d0300, - 0x807c847c, 0x8070ff70, - 0x00000400, 0xbf0a7b7c, - 0xbf85ffef, 0xb97b1e06, - 0x877bc17b, 0xbf84000c, - 0x8f7b837b, 0x807b7c7b, - 0xbefe03c1, 0xbeff0380, - 0x7e008700, 0xe0704000, - 0x705d0000, 0x807c817c, - 0x8070ff70, 0x00000080, - 0xbf0a7b7c, 0xbf85fff8, - 0xbf82013b, 0xbef4037e, - 0x8775ff7f, 0x0000ffff, - 0x8875ff75, 0x00040000, - 0xbef60380, 0xbef703ff, - 0x10807fac, 0xb97202dc, - 0x8f729972, 0x876eff7f, - 0x04000000, 0xbf840034, - 0xbefe03c1, 0x907c9972, - 0x877c817c, 0xbf06817c, - 0xbf850002, 0xbeff0380, - 0xbf820001, 0xbeff03c1, - 0xb96f4306, 0x876fc16f, - 0xbf840029, 0x8f6f866f, - 0x8f6f826f, 0xbef6036f, + 0x807c817c, 0x8070ff70, + 0x00000080, 0xbf0a7b7c, + 0xbf85fff8, 0xbf820136, + 0xbef4037e, 0x8775ff7f, + 0x0000ffff, 0x8875ff75, + 0x00040000, 0xbef60380, + 0xbef703ff, 0x10807fac, + 0xb97202dc, 0x8f729972, + 0x876eff7f, 0x04000000, + 0xbf840033, 0xbefe03c1, + 0x907c9972, 0x877c817c, + 0xbf06817c, 0xbf850002, + 0xbeff0380, 0xbf820001, + 0xbeff03c1, 0xb96f4306, + 0x876fc16f, 0xbf840028, + 0x8f6f886f, 0xbef6036f, 0xb9783a05, 0x80788178, 0xbf0d9972, 0xbf850002, 0x8f788978, 0xbf820001, @@ -2391,7 +2597,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0xe0304080, 0x785d0100, 0xe0304100, 0x785d0200, 0xe0304180, 0x785d0300, - 0xbf8c3f70, 0x7e008500, + 0xbf8c0000, 0x7e008500, 0x7e028501, 0x7e048502, 0x7e068503, 0x807c847c, 0x8078ff78, 0x00000200, @@ -2400,7 +2606,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0xe0304080, 0x6e5d0100, 0xe0304100, 0x6e5d0200, 0xe0304180, 0x6e5d0300, - 0xbf8c3f70, 0xbf820034, + 0xbf8c0000, 0xbf820034, 0xbef603ff, 0x01000000, 0xbeee0378, 0x8078ff78, 0x00000400, 0xbefc0384, @@ -2409,7 +2615,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0xe0304100, 0x785d0100, 0xe0304200, 0x785d0200, 0xe0304300, 0x785d0300, - 0xbf8c3f70, 0x7e008500, + 0xbf8c0000, 0x7e008500, 0x7e028501, 0x7e048502, 0x7e068503, 0x807c847c, 0x8078ff78, 0x00000400, @@ -2418,7 +2624,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0xbf84000e, 0x8f6f836f, 0x806f7c6f, 0xbefe03c1, 0xbeff0380, 0xe0304000, - 0x785d0000, 0xbf8c3f70, + 0x785d0000, 0xbf8c0000, 0x7e008500, 0x807c817c, 0x8078ff78, 0x00000080, 0xbf0a6f7c, 0xbf85fff7, @@ -2426,7 +2632,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0x6e5d0000, 0xe0304100, 0x6e5d0100, 0xe0304200, 0x6e5d0200, 0xe0304300, - 0x6e5d0300, 0xbf8c3f70, + 0x6e5d0300, 0xbf8c0000, 0xb9783a05, 0x80788178, 0xbf0d9972, 0xbf850002, 0x8f788978, 0xbf820001, @@ -2437,16 +2643,16 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0xbef603ff, 0x01000000, 0xbefc03ff, 0x0000006c, 0x80f89078, 0xf429003a, - 0xf0000000, 0xbf8cc07f, + 0xf0000000, 0xbf8c0000, 0x80fc847c, 0xbf800000, 0xbe803100, 0xbe823102, 0x80f8a078, 0xf42d003a, - 0xf0000000, 0xbf8cc07f, + 0xf0000000, 0xbf8c0000, 0x80fc887c, 0xbf800000, 0xbe803100, 0xbe823102, 0xbe843104, 0xbe863106, 0x80f8c078, 0xf431003a, - 0xf0000000, 0xbf8cc07f, + 0xf0000000, 0xbf8c0000, 0x80fc907c, 0xbf800000, 0xbe803100, 0xbe823102, 0xbe843104, 0xbe863106, @@ -2476,15 +2682,13 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0x80788478, 0xf4211cfa, 0xf0000000, 0x80788478, 0xf4211bba, 0xf0000000, - 0x80788478, 0xbf8cc07f, + 0x80788478, 0xbf8c0000, 0xb9eef814, 0xf4211bba, 0xf0000000, 0x80788478, - 0xbf8cc07f, 0xb9eef815, + 0xbf8c0000, 0xb9eef815, 0xbefc036f, 0xbefe0370, - 0xbeff0371, 0x876f7bff, - 0x000003ff, 0xb9ef4803, - 0x876f7bff, 0xfffff800, - 0x906f8b6f, 0xb9efa2c3, + 0xbeff0371, 0xb9fb4803, + 0x907b8b7b, 0xb9fba2c3, 0xb9f3f801, 0xb96e3a05, 0x806e816e, 0xbf0d9972, 0xbf850002, 0x8f6e896e, @@ -2496,7 +2700,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0x0000ffff, 0xf4091c37, 0xfa000050, 0xf4091d37, 0xfa000060, 0xf4011e77, - 0xfa000074, 0xbf8cc07f, + 0xfa000074, 0xbf8c0000, 0x876dff6d, 0x0000ffff, 0x87fe7e7e, 0x87ea6a6a, 0xb9faf802, 0xbe80226c, @@ -2506,7 +2710,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { }; static const uint32_t cwsr_trap_gfx11_hex[] = { - 0xbfa00001, 0xbfa00225, + 0xbfa00001, 0xbfa00227, 0xb0804006, 0xb8f8f802, 0x9178ff78, 0x00020006, 0xb8fbf803, 0xbf0d9e6d, @@ -2518,7 +2722,7 @@ static const uint32_t cwsr_trap_gfx11_hex[] = { 0x8b6eff7b, 0x00000400, 0xbfa20045, 0xbf830010, 0xb8fbf803, 0xbfa0fffa, - 0x8b6eff7b, 0x00000900, + 0x8b6eff7b, 0x00160900, 0xbfa20015, 0x8b6eff7b, 0x000071ff, 0xbfa10008, 0x8b6fff7b, 0x00007080, @@ -2530,187 +2734,188 @@ static const uint32_t cwsr_trap_gfx11_hex[] = { 0x8b6eff6e, 0x00000800, 0xbfa20003, 0x8b6eff7b, 0x00000400, 0xbfa2002a, - 0xbefa4d82, 0xbf89fc07, + 0xbefa4d82, 0xbf890000, 0x84fa887a, 0xbf0d8f7b, 0xbfa10002, 0x8c7bff7b, 0xffff0000, 0xf4005bbd, - 0xf8000010, 0xbf89fc07, + 0xf8000010, 0xbf890000, 0x846e976e, 0x9177ff77, 0x00800000, 0x8c776e77, 0xf4045bbd, 0xf8000000, - 0xbf89fc07, 0xf4045ebd, - 0xf8000008, 0xbf89fc07, + 0xbf890000, 0xf4045ebd, + 0xf8000008, 0xbf890000, 0x8bee6e6e, 0xbfa10001, 0xbe80486e, 0x8b6eff6d, - 0x01ff0000, 0xbfa20005, - 0x8c78ff78, 0x00002000, - 0x80ec886c, 0x82ed806d, - 0xbfa00005, 0x8b6eff6d, - 0x01000000, 0xbfa20002, + 0x00ff0000, 0xbfa20008, + 0x8b6eff6d, 0x01000000, + 0xbfa20007, 0x8c78ff78, + 0x00002000, 0x80ec886c, + 0x82ed806d, 0xbfa00002, 0x806c846c, 0x826d806d, 0x8b6dff6d, 0x0000ffff, 0x8bfe7e7e, 0x8bea6a6a, 0xb978f802, 0xbe804a6c, - 0x8b6dff6d, 0x0000ffff, - 0xbefa0080, 0xb97a0283, - 0xbeee007e, 0xbeef007f, - 0xbefe0180, 0xbefe4d84, - 0xbf89fc07, 0x8b7aff7f, - 0x04000000, 0x847a857a, - 0x8c6d7a6d, 0xbefa007e, - 0x8b7bff7f, 0x0000ffff, - 0xbefe00c1, 0xbeff00c1, - 0xdca6c000, 0x007a0000, - 0x7e000280, 0xbefe007a, - 0xbeff007b, 0xb8fb02dc, - 0x847b997b, 0xb8fa3b05, - 0x807a817a, 0xbf0d997b, - 0xbfa20002, 0x847a897a, - 0xbfa00001, 0x847a8a7a, - 0xb8fb1e06, 0x847b8a7b, - 0x807a7b7a, 0x8b7bff7f, - 0x0000ffff, 0x807aff7a, - 0x00000200, 0x807a7e7a, - 0x827b807b, 0xd7610000, - 0x00010870, 0xd7610000, - 0x00010a71, 0xd7610000, - 0x00010c72, 0xd7610000, - 0x00010e73, 0xd7610000, - 0x00011074, 0xd7610000, - 0x00011275, 0xd7610000, - 0x00011476, 0xd7610000, - 0x00011677, 0xd7610000, - 0x00011a79, 0xd7610000, - 0x00011c7e, 0xd7610000, - 0x00011e7f, 0xbefe00ff, - 0x00003fff, 0xbeff0080, - 0xdca6c040, 0x007a0000, - 0xd760007a, 0x00011d00, - 0xd760007b, 0x00011f00, + 0xbf0d9878, 0xbfa10001, + 0xbfb00000, 0x8b6dff6d, + 0x0000ffff, 0xbefa0080, + 0xb97a0283, 0xbeee007e, + 0xbeef007f, 0xbefe0180, + 0xbefe4d84, 0xbf890000, + 0x8b7aff7f, 0x04000000, + 0x847a857a, 0x8c6d7a6d, + 0xbefa007e, 0x8b7bff7f, + 0x0000ffff, 0xbefe00c1, + 0xbeff00c1, 0xdca6c000, + 0x007a0000, 0x7e000280, 0xbefe007a, 0xbeff007b, - 0xbef4007e, 0x8b75ff7f, - 0x0000ffff, 0x8c75ff75, - 0x00040000, 0xbef60080, - 0xbef700ff, 0x10807fac, - 0xbef1007d, 0xbef00080, - 0xb8f302dc, 0x84739973, - 0xbefe00c1, 0x857d9973, - 0x8b7d817d, 0xbf06817d, - 0xbfa20002, 0xbeff0080, - 0xbfa00002, 0xbeff00c1, - 0xbfa00009, 0xbef600ff, - 0x01000000, 0xe0685080, - 0x701d0100, 0xe0685100, - 0x701d0200, 0xe0685180, - 0x701d0300, 0xbfa00008, + 0xb8fb02dc, 0x847b997b, + 0xb8fa3b05, 0x807a817a, + 0xbf0d997b, 0xbfa20002, + 0x847a897a, 0xbfa00001, + 0x847a8a7a, 0xb8fb1e06, + 0x847b8a7b, 0x807a7b7a, + 0x8b7bff7f, 0x0000ffff, + 0x807aff7a, 0x00000200, + 0x807a7e7a, 0x827b807b, + 0xd7610000, 0x00010870, + 0xd7610000, 0x00010a71, + 0xd7610000, 0x00010c72, + 0xd7610000, 0x00010e73, + 0xd7610000, 0x00011074, + 0xd7610000, 0x00011275, + 0xd7610000, 0x00011476, + 0xd7610000, 0x00011677, + 0xd7610000, 0x00011a79, + 0xd7610000, 0x00011c7e, + 0xd7610000, 0x00011e7f, + 0xbefe00ff, 0x00003fff, + 0xbeff0080, 0xdca6c040, + 0x007a0000, 0xd760007a, + 0x00011d00, 0xd760007b, + 0x00011f00, 0xbefe007a, + 0xbeff007b, 0xbef4007e, + 0x8b75ff7f, 0x0000ffff, + 0x8c75ff75, 0x00040000, + 0xbef60080, 0xbef700ff, + 0x10807fac, 0xbef1007d, + 0xbef00080, 0xb8f302dc, + 0x84739973, 0xbefe00c1, + 0x857d9973, 0x8b7d817d, + 0xbf06817d, 0xbfa20002, + 0xbeff0080, 0xbfa00002, + 0xbeff00c1, 0xbfa00009, 0xbef600ff, 0x01000000, - 0xe0685100, 0x701d0100, - 0xe0685200, 0x701d0200, - 0xe0685300, 0x701d0300, + 0xe0685080, 0x701d0100, + 0xe0685100, 0x701d0200, + 0xe0685180, 0x701d0300, + 0xbfa00008, 0xbef600ff, + 0x01000000, 0xe0685100, + 0x701d0100, 0xe0685200, + 0x701d0200, 0xe0685300, + 0x701d0300, 0xb8f03b05, + 0x80708170, 0xbf0d9973, + 0xbfa20002, 0x84708970, + 0xbfa00001, 0x84708a70, + 0xb8fa1e06, 0x847a8a7a, + 0x80707a70, 0x8070ff70, + 0x00000200, 0xbef600ff, + 0x01000000, 0x7e000280, + 0x7e020280, 0x7e040280, + 0xbefd0080, 0xd7610002, + 0x0000fa71, 0x807d817d, + 0xd7610002, 0x0000fa6c, + 0x807d817d, 0x917aff6d, + 0x80000000, 0xd7610002, + 0x0000fa7a, 0x807d817d, + 0xd7610002, 0x0000fa6e, + 0x807d817d, 0xd7610002, + 0x0000fa6f, 0x807d817d, + 0xd7610002, 0x0000fa78, + 0x807d817d, 0xb8faf803, + 0xd7610002, 0x0000fa7a, + 0x807d817d, 0xd7610002, + 0x0000fa7b, 0x807d817d, + 0xb8f1f801, 0xd7610002, + 0x0000fa71, 0x807d817d, + 0xb8f1f814, 0xd7610002, + 0x0000fa71, 0x807d817d, + 0xb8f1f815, 0xd7610002, + 0x0000fa71, 0x807d817d, + 0xbefe00ff, 0x0000ffff, + 0xbeff0080, 0xe0685000, + 0x701d0200, 0xbefe00c1, 0xb8f03b05, 0x80708170, 0xbf0d9973, 0xbfa20002, 0x84708970, 0xbfa00001, 0x84708a70, 0xb8fa1e06, 0x847a8a7a, 0x80707a70, - 0x8070ff70, 0x00000200, 0xbef600ff, 0x01000000, - 0x7e000280, 0x7e020280, - 0x7e040280, 0xbefd0080, - 0xd7610002, 0x0000fa71, - 0x807d817d, 0xd7610002, - 0x0000fa6c, 0x807d817d, - 0x917aff6d, 0x80000000, - 0xd7610002, 0x0000fa7a, - 0x807d817d, 0xd7610002, - 0x0000fa6e, 0x807d817d, - 0xd7610002, 0x0000fa6f, - 0x807d817d, 0xd7610002, - 0x0000fa78, 0x807d817d, - 0xb8faf803, 0xd7610002, - 0x0000fa7a, 0x807d817d, - 0xd7610002, 0x0000fa7b, - 0x807d817d, 0xb8f1f801, - 0xd7610002, 0x0000fa71, - 0x807d817d, 0xb8f1f814, - 0xd7610002, 0x0000fa71, - 0x807d817d, 0xb8f1f815, - 0xd7610002, 0x0000fa71, - 0x807d817d, 0xbefe00ff, - 0x0000ffff, 0xbeff0080, - 0xe0685000, 0x701d0200, - 0xbefe00c1, 0xb8f03b05, - 0x80708170, 0xbf0d9973, - 0xbfa20002, 0x84708970, - 0xbfa00001, 0x84708a70, - 0xb8fa1e06, 0x847a8a7a, - 0x80707a70, 0xbef600ff, - 0x01000000, 0xbef90080, - 0xbefd0080, 0xbf800000, - 0xbe804100, 0xbe824102, - 0xbe844104, 0xbe864106, - 0xbe884108, 0xbe8a410a, - 0xbe8c410c, 0xbe8e410e, - 0xd7610002, 0x0000f200, - 0x80798179, 0xd7610002, - 0x0000f201, 0x80798179, - 0xd7610002, 0x0000f202, + 0xbef90080, 0xbefd0080, + 0xbf800000, 0xbe804100, + 0xbe824102, 0xbe844104, + 0xbe864106, 0xbe884108, + 0xbe8a410a, 0xbe8c410c, + 0xbe8e410e, 0xd7610002, + 0x0000f200, 0x80798179, + 0xd7610002, 0x0000f201, 0x80798179, 0xd7610002, - 0x0000f203, 0x80798179, - 0xd7610002, 0x0000f204, - 0x80798179, 0xd7610002, - 0x0000f205, 0x80798179, - 0xd7610002, 0x0000f206, + 0x0000f202, 0x80798179, + 0xd7610002, 0x0000f203, 0x80798179, 0xd7610002, - 0x0000f207, 0x80798179, - 0xd7610002, 0x0000f208, + 0x0000f204, 0x80798179, + 0xd7610002, 0x0000f205, 0x80798179, 0xd7610002, - 0x0000f209, 0x80798179, - 0xd7610002, 0x0000f20a, + 0x0000f206, 0x80798179, + 0xd7610002, 0x0000f207, 0x80798179, 0xd7610002, - 0x0000f20b, 0x80798179, - 0xd7610002, 0x0000f20c, + 0x0000f208, 0x80798179, + 0xd7610002, 0x0000f209, 0x80798179, 0xd7610002, - 0x0000f20d, 0x80798179, - 0xd7610002, 0x0000f20e, + 0x0000f20a, 0x80798179, + 0xd7610002, 0x0000f20b, 0x80798179, 0xd7610002, - 0x0000f20f, 0x80798179, - 0xbf06a079, 0xbfa10006, - 0xe0685000, 0x701d0200, - 0x8070ff70, 0x00000080, - 0xbef90080, 0x7e040280, - 0x807d907d, 0xbf0aff7d, - 0x00000060, 0xbfa2ffbc, - 0xbe804100, 0xbe824102, - 0xbe844104, 0xbe864106, - 0xbe884108, 0xbe8a410a, - 0xd7610002, 0x0000f200, + 0x0000f20c, 0x80798179, + 0xd7610002, 0x0000f20d, 0x80798179, 0xd7610002, - 0x0000f201, 0x80798179, - 0xd7610002, 0x0000f202, + 0x0000f20e, 0x80798179, + 0xd7610002, 0x0000f20f, + 0x80798179, 0xbf06a079, + 0xbfa10006, 0xe0685000, + 0x701d0200, 0x8070ff70, + 0x00000080, 0xbef90080, + 0x7e040280, 0x807d907d, + 0xbf0aff7d, 0x00000060, + 0xbfa2ffbc, 0xbe804100, + 0xbe824102, 0xbe844104, + 0xbe864106, 0xbe884108, + 0xbe8a410a, 0xd7610002, + 0x0000f200, 0x80798179, + 0xd7610002, 0x0000f201, 0x80798179, 0xd7610002, - 0x0000f203, 0x80798179, - 0xd7610002, 0x0000f204, + 0x0000f202, 0x80798179, + 0xd7610002, 0x0000f203, 0x80798179, 0xd7610002, - 0x0000f205, 0x80798179, - 0xd7610002, 0x0000f206, + 0x0000f204, 0x80798179, + 0xd7610002, 0x0000f205, 0x80798179, 0xd7610002, - 0x0000f207, 0x80798179, - 0xd7610002, 0x0000f208, + 0x0000f206, 0x80798179, + 0xd7610002, 0x0000f207, 0x80798179, 0xd7610002, - 0x0000f209, 0x80798179, - 0xd7610002, 0x0000f20a, + 0x0000f208, 0x80798179, + 0xd7610002, 0x0000f209, 0x80798179, 0xd7610002, - 0x0000f20b, 0x80798179, - 0xe0685000, 0x701d0200, - 0xbefe00c1, 0x857d9973, - 0x8b7d817d, 0xbf06817d, - 0xbfa20002, 0xbeff0080, - 0xbfa00001, 0xbeff00c1, - 0xb8fb4306, 0x8b7bc17b, - 0xbfa10044, 0xbfbd0000, - 0x8b7aff6d, 0x80000000, - 0xbfa10040, 0x847b867b, - 0x847b827b, 0xbef6007b, + 0x0000f20a, 0x80798179, + 0xd7610002, 0x0000f20b, + 0x80798179, 0xe0685000, + 0x701d0200, 0xbefe00c1, + 0x857d9973, 0x8b7d817d, + 0xbf06817d, 0xbfa20002, + 0xbeff0080, 0xbfa00001, + 0xbeff00c1, 0xb8fb4306, + 0x8b7bc17b, 0xbfa10043, + 0xbfbd0000, 0x8b7aff6d, + 0x80000000, 0xbfa1003f, + 0x847b887b, 0xbef6007b, 0xb8f03b05, 0x80708170, 0xbf0d9973, 0xbfa20002, 0x84708970, 0xbfa00001, @@ -2781,177 +2986,175 @@ static const uint32_t cwsr_trap_gfx11_hex[] = { 0x701d0000, 0x807d817d, 0x8070ff70, 0x00000080, 0xbf0a7b7d, 0xbfa2fff8, - 0xbfa00146, 0xbef4007e, + 0xbfa00143, 0xbef4007e, 0x8b75ff7f, 0x0000ffff, 0x8c75ff75, 0x00040000, 0xbef60080, 0xbef700ff, 0x10807fac, 0xb8f202dc, 0x84729972, 0x8b6eff7f, - 0x04000000, 0xbfa1003a, + 0x04000000, 0xbfa10039, 0xbefe00c1, 0x857d9972, 0x8b7d817d, 0xbf06817d, 0xbfa20002, 0xbeff0080, 0xbfa00001, 0xbeff00c1, 0xb8ef4306, 0x8b6fc16f, - 0xbfa1002f, 0x846f866f, - 0x846f826f, 0xbef6006f, - 0xb8f83b05, 0x80788178, - 0xbf0d9972, 0xbfa20002, - 0x84788978, 0xbfa00001, - 0x84788a78, 0xb8ee1e06, - 0x846e8a6e, 0x80786e78, + 0xbfa1002e, 0x846f886f, + 0xbef6006f, 0xb8f83b05, + 0x80788178, 0xbf0d9972, + 0xbfa20002, 0x84788978, + 0xbfa00001, 0x84788a78, + 0xb8ee1e06, 0x846e8a6e, + 0x80786e78, 0x8078ff78, + 0x00000200, 0x8078ff78, + 0x00000080, 0xbef600ff, + 0x01000000, 0x857d9972, + 0x8b7d817d, 0xbf06817d, + 0xbefd0080, 0xbfa2000c, + 0xe0500000, 0x781d0000, + 0xbf890000, 0xdac00000, + 0x00000000, 0x807dff7d, + 0x00000080, 0x8078ff78, + 0x00000080, 0xbf0a6f7d, + 0xbfa2fff5, 0xbfa0000b, + 0xe0500000, 0x781d0000, + 0xbf890000, 0xdac00000, + 0x00000000, 0x807dff7d, + 0x00000100, 0x8078ff78, + 0x00000100, 0xbf0a6f7d, + 0xbfa2fff5, 0xbef80080, + 0xbefe00c1, 0x857d9972, + 0x8b7d817d, 0xbf06817d, + 0xbfa20002, 0xbeff0080, + 0xbfa00001, 0xbeff00c1, + 0xb8ef3b05, 0x806f816f, + 0x846f826f, 0x857d9972, + 0x8b7d817d, 0xbf06817d, + 0xbfa20024, 0xbef600ff, + 0x01000000, 0xbeee0078, 0x8078ff78, 0x00000200, - 0x8078ff78, 0x00000080, - 0xbef600ff, 0x01000000, - 0x857d9972, 0x8b7d817d, - 0xbf06817d, 0xbefd0080, - 0xbfa2000c, 0xe0500000, - 0x781d0000, 0xbf8903f7, - 0xdac00000, 0x00000000, - 0x807dff7d, 0x00000080, - 0x8078ff78, 0x00000080, - 0xbf0a6f7d, 0xbfa2fff5, - 0xbfa0000b, 0xe0500000, - 0x781d0000, 0xbf8903f7, - 0xdac00000, 0x00000000, - 0x807dff7d, 0x00000100, - 0x8078ff78, 0x00000100, - 0xbf0a6f7d, 0xbfa2fff5, - 0xbef80080, 0xbefe00c1, - 0x857d9972, 0x8b7d817d, - 0xbf06817d, 0xbfa20002, - 0xbeff0080, 0xbfa00001, - 0xbeff00c1, 0xb8ef3b05, - 0x806f816f, 0x846f826f, - 0x857d9972, 0x8b7d817d, - 0xbf06817d, 0xbfa20024, - 0xbef600ff, 0x01000000, - 0xbeee0078, 0x8078ff78, - 0x00000200, 0xbefd0084, - 0xbf0a6f7d, 0xbfa10050, + 0xbefd0084, 0xbf0a6f7d, + 0xbfa10050, 0xe0505000, + 0x781d0000, 0xe0505080, + 0x781d0100, 0xe0505100, + 0x781d0200, 0xe0505180, + 0x781d0300, 0xbf890000, + 0x7e008500, 0x7e028501, + 0x7e048502, 0x7e068503, + 0x807d847d, 0x8078ff78, + 0x00000200, 0xbf0a6f7d, + 0xbfa2ffee, 0xe0505000, + 0x6e1d0000, 0xe0505080, + 0x6e1d0100, 0xe0505100, + 0x6e1d0200, 0xe0505180, + 0x6e1d0300, 0xbf890000, + 0xbfa00034, 0xbef600ff, + 0x01000000, 0xbeee0078, + 0x8078ff78, 0x00000400, + 0xbefd0084, 0xbf0a6f7d, + 0xbfa10012, 0xe0505000, + 0x781d0000, 0xe0505100, + 0x781d0100, 0xe0505200, + 0x781d0200, 0xe0505300, + 0x781d0300, 0xbf890000, + 0x7e008500, 0x7e028501, + 0x7e048502, 0x7e068503, + 0x807d847d, 0x8078ff78, + 0x00000400, 0xbf0a6f7d, + 0xbfa2ffee, 0xb8ef1e06, + 0x8b6fc16f, 0xbfa1000e, + 0x846f836f, 0x806f7d6f, + 0xbefe00c1, 0xbeff0080, 0xe0505000, 0x781d0000, - 0xe0505080, 0x781d0100, - 0xe0505100, 0x781d0200, - 0xe0505180, 0x781d0300, - 0xbf8903f7, 0x7e008500, - 0x7e028501, 0x7e048502, - 0x7e068503, 0x807d847d, - 0x8078ff78, 0x00000200, - 0xbf0a6f7d, 0xbfa2ffee, + 0xbf890000, 0x7e008500, + 0x807d817d, 0x8078ff78, + 0x00000080, 0xbf0a6f7d, + 0xbfa2fff7, 0xbeff00c1, 0xe0505000, 0x6e1d0000, - 0xe0505080, 0x6e1d0100, - 0xe0505100, 0x6e1d0200, - 0xe0505180, 0x6e1d0300, - 0xbf8903f7, 0xbfa00034, - 0xbef600ff, 0x01000000, - 0xbeee0078, 0x8078ff78, - 0x00000400, 0xbefd0084, - 0xbf0a6f7d, 0xbfa10012, - 0xe0505000, 0x781d0000, - 0xe0505100, 0x781d0100, - 0xe0505200, 0x781d0200, - 0xe0505300, 0x781d0300, - 0xbf8903f7, 0x7e008500, - 0x7e028501, 0x7e048502, - 0x7e068503, 0x807d847d, - 0x8078ff78, 0x00000400, - 0xbf0a6f7d, 0xbfa2ffee, - 0xb8ef1e06, 0x8b6fc16f, - 0xbfa1000e, 0x846f836f, - 0x806f7d6f, 0xbefe00c1, - 0xbeff0080, 0xe0505000, - 0x781d0000, 0xbf8903f7, - 0x7e008500, 0x807d817d, - 0x8078ff78, 0x00000080, - 0xbf0a6f7d, 0xbfa2fff7, - 0xbeff00c1, 0xe0505000, - 0x6e1d0000, 0xe0505100, - 0x6e1d0100, 0xe0505200, - 0x6e1d0200, 0xe0505300, - 0x6e1d0300, 0xbf8903f7, + 0xe0505100, 0x6e1d0100, + 0xe0505200, 0x6e1d0200, + 0xe0505300, 0x6e1d0300, + 0xbf890000, 0xb8f83b05, + 0x80788178, 0xbf0d9972, + 0xbfa20002, 0x84788978, + 0xbfa00001, 0x84788a78, + 0xb8ee1e06, 0x846e8a6e, + 0x80786e78, 0x8078ff78, + 0x00000200, 0x80f8ff78, + 0x00000050, 0xbef600ff, + 0x01000000, 0xbefd00ff, + 0x0000006c, 0x80f89078, + 0xf428403a, 0xf0000000, + 0xbf890000, 0x80fd847d, + 0xbf800000, 0xbe804300, + 0xbe824302, 0x80f8a078, + 0xf42c403a, 0xf0000000, + 0xbf890000, 0x80fd887d, + 0xbf800000, 0xbe804300, + 0xbe824302, 0xbe844304, + 0xbe864306, 0x80f8c078, + 0xf430403a, 0xf0000000, + 0xbf890000, 0x80fd907d, + 0xbf800000, 0xbe804300, + 0xbe824302, 0xbe844304, + 0xbe864306, 0xbe884308, + 0xbe8a430a, 0xbe8c430c, + 0xbe8e430e, 0xbf06807d, + 0xbfa1fff0, 0xb980f801, + 0x00000000, 0xbfbd0000, 0xb8f83b05, 0x80788178, 0xbf0d9972, 0xbfa20002, 0x84788978, 0xbfa00001, 0x84788a78, 0xb8ee1e06, 0x846e8a6e, 0x80786e78, 0x8078ff78, 0x00000200, - 0x80f8ff78, 0x00000050, 0xbef600ff, 0x01000000, - 0xbefd00ff, 0x0000006c, - 0x80f89078, 0xf428403a, - 0xf0000000, 0xbf89fc07, - 0x80fd847d, 0xbf800000, - 0xbe804300, 0xbe824302, - 0x80f8a078, 0xf42c403a, - 0xf0000000, 0xbf89fc07, - 0x80fd887d, 0xbf800000, - 0xbe804300, 0xbe824302, - 0xbe844304, 0xbe864306, - 0x80f8c078, 0xf430403a, - 0xf0000000, 0xbf89fc07, - 0x80fd907d, 0xbf800000, - 0xbe804300, 0xbe824302, - 0xbe844304, 0xbe864306, - 0xbe884308, 0xbe8a430a, - 0xbe8c430c, 0xbe8e430e, - 0xbf06807d, 0xbfa1fff0, - 0xb980f801, 0x00000000, - 0xbfbd0000, 0xb8f83b05, - 0x80788178, 0xbf0d9972, - 0xbfa20002, 0x84788978, - 0xbfa00001, 0x84788a78, - 0xb8ee1e06, 0x846e8a6e, - 0x80786e78, 0x8078ff78, - 0x00000200, 0xbef600ff, - 0x01000000, 0xf4205bfa, + 0xf4205bfa, 0xf0000000, + 0x80788478, 0xf4205b3a, 0xf0000000, 0x80788478, - 0xf4205b3a, 0xf0000000, - 0x80788478, 0xf4205b7a, + 0xf4205b7a, 0xf0000000, + 0x80788478, 0xf4205c3a, 0xf0000000, 0x80788478, - 0xf4205c3a, 0xf0000000, - 0x80788478, 0xf4205c7a, + 0xf4205c7a, 0xf0000000, + 0x80788478, 0xf4205eba, 0xf0000000, 0x80788478, - 0xf4205eba, 0xf0000000, - 0x80788478, 0xf4205efa, + 0xf4205efa, 0xf0000000, + 0x80788478, 0xf4205e7a, 0xf0000000, 0x80788478, - 0xf4205e7a, 0xf0000000, - 0x80788478, 0xf4205cfa, + 0xf4205cfa, 0xf0000000, + 0x80788478, 0xf4205bba, 0xf0000000, 0x80788478, + 0xbf890000, 0xb96ef814, 0xf4205bba, 0xf0000000, - 0x80788478, 0xbf89fc07, - 0xb96ef814, 0xf4205bba, - 0xf0000000, 0x80788478, - 0xbf89fc07, 0xb96ef815, - 0xbefd006f, 0xbefe0070, - 0xbeff0071, 0x8b6f7bff, - 0x000003ff, 0xb96f4803, - 0x8b6f7bff, 0xfffff800, - 0x856f8b6f, 0xb96fa2c3, - 0xb973f801, 0xb8ee3b05, - 0x806e816e, 0xbf0d9972, - 0xbfa20002, 0x846e896e, - 0xbfa00001, 0x846e8a6e, - 0xb8ef1e06, 0x846f8a6f, - 0x806e6f6e, 0x806eff6e, - 0x00000200, 0x806e746e, - 0x826f8075, 0x8b6fff6f, - 0x0000ffff, 0xf4085c37, - 0xf8000050, 0xf4085d37, - 0xf8000060, 0xf4005e77, - 0xf8000074, 0xbf89fc07, - 0x8b6dff6d, 0x0000ffff, - 0x8bfe7e7e, 0x8bea6a6a, - 0xb8eef802, 0xbf0d866e, - 0xbfa20002, 0xb97af802, - 0xbe80486c, 0xb97af802, - 0xbe804a6c, 0xbfb10000, + 0x80788478, 0xbf890000, + 0xb96ef815, 0xbefd006f, + 0xbefe0070, 0xbeff0071, + 0xb97b4803, 0x857b8b7b, + 0xb97b22c3, 0x857b867b, + 0xb97b7443, 0xb973f801, + 0xb8ee3b05, 0x806e816e, + 0xbf0d9972, 0xbfa20002, + 0x846e896e, 0xbfa00001, + 0x846e8a6e, 0xb8ef1e06, + 0x846f8a6f, 0x806e6f6e, + 0x806eff6e, 0x00000200, + 0x806e746e, 0x826f8075, + 0x8b6fff6f, 0x0000ffff, + 0xf4085c37, 0xf8000050, + 0xf4085d37, 0xf8000060, + 0xf4005e77, 0xf8000074, + 0xbf890000, 0x8b6dff6d, + 0x0000ffff, 0x8bfe7e7e, + 0x8bea6a6a, 0xb8eef802, + 0xbf0d866e, 0xbfa20002, + 0xb97af802, 0xbe80486c, + 0xb97af802, 0xbe804a6c, + 0xbfb10000, 0xbf9f0000, 0xbf9f0000, 0xbf9f0000, 0xbf9f0000, 0xbf9f0000, - 0xbf9f0000, 0x00000000, }; static const uint32_t cwsr_trap_gfx9_4_3_hex[] = { - 0xbf820001, 0xbf8202db, + 0xbf820001, 0xbf8202dc, 0xb8f8f802, 0x8978ff78, 0x00020006, 0xb8fbf803, 0x866eff78, 0x00002000, @@ -3066,99 +3269,143 @@ static const uint32_t cwsr_trap_gfx9_4_3_hex[] = { 0xbefe007c, 0xbefc0070, 0xc0611c7a, 0x0000007c, 0xbf8cc07f, 0x80708470, - 0xbefc007e, 0x867aff7f, - 0x04000000, 0xbeef0080, - 0x876f6f7a, 0xb8f02985, - 0x80708170, 0x8e708a70, - 0x8e708170, 0xb8fb1605, - 0x807b817b, 0x8e7b847b, - 0x8e76827b, 0xbef600ff, - 0x01000000, 0xbef20174, - 0x80747074, 0x82758075, - 0xbefc0080, 0xbf800000, - 0xbe802b00, 0xbe822b02, - 0xbe842b04, 0xbe862b06, - 0xbe882b08, 0xbe8a2b0a, - 0xbe8c2b0c, 0xbe8e2b0e, - 0xc06b003a, 0x00000000, - 0xbf8cc07f, 0xc06b013a, - 0x00000010, 0xbf8cc07f, - 0xc06b023a, 0x00000020, - 0xbf8cc07f, 0xc06b033a, - 0x00000030, 0xbf8cc07f, - 0x8074c074, 0x82758075, - 0x807c907c, 0xbf0a7b7c, - 0xbf85ffe7, 0xbef40172, - 0xbef00080, 0xbefe00c1, - 0xbeff00c1, 0xbee80080, - 0xbee90080, 0xbef600ff, - 0x01000000, 0x867aff78, - 0x00400000, 0xbf850003, - 0xb8faf803, 0x897a7aff, - 0x10000000, 0xbf85004d, - 0xbe840080, 0xd2890000, + 0xbefc007e, 0xbf108080, + 0x867aff7f, 0x04000000, + 0xbeef0080, 0x876f6f7a, + 0xb8f02985, 0x80708170, + 0x8e708a70, 0x8e708170, + 0xb8fb1605, 0x807b817b, + 0x8e7b847b, 0x8e76827b, + 0xbef600ff, 0x01000000, + 0xbef20174, 0x80747074, + 0x82758075, 0xbefc0080, + 0xbf800000, 0xbe802b00, + 0xbe822b02, 0xbe842b04, + 0xbe862b06, 0xbe882b08, + 0xbe8a2b0a, 0xbe8c2b0c, + 0xbe8e2b0e, 0xc06b003a, + 0x00000000, 0xbf8cc07f, + 0xc06b013a, 0x00000010, + 0xbf8cc07f, 0xc06b023a, + 0x00000020, 0xbf8cc07f, + 0xc06b033a, 0x00000030, + 0xbf8cc07f, 0x8074c074, + 0x82758075, 0x807c907c, + 0xbf0a7b7c, 0xbf85ffe7, + 0xbef40172, 0xbef00080, + 0xbefe00c1, 0xbeff00c1, + 0xbee80080, 0xbee90080, + 0xbef600ff, 0x01000000, + 0x867aff78, 0x00400000, + 0xbf850003, 0xb8faf803, + 0x897a7aff, 0x10000000, + 0xbf85004d, 0xbe840080, + 0xd2890000, 0x00000900, + 0x80048104, 0xd2890001, 0x00000900, 0x80048104, - 0xd2890001, 0x00000900, - 0x80048104, 0xd2890002, + 0xd2890002, 0x00000900, + 0x80048104, 0xd2890003, 0x00000900, 0x80048104, - 0xd2890003, 0x00000900, + 0xc069003a, 0x00000070, + 0xbf8cc07f, 0x80709070, + 0xbf06c004, 0xbf84ffee, + 0xbe840080, 0xd2890000, + 0x00000901, 0x80048104, + 0xd2890001, 0x00000901, + 0x80048104, 0xd2890002, + 0x00000901, 0x80048104, + 0xd2890003, 0x00000901, 0x80048104, 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, 0xbf84ffee, 0xbe840080, - 0xd2890000, 0x00000901, + 0xd2890000, 0x00000902, 0x80048104, 0xd2890001, - 0x00000901, 0x80048104, - 0xd2890002, 0x00000901, + 0x00000902, 0x80048104, + 0xd2890002, 0x00000902, 0x80048104, 0xd2890003, - 0x00000901, 0x80048104, + 0x00000902, 0x80048104, 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, 0xbf84ffee, 0xbe840080, 0xd2890000, - 0x00000902, 0x80048104, - 0xd2890001, 0x00000902, + 0x00000903, 0x80048104, + 0xd2890001, 0x00000903, 0x80048104, 0xd2890002, - 0x00000902, 0x80048104, - 0xd2890003, 0x00000902, + 0x00000903, 0x80048104, + 0xd2890003, 0x00000903, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0xbf820008, + 0xe0724000, 0x701d0000, + 0xe0724100, 0x701d0100, + 0xe0724200, 0x701d0200, + 0xe0724300, 0x701d0300, + 0xbefe00c1, 0xbeff00c1, + 0xb8fb4306, 0x867bc17b, + 0xbf840064, 0xbf8a0000, + 0x867aff6f, 0x04000000, + 0xbf840060, 0x8e7b867b, + 0x8e7b827b, 0xbef6007b, + 0xb8f02985, 0x80708170, + 0x8e708a70, 0x8e708170, + 0xb8fa1605, 0x807a817a, + 0x8e7a867a, 0x80707a70, + 0x8070ff70, 0x00000080, + 0xbef600ff, 0x01000000, + 0xbefc0080, 0xd28c0002, + 0x000100c1, 0xd28d0003, + 0x000204c1, 0x867aff78, + 0x00400000, 0xbf850003, + 0xb8faf803, 0x897a7aff, + 0x10000000, 0xbf850030, + 0x24040682, 0xd86e4000, + 0x00000002, 0xbf8cc07f, + 0xbe840080, 0xd2890000, + 0x00000900, 0x80048104, + 0xd2890001, 0x00000900, + 0x80048104, 0xd2890002, + 0x00000900, 0x80048104, + 0xd2890003, 0x00000900, 0x80048104, 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, 0xbf84ffee, 0xbe840080, - 0xd2890000, 0x00000903, + 0xd2890000, 0x00000901, 0x80048104, 0xd2890001, - 0x00000903, 0x80048104, - 0xd2890002, 0x00000903, + 0x00000901, 0x80048104, + 0xd2890002, 0x00000901, 0x80048104, 0xd2890003, - 0x00000903, 0x80048104, + 0x00000901, 0x80048104, 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, 0xbf84ffee, - 0xbf820008, 0xe0724000, - 0x701d0000, 0xe0724100, - 0x701d0100, 0xe0724200, - 0x701d0200, 0xe0724300, - 0x701d0300, 0xbefe00c1, - 0xbeff00c1, 0xb8fb4306, - 0x867bc17b, 0xbf840064, - 0xbf8a0000, 0x867aff6f, - 0x04000000, 0xbf840060, - 0x8e7b867b, 0x8e7b827b, - 0xbef6007b, 0xb8f02985, - 0x80708170, 0x8e708a70, - 0x8e708170, 0xb8fa1605, - 0x807a817a, 0x8e7a867a, - 0x80707a70, 0x8070ff70, - 0x00000080, 0xbef600ff, - 0x01000000, 0xbefc0080, - 0xd28c0002, 0x000100c1, - 0xd28d0003, 0x000204c1, + 0x680404ff, 0x00000200, + 0xd0c9006a, 0x0000f702, + 0xbf87ffd2, 0xbf820015, + 0xd1060002, 0x00011103, + 0x7e0602ff, 0x00000200, + 0xbefc00ff, 0x00010000, + 0xbe800077, 0x8677ff77, + 0xff7fffff, 0x8777ff77, + 0x00058000, 0xd8ec0000, + 0x00000002, 0xbf8cc07f, + 0xe0765000, 0x701d0002, + 0x68040702, 0xd0c9006a, + 0x0000f702, 0xbf87fff7, + 0xbef70000, 0xbef000ff, + 0x00000400, 0xbefe00c1, + 0xbeff00c1, 0xb8fb2b05, + 0x807b817b, 0x8e7b827b, + 0xbef600ff, 0x01000000, + 0xbefc0084, 0xbf0a7b7c, + 0xbf84006d, 0xbf11017c, + 0x807bff7b, 0x00001000, 0x867aff78, 0x00400000, 0xbf850003, 0xb8faf803, 0x897a7aff, 0x10000000, - 0xbf850030, 0x24040682, - 0xd86e4000, 0x00000002, - 0xbf8cc07f, 0xbe840080, + 0xbf850051, 0xbe840080, 0xd2890000, 0x00000900, 0x80048104, 0xd2890001, 0x00000900, 0x80048104, @@ -3177,31 +3424,51 @@ static const uint32_t cwsr_trap_gfx9_4_3_hex[] = { 0x80048104, 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, - 0xbf84ffee, 0x680404ff, - 0x00000200, 0xd0c9006a, - 0x0000f702, 0xbf87ffd2, - 0xbf820015, 0xd1060002, - 0x00011103, 0x7e0602ff, - 0x00000200, 0xbefc00ff, - 0x00010000, 0xbe800077, - 0x8677ff77, 0xff7fffff, - 0x8777ff77, 0x00058000, - 0xd8ec0000, 0x00000002, - 0xbf8cc07f, 0xe0765000, - 0x701d0002, 0x68040702, - 0xd0c9006a, 0x0000f702, - 0xbf87fff7, 0xbef70000, - 0xbef000ff, 0x00000400, - 0xbefe00c1, 0xbeff00c1, - 0xb8fb2b05, 0x807b817b, - 0x8e7b827b, 0xbef600ff, - 0x01000000, 0xbefc0084, - 0xbf0a7b7c, 0xbf84006d, - 0xbf11017c, 0x807bff7b, - 0x00001000, 0x867aff78, + 0xbf84ffee, 0xbe840080, + 0xd2890000, 0x00000902, + 0x80048104, 0xd2890001, + 0x00000902, 0x80048104, + 0xd2890002, 0x00000902, + 0x80048104, 0xd2890003, + 0x00000902, 0x80048104, + 0xc069003a, 0x00000070, + 0xbf8cc07f, 0x80709070, + 0xbf06c004, 0xbf84ffee, + 0xbe840080, 0xd2890000, + 0x00000903, 0x80048104, + 0xd2890001, 0x00000903, + 0x80048104, 0xd2890002, + 0x00000903, 0x80048104, + 0xd2890003, 0x00000903, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0x807c847c, + 0xbf0a7b7c, 0xbf85ffb1, + 0xbf9c0000, 0xbf820012, + 0x7e000300, 0x7e020301, + 0x7e040302, 0x7e060303, + 0xe0724000, 0x701d0000, + 0xe0724100, 0x701d0100, + 0xe0724200, 0x701d0200, + 0xe0724300, 0x701d0300, + 0x807c847c, 0x8070ff70, + 0x00000400, 0xbf0a7b7c, + 0xbf85ffef, 0xbf9c0000, + 0xb8fb2985, 0x807b817b, + 0x8e7b837b, 0xb8fa2b05, + 0x807a817a, 0x8e7a827a, + 0x80fb7a7b, 0x867b7b7b, + 0xbf84007a, 0x807bff7b, + 0x00001000, 0xbefc0080, + 0xbf11017c, 0x867aff78, 0x00400000, 0xbf850003, 0xb8faf803, 0x897a7aff, - 0x10000000, 0xbf850051, + 0x10000000, 0xbf850059, + 0xd3d84000, 0x18000100, + 0xd3d84001, 0x18000101, + 0xd3d84002, 0x18000102, + 0xd3d84003, 0x18000103, 0xbe840080, 0xd2890000, 0x00000900, 0x80048104, 0xd2890001, 0x00000900, @@ -3241,31 +3508,913 @@ static const uint32_t cwsr_trap_gfx9_4_3_hex[] = { 0xbf8cc07f, 0x80709070, 0xbf06c004, 0xbf84ffee, 0x807c847c, 0xbf0a7b7c, - 0xbf85ffb1, 0xbf9c0000, - 0xbf820012, 0x7e000300, - 0x7e020301, 0x7e040302, - 0x7e060303, 0xe0724000, + 0xbf85ffa9, 0xbf9c0000, + 0xbf820016, 0xd3d84000, + 0x18000100, 0xd3d84001, + 0x18000101, 0xd3d84002, + 0x18000102, 0xd3d84003, + 0x18000103, 0xe0724000, 0x701d0000, 0xe0724100, 0x701d0100, 0xe0724200, 0x701d0200, 0xe0724300, 0x701d0300, 0x807c847c, 0x8070ff70, 0x00000400, - 0xbf0a7b7c, 0xbf85ffef, - 0xbf9c0000, 0xb8fb2985, - 0x807b817b, 0x8e7b837b, - 0xb8fa2b05, 0x807a817a, - 0x8e7a827a, 0x80fb7a7b, - 0x867b7b7b, 0xbf84007a, + 0xbf0a7b7c, 0xbf85ffeb, + 0xbf9c0000, 0xbf8200ee, + 0xbef4007e, 0x8675ff7f, + 0x0000ffff, 0x8775ff75, + 0x00040000, 0xbef60080, + 0xbef700ff, 0x00807fac, + 0x866eff7f, 0x04000000, + 0xbf84001f, 0xbefe00c1, + 0xbeff00c1, 0xb8ef4306, + 0x866fc16f, 0xbf84001a, + 0x8e6f866f, 0x8e6f826f, + 0xbef6006f, 0xb8f82985, + 0x80788178, 0x8e788a78, + 0x8e788178, 0xb8ee1605, + 0x806e816e, 0x8e6e866e, + 0x80786e78, 0x8078ff78, + 0x00000080, 0xbef600ff, + 0x01000000, 0xbefc0080, + 0xe0510000, 0x781d0000, + 0xe0510100, 0x781d0000, + 0x807cff7c, 0x00000200, + 0x8078ff78, 0x00000200, + 0xbf0a6f7c, 0xbf85fff6, + 0xbefe00c1, 0xbeff00c1, + 0xbef600ff, 0x01000000, + 0xb8ef2b05, 0x806f816f, + 0x8e6f826f, 0x806fff6f, + 0x00008000, 0xbef80080, + 0xbeee0078, 0x8078ff78, + 0x00000400, 0xbefc0084, + 0xbf11087c, 0xe0524000, + 0x781d0000, 0xe0524100, + 0x781d0100, 0xe0524200, + 0x781d0200, 0xe0524300, + 0x781d0300, 0xbf8c0f70, + 0x7e000300, 0x7e020301, + 0x7e040302, 0x7e060303, + 0x807c847c, 0x8078ff78, + 0x00000400, 0xbf0a6f7c, + 0xbf85ffee, 0xb8ef2985, + 0x806f816f, 0x8e6f836f, + 0xb8f92b05, 0x80798179, + 0x8e798279, 0x80ef796f, + 0x866f6f6f, 0xbf84001a, + 0x806fff6f, 0x00008000, + 0xbefc0080, 0xbf11087c, + 0xe0524000, 0x781d0000, + 0xe0524100, 0x781d0100, + 0xe0524200, 0x781d0200, + 0xe0524300, 0x781d0300, + 0xbf8c0f70, 0xd3d94000, + 0x18000100, 0xd3d94001, + 0x18000101, 0xd3d94002, + 0x18000102, 0xd3d94003, + 0x18000103, 0x807c847c, + 0x8078ff78, 0x00000400, + 0xbf0a6f7c, 0xbf85ffea, + 0xbf9c0000, 0xe0524000, + 0x6e1d0000, 0xe0524100, + 0x6e1d0100, 0xe0524200, + 0x6e1d0200, 0xe0524300, + 0x6e1d0300, 0xbf8c0f70, + 0xb8f82985, 0x80788178, + 0x8e788a78, 0x8e788178, + 0xb8ee1605, 0x806e816e, + 0x8e6e866e, 0x80786e78, + 0x80f8c078, 0xb8ef1605, + 0x806f816f, 0x8e6f846f, + 0x8e76826f, 0xbef600ff, + 0x01000000, 0xbefc006f, + 0xc031003a, 0x00000078, + 0x80f8c078, 0xbf8cc07f, + 0x80fc907c, 0xbf800000, + 0xbe802d00, 0xbe822d02, + 0xbe842d04, 0xbe862d06, + 0xbe882d08, 0xbe8a2d0a, + 0xbe8c2d0c, 0xbe8e2d0e, + 0xbf06807c, 0xbf84fff0, + 0xb8f82985, 0x80788178, + 0x8e788a78, 0x8e788178, + 0xb8ee1605, 0x806e816e, + 0x8e6e866e, 0x80786e78, + 0xbef60084, 0xbef600ff, + 0x01000000, 0xc0211bfa, + 0x00000078, 0x80788478, + 0xc0211b3a, 0x00000078, + 0x80788478, 0xc0211b7a, + 0x00000078, 0x80788478, + 0xc0211c3a, 0x00000078, + 0x80788478, 0xc0211c7a, + 0x00000078, 0x80788478, + 0xc0211eba, 0x00000078, + 0x80788478, 0xc0211efa, + 0x00000078, 0x80788478, + 0xc0211a3a, 0x00000078, + 0x80788478, 0xc0211a7a, + 0x00000078, 0x80788478, + 0xc0211cfa, 0x00000078, + 0x80788478, 0xbf8cc07f, + 0xbefc006f, 0xbefe0070, + 0xbeff0071, 0x866f7bff, + 0x000003ff, 0xb96f4803, + 0x866f7bff, 0xfffff800, + 0x8f6f8b6f, 0xb96fa2c3, + 0xb973f801, 0xb8ee2985, + 0x806e816e, 0x8e6e8a6e, + 0x8e6e816e, 0xb8ef1605, + 0x806f816f, 0x8e6f866f, + 0x806e6f6e, 0x806e746e, + 0x826f8075, 0x866fff6f, + 0x0000ffff, 0xc00b1c37, + 0x00000050, 0xc00b1d37, + 0x00000060, 0xc0031e77, + 0x00000074, 0xbf8cc07f, + 0x8f6e8b79, 0x866eff6e, + 0x001f8000, 0xb96ef807, + 0x866dff6d, 0x0000ffff, + 0x86fe7e7e, 0x86ea6a6a, + 0x8f6e837a, 0xb96ee0c2, + 0xbf800002, 0xb97a0002, + 0xbf8a0000, 0xbe801f6c, + 0xbf9b0000, 0x00000000, +}; + +static const uint32_t cwsr_trap_gfx12_hex[] = { + 0xbfa00001, 0xbfa002a2, + 0xb0804009, 0xb8f8f804, + 0x9178ff78, 0x00008c00, + 0xb8fbf811, 0x8b6eff78, + 0x00004000, 0xbfa10008, + 0x8b6eff7b, 0x00000080, + 0xbfa20018, 0x8b6ea07b, + 0xbfa20042, 0xbf830010, + 0xb8fbf811, 0xbfa0fffb, + 0x8b6eff7b, 0x00000bd0, + 0xbfa20010, 0xb8eef812, + 0x8b6f8f7b, 0xbfa10002, + 0x8c6eff6e, 0x00000080, + 0xb8eff813, 0x8b6e6e6f, + 0xbfa20008, 0x8b6eff6d, + 0xf0000000, 0xbfa20005, + 0x8b6fff6f, 0x00000200, + 0xbfa20002, 0x8b6ea07b, + 0xbfa2002c, 0xbefa4d82, + 0xbf8a0000, 0x84fa887a, + 0xbf0d8f7b, 0xbfa10002, + 0x8c7bff7b, 0xffff0000, + 0xf4601bbd, 0xf8000010, + 0xbf8a0000, 0x846e976e, + 0x9177ff77, 0x00800000, + 0x8c776e77, 0xf4603bbd, + 0xf8000000, 0xbf8a0000, + 0xf4603ebd, 0xf8000008, + 0xbf8a0000, 0x8bee6e6e, + 0xbfa10001, 0xbe80486e, + 0x8b6eff6d, 0xf0000000, + 0xbfa20009, 0xb8eef811, + 0x8b6eff6e, 0x00000080, + 0xbfa20007, 0x8c78ff78, + 0x00004000, 0x80ec886c, + 0x82ed806d, 0xbfa00002, + 0x806c846c, 0x826d806d, + 0x8b6dff6d, 0x0000ffff, + 0x8bfe7e7e, 0x8bea6a6a, + 0x85788978, 0xb9783244, + 0xbe804a6c, 0xb8faf802, + 0xbf0d987a, 0xbfa10001, + 0xbfb00000, 0x8b6dff6d, + 0x0000ffff, 0xbefa0080, + 0xb97a0151, 0xbeee007e, + 0xbeef007f, 0xbefe0180, + 0xbefe4d84, 0xbf8a0000, + 0x8b7aff7f, 0x04000000, + 0x847a857a, 0x8c6d7a6d, + 0xbefa007e, 0x8b7bff7f, + 0x0000ffff, 0xbefe00c1, + 0xbeff00c1, 0xee0a407a, + 0x000c0000, 0x00000000, + 0x7e000280, 0xbefe007a, + 0xbeff007b, 0xb8fb0742, + 0x847b997b, 0xb8fa3b05, + 0x807a817a, 0xbf0d997b, + 0xbfa20002, 0x847a897a, + 0xbfa00001, 0x847a8a7a, + 0xb8fb1e06, 0x847b8a7b, + 0x807a7b7a, 0x8b7bff7f, + 0x0000ffff, 0x807aff7a, + 0x00000200, 0x807a7e7a, + 0x827b807b, 0xd7610000, + 0x00010870, 0xd7610000, + 0x00010a71, 0xd7610000, + 0x00010c72, 0xd7610000, + 0x00010e73, 0xd7610000, + 0x00011074, 0xd7610000, + 0x00011275, 0xd7610000, + 0x00011476, 0xd7610000, + 0x00011677, 0xd7610000, + 0x00011a79, 0xd7610000, + 0x00011c7e, 0xd7610000, + 0x00011e7f, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xbefe00ff, + 0x00003fff, 0xbeff0080, + 0xee0a407a, 0x000c0000, + 0x00004000, 0xd760007a, + 0x00011d00, 0xd760007b, + 0x00011f00, 0xbefe007a, + 0xbeff007b, 0xbef4007e, + 0x8b75ff7f, 0x0000ffff, + 0x8c75ff75, 0x00040000, + 0xbef60080, 0xbef700ff, + 0x10807fac, 0xbef1007d, + 0xbef00080, 0xb8f30742, + 0x84739973, 0xbefe00c1, + 0x857d9973, 0x8b7d817d, + 0xbf06817d, 0xbfa20002, + 0xbeff0080, 0xbfa00002, + 0xbeff00c1, 0xbfa0000c, + 0xbef600ff, 0x01000000, + 0xc4068070, 0x008ce801, + 0x00008000, 0xc4068070, + 0x008ce802, 0x00010000, + 0xc4068070, 0x008ce803, + 0x00018000, 0xbfa0000b, + 0xbef600ff, 0x01000000, + 0xc4068070, 0x008ce801, + 0x00010000, 0xc4068070, + 0x008ce802, 0x00020000, + 0xc4068070, 0x008ce803, + 0x00030000, 0xb8f03b05, + 0x80708170, 0xbf0d9973, + 0xbfa20002, 0x84708970, + 0xbfa00001, 0x84708a70, + 0xb8fa1e06, 0x847a8a7a, + 0x80707a70, 0x8070ff70, + 0x00000200, 0xbef600ff, + 0x01000000, 0x7e000280, + 0x7e020280, 0x7e040280, + 0xbe804ec2, 0xbf94fffe, + 0xb8faf804, 0x8b7a847a, + 0x91788478, 0x8c787a78, + 0x917aff6d, 0x80000000, + 0xd7610002, 0x00010071, + 0xd7610002, 0x0001026c, + 0xd7610002, 0x0001047a, + 0xd7610002, 0x0001066e, + 0xd7610002, 0x0001086f, + 0xd7610002, 0x00010a78, + 0xd7610002, 0x00010e7b, + 0xd8500000, 0x00000000, + 0xd8500000, 0x00000000, + 0xd8500000, 0x00000000, + 0xd8500000, 0x00000000, + 0xd8500000, 0x00000000, + 0xd8500000, 0x00000000, + 0xd8500000, 0x00000000, + 0xd8500000, 0x00000000, + 0xb8faf811, 0xd7610002, + 0x00010c7a, 0xb8faf801, + 0xd7610002, 0x0001107a, + 0xb8faf814, 0xd7610002, + 0x0001127a, 0xb8faf815, + 0xd7610002, 0x0001147a, + 0xb8faf812, 0xd7610002, + 0x0001167a, 0xb8faf813, + 0xd7610002, 0x0001187a, + 0xb8faf802, 0xd7610002, + 0x00011a7a, 0xbefa50c1, + 0xbfc70000, 0xd7610002, + 0x00011c7a, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xbefe00ff, + 0x0000ffff, 0xbeff0080, + 0xc4068070, 0x008ce802, + 0x00000000, 0xbefe00c1, + 0xb8f03b05, 0x80708170, + 0xbf0d9973, 0xbfa20002, + 0x84708970, 0xbfa00001, + 0x84708a70, 0xb8fa1e06, + 0x847a8a7a, 0x80707a70, + 0xbef600ff, 0x01000000, + 0xbef90080, 0xbefd0080, + 0xbf800000, 0xbe804100, + 0xbe824102, 0xbe844104, + 0xbe864106, 0xbe884108, + 0xbe8a410a, 0xbe8c410c, + 0xbe8e410e, 0xbf068079, + 0xbfa10032, 0xd7610002, + 0x00010000, 0xd7610002, + 0x00010201, 0xd7610002, + 0x00010402, 0xd7610002, + 0x00010603, 0xd7610002, + 0x00010804, 0xd7610002, + 0x00010a05, 0xd7610002, + 0x00010c06, 0xd7610002, + 0x00010e07, 0xd7610002, + 0x00011008, 0xd7610002, + 0x00011209, 0xd7610002, + 0x0001140a, 0xd7610002, + 0x0001160b, 0xd7610002, + 0x0001180c, 0xd7610002, + 0x00011a0d, 0xd7610002, + 0x00011c0e, 0xd7610002, + 0x00011e0f, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0x80799079, + 0xbfa00038, 0xd7610002, + 0x00012000, 0xd7610002, + 0x00012201, 0xd7610002, + 0x00012402, 0xd7610002, + 0x00012603, 0xd7610002, + 0x00012804, 0xd7610002, + 0x00012a05, 0xd7610002, + 0x00012c06, 0xd7610002, + 0x00012e07, 0xd7610002, + 0x00013008, 0xd7610002, + 0x00013209, 0xd7610002, + 0x0001340a, 0xd7610002, + 0x0001360b, 0xd7610002, + 0x0001380c, 0xd7610002, + 0x00013a0d, 0xd7610002, + 0x00013c0e, 0xd7610002, + 0x00013e0f, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0x80799079, + 0xc4068070, 0x008ce802, + 0x00000000, 0x8070ff70, + 0x00000080, 0xbef90080, + 0x7e040280, 0x807d907d, + 0xbf0aff7d, 0x00000060, + 0xbfa2ff88, 0xbe804100, + 0xbe824102, 0xbe844104, + 0xbe864106, 0xbe884108, + 0xbe8a410a, 0xd7610002, + 0x00010000, 0xd7610002, + 0x00010201, 0xd7610002, + 0x00010402, 0xd7610002, + 0x00010603, 0xd7610002, + 0x00010804, 0xd7610002, + 0x00010a05, 0xd7610002, + 0x00010c06, 0xd7610002, + 0x00010e07, 0xd7610002, + 0x00011008, 0xd7610002, + 0x00011209, 0xd7610002, + 0x0001140a, 0xd7610002, + 0x0001160b, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xc4068070, + 0x008ce802, 0x00000000, + 0xbefe00c1, 0x857d9973, + 0x8b7d817d, 0xbf06817d, + 0xbfa20002, 0xbeff0080, + 0xbfa00001, 0xbeff00c1, + 0xb8fb4306, 0x8b7bc17b, + 0xbfa10044, 0x8b7aff6d, + 0x80000000, 0xbfa10041, + 0x847b897b, 0xbef6007b, + 0xb8f03b05, 0x80708170, + 0xbf0d9973, 0xbfa20002, + 0x84708970, 0xbfa00001, + 0x84708a70, 0xb8fa1e06, + 0x847a8a7a, 0x80707a70, + 0x8070ff70, 0x00000200, + 0x8070ff70, 0x00000080, + 0xbef600ff, 0x01000000, + 0xd71f0000, 0x000100c1, + 0xd7200000, 0x000200c1, + 0x16000084, 0x857d9973, + 0x8b7d817d, 0xbf06817d, + 0xbefd0080, 0xbfa20013, + 0xbe8300ff, 0x00000080, + 0xbf800000, 0xbf800000, + 0xbf800000, 0xd8d80000, + 0x01000000, 0xbf8a0000, + 0xc4068070, 0x008ce801, + 0x00000000, 0x807d037d, + 0x80700370, 0xd5250000, + 0x0001ff00, 0x00000080, + 0xbf0a7b7d, 0xbfa2fff3, + 0xbfa00012, 0xbe8300ff, + 0x00000100, 0xbf800000, + 0xbf800000, 0xbf800000, + 0xd8d80000, 0x01000000, + 0xbf8a0000, 0xc4068070, + 0x008ce801, 0x00000000, + 0x807d037d, 0x80700370, + 0xd5250000, 0x0001ff00, + 0x00000100, 0xbf0a7b7d, + 0xbfa2fff3, 0xbefe00c1, + 0x857d9973, 0x8b7d817d, + 0xbf06817d, 0xbfa20004, + 0xbef000ff, 0x00000200, + 0xbeff0080, 0xbfa00003, + 0xbef000ff, 0x00000400, + 0xbeff00c1, 0xb8fb3b05, + 0x807b817b, 0x847b827b, + 0x857d9973, 0x8b7d817d, + 0xbf06817d, 0xbfa2001b, + 0xbef600ff, 0x01000000, + 0xbefd0084, 0xbf0a7b7d, + 0xbfa10040, 0x7e008700, + 0x7e028701, 0x7e048702, + 0x7e068703, 0xc4068070, + 0x008ce800, 0x00000000, + 0xc4068070, 0x008ce801, + 0x00008000, 0xc4068070, + 0x008ce802, 0x00010000, + 0xc4068070, 0x008ce803, + 0x00018000, 0x807d847d, + 0x8070ff70, 0x00000200, + 0xbf0a7b7d, 0xbfa2ffeb, + 0xbfa0002a, 0xbef600ff, + 0x01000000, 0xbefd0084, + 0xbf0a7b7d, 0xbfa10015, + 0x7e008700, 0x7e028701, + 0x7e048702, 0x7e068703, + 0xc4068070, 0x008ce800, + 0x00000000, 0xc4068070, + 0x008ce801, 0x00010000, + 0xc4068070, 0x008ce802, + 0x00020000, 0xc4068070, + 0x008ce803, 0x00030000, + 0x807d847d, 0x8070ff70, + 0x00000400, 0xbf0a7b7d, + 0xbfa2ffeb, 0xb8fb1e06, + 0x8b7bc17b, 0xbfa1000d, + 0x847b837b, 0x807b7d7b, + 0xbefe00c1, 0xbeff0080, + 0x7e008700, 0xc4068070, + 0x008ce800, 0x00000000, + 0x807d817d, 0x8070ff70, + 0x00000080, 0xbf0a7b7d, + 0xbfa2fff7, 0xbfa0016e, + 0xbef4007e, 0x8b75ff7f, + 0x0000ffff, 0x8c75ff75, + 0x00040000, 0xbef60080, + 0xbef700ff, 0x10807fac, + 0xbef1007f, 0xb8f20742, + 0x84729972, 0x8b6eff7f, + 0x04000000, 0xbfa1003b, + 0xbefe00c1, 0x857d9972, + 0x8b7d817d, 0xbf06817d, + 0xbfa20002, 0xbeff0080, + 0xbfa00001, 0xbeff00c1, + 0xb8ef4306, 0x8b6fc16f, + 0xbfa10030, 0x846f896f, + 0xbef6006f, 0xb8f83b05, + 0x80788178, 0xbf0d9972, + 0xbfa20002, 0x84788978, + 0xbfa00001, 0x84788a78, + 0xb8ee1e06, 0x846e8a6e, + 0x80786e78, 0x8078ff78, + 0x00000200, 0x8078ff78, + 0x00000080, 0xbef600ff, + 0x01000000, 0x857d9972, + 0x8b7d817d, 0xbf06817d, + 0xbefd0080, 0xbfa2000d, + 0xc4050078, 0x0080e800, + 0x00000000, 0xbf8a0000, + 0xdac00000, 0x00000000, + 0x807dff7d, 0x00000080, + 0x8078ff78, 0x00000080, + 0xbf0a6f7d, 0xbfa2fff4, + 0xbfa0000c, 0xc4050078, + 0x0080e800, 0x00000000, + 0xbf8a0000, 0xdac00000, + 0x00000000, 0x807dff7d, + 0x00000100, 0x8078ff78, + 0x00000100, 0xbf0a6f7d, + 0xbfa2fff4, 0xbef80080, + 0xbefe00c1, 0x857d9972, + 0x8b7d817d, 0xbf06817d, + 0xbfa20002, 0xbeff0080, + 0xbfa00001, 0xbeff00c1, + 0xb8ef3b05, 0x806f816f, + 0x846f826f, 0x857d9972, + 0x8b7d817d, 0xbf06817d, + 0xbfa2002c, 0xbef600ff, + 0x01000000, 0xbeee0078, + 0x8078ff78, 0x00000200, + 0xbefd0084, 0xbf0a6f7d, + 0xbfa10061, 0xc4050078, + 0x008ce800, 0x00000000, + 0xc4050078, 0x008ce801, + 0x00008000, 0xc4050078, + 0x008ce802, 0x00010000, + 0xc4050078, 0x008ce803, + 0x00018000, 0xbf8a0000, + 0x7e008500, 0x7e028501, + 0x7e048502, 0x7e068503, + 0x807d847d, 0x8078ff78, + 0x00000200, 0xbf0a6f7d, + 0xbfa2ffea, 0xc405006e, + 0x008ce800, 0x00000000, + 0xc405006e, 0x008ce801, + 0x00008000, 0xc405006e, + 0x008ce802, 0x00010000, + 0xc405006e, 0x008ce803, + 0x00018000, 0xbf8a0000, + 0xbfa0003d, 0xbef600ff, + 0x01000000, 0xbeee0078, + 0x8078ff78, 0x00000400, + 0xbefd0084, 0xbf0a6f7d, + 0xbfa10016, 0xc4050078, + 0x008ce800, 0x00000000, + 0xc4050078, 0x008ce801, + 0x00010000, 0xc4050078, + 0x008ce802, 0x00020000, + 0xc4050078, 0x008ce803, + 0x00030000, 0xbf8a0000, + 0x7e008500, 0x7e028501, + 0x7e048502, 0x7e068503, + 0x807d847d, 0x8078ff78, + 0x00000400, 0xbf0a6f7d, + 0xbfa2ffea, 0xb8ef1e06, + 0x8b6fc16f, 0xbfa1000f, + 0x846f836f, 0x806f7d6f, + 0xbefe00c1, 0xbeff0080, + 0xc4050078, 0x008ce800, + 0x00000000, 0xbf8a0000, + 0x7e008500, 0x807d817d, + 0x8078ff78, 0x00000080, + 0xbf0a6f7d, 0xbfa2fff6, + 0xbeff00c1, 0xc405006e, + 0x008ce800, 0x00000000, + 0xc405006e, 0x008ce801, + 0x00010000, 0xc405006e, + 0x008ce802, 0x00020000, + 0xc405006e, 0x008ce803, + 0x00030000, 0xbf8a0000, + 0xb8f83b05, 0x80788178, + 0xbf0d9972, 0xbfa20002, + 0x84788978, 0xbfa00001, + 0x84788a78, 0xb8ee1e06, + 0x846e8a6e, 0x80786e78, + 0x8078ff78, 0x00000200, + 0x80f8ff78, 0x00000050, + 0xbef600ff, 0x01000000, + 0xbefd00ff, 0x0000006c, + 0x80f89078, 0xf462403a, + 0xf0000000, 0xbf8a0000, + 0x80fd847d, 0xbf800000, + 0xbe804300, 0xbe824302, + 0x80f8a078, 0xf462603a, + 0xf0000000, 0xbf8a0000, + 0x80fd887d, 0xbf800000, + 0xbe804300, 0xbe824302, + 0xbe844304, 0xbe864306, + 0x80f8c078, 0xf462803a, + 0xf0000000, 0xbf8a0000, + 0x80fd907d, 0xbf800000, + 0xbe804300, 0xbe824302, + 0xbe844304, 0xbe864306, + 0xbe884308, 0xbe8a430a, + 0xbe8c430c, 0xbe8e430e, + 0xbf06807d, 0xbfa1fff0, + 0xb980f801, 0x00000000, + 0xb8f83b05, 0x80788178, + 0xbf0d9972, 0xbfa20002, + 0x84788978, 0xbfa00001, + 0x84788a78, 0xb8ee1e06, + 0x846e8a6e, 0x80786e78, + 0x8078ff78, 0x00000200, + 0xbef600ff, 0x01000000, + 0xbeff0071, 0xf4621bfa, + 0xf0000000, 0x80788478, + 0xf4621b3a, 0xf0000000, + 0x80788478, 0xf4621b7a, + 0xf0000000, 0x80788478, + 0xf4621c3a, 0xf0000000, + 0x80788478, 0xf4621c7a, + 0xf0000000, 0x80788478, + 0xf4621eba, 0xf0000000, + 0x80788478, 0xf4621efa, + 0xf0000000, 0x80788478, + 0xf4621e7a, 0xf0000000, + 0x80788478, 0xf4621cfa, + 0xf0000000, 0x80788478, + 0xf4621bba, 0xf0000000, + 0x80788478, 0xbf8a0000, + 0xb96ef814, 0xf4621bba, + 0xf0000000, 0x80788478, + 0xbf8a0000, 0xb96ef815, + 0xf4621bba, 0xf0000000, + 0x80788478, 0xbf8a0000, + 0xb96ef812, 0xf4621bba, + 0xf0000000, 0x80788478, + 0xbf8a0000, 0xb96ef813, + 0x8b6eff7f, 0x04000000, + 0xbfa1000d, 0x80788478, + 0xf4621bba, 0xf0000000, + 0x80788478, 0xbf8a0000, + 0xbf0d806e, 0xbfa10006, + 0x856e906e, 0x8b6e6e6e, + 0xbfa10003, 0xbe804ec1, + 0x816ec16e, 0xbfa0fffb, + 0xbefd006f, 0xbefe0070, + 0xbeff0071, 0xb97b2011, + 0x857b867b, 0xb97b0191, + 0x857b827b, 0xb97bba11, + 0xb973f801, 0xb8ee3b05, + 0x806e816e, 0xbf0d9972, + 0xbfa20002, 0x846e896e, + 0xbfa00001, 0x846e8a6e, + 0xb8ef1e06, 0x846f8a6f, + 0x806e6f6e, 0x806eff6e, + 0x00000200, 0x806e746e, + 0x826f8075, 0x8b6fff6f, + 0x0000ffff, 0xf4605c37, + 0xf8000050, 0xf4605d37, + 0xf8000060, 0xf4601e77, + 0xf8000074, 0xbf8a0000, + 0x8b6dff6d, 0x0000ffff, + 0x8bfe7e7e, 0x8bea6a6a, + 0xb97af804, 0xbe804ec2, + 0xbf94fffe, 0xbe804a6c, + 0xbe804ec2, 0xbf94fffe, + 0xbfb10000, 0xbf9f0000, + 0xbf9f0000, 0xbf9f0000, + 0xbf9f0000, 0xbf9f0000, +}; + +static const uint32_t cwsr_trap_gfx9_5_0_hex[] = { + 0xbf820001, 0xbf8202ca, + 0xb8f8f802, 0x8978ff78, + 0x00020006, 0xb8fbf803, + 0x866eff78, 0x00002000, + 0xbf840009, 0x866eff6d, + 0x00ff0000, 0xbf85001a, + 0x866eff7b, 0x00000400, + 0xbf850051, 0xbf8e0010, + 0xb8fbf803, 0xbf82fffa, + 0x866eff7b, 0x03c00900, + 0xbf850011, 0x866eff7b, + 0x000071ff, 0xbf840008, + 0x866fff7b, 0x00007080, + 0xbf840001, 0xbeee1a87, + 0xb8eff801, 0x8e6e8c6e, + 0x866e6f6e, 0xbf850006, + 0x866eff6d, 0x00ff0000, + 0xbf850003, 0x866eff7b, + 0x00000400, 0xbf85003a, + 0xb8faf807, 0x867aff7a, + 0x001f8000, 0x8e7a8b7a, + 0x8979ff79, 0xfc000000, + 0x87797a79, 0xba7ff807, + 0x00000000, 0xb8faf812, + 0xb8fbf813, 0x8efa887a, + 0xbf0d8f7b, 0xbf840002, + 0x877bff7b, 0xffff0000, + 0xc0031bbd, 0x00000010, + 0xbf8cc07f, 0x8e6e976e, + 0x8979ff79, 0x00800000, + 0x87796e79, 0xc0071bbd, + 0x00000000, 0xbf8cc07f, + 0xc0071ebd, 0x00000008, + 0xbf8cc07f, 0x86ee6e6e, + 0xbf840001, 0xbe801d6e, + 0x866eff6d, 0x01ff0000, + 0xbf850005, 0x8778ff78, + 0x00002000, 0x80ec886c, + 0x82ed806d, 0xbf820005, + 0x866eff6d, 0x01000000, + 0xbf850002, 0x806c846c, + 0x826d806d, 0x866dff6d, + 0x0000ffff, 0x8f7a8b79, + 0x867aff7a, 0x001f8000, + 0xb97af807, 0x86fe7e7e, + 0x86ea6a6a, 0x8f6e8378, + 0xb96ee0c2, 0xbf800002, + 0xb9780002, 0xbe801f6c, + 0x866dff6d, 0x0000ffff, + 0xbefa0080, 0xb97a0283, + 0xb8faf807, 0x867aff7a, + 0x001f8000, 0x8e7a8b7a, + 0x8979ff79, 0xfc000000, + 0x87797a79, 0xba7ff807, + 0x00000000, 0xbeee007e, + 0xbeef007f, 0xbefe0180, + 0xbf900004, 0x877a8478, + 0xb97af802, 0xbf8e0002, + 0xbf88fffe, 0xb8fa2985, + 0x807a817a, 0x8e7a8a7a, + 0x8e7a817a, 0xb8fb1605, + 0x807b817b, 0x8e7b867b, + 0x807a7b7a, 0x807a7e7a, + 0x827b807f, 0x867bff7b, + 0x0000ffff, 0xc04b1c3d, + 0x00000050, 0xbf8cc07f, + 0xc04b1d3d, 0x00000060, + 0xbf8cc07f, 0xc0431e7d, + 0x00000074, 0xbf8cc07f, + 0xbef4007e, 0x8675ff7f, + 0x0000ffff, 0x8775ff75, + 0x00040000, 0xbef60080, + 0xbef700ff, 0x00807fac, + 0xbef1007c, 0xbef00080, + 0xb8f02985, 0x80708170, + 0x8e708a70, 0x8e708170, + 0xb8fa1605, 0x807a817a, + 0x8e7a867a, 0x80707a70, + 0xbef60084, 0xbef600ff, + 0x01000000, 0xbefe007c, + 0xbefc0070, 0xc0611c7a, + 0x0000007c, 0xbf8cc07f, + 0x80708470, 0xbefc007e, + 0xbefe007c, 0xbefc0070, + 0xc0611b3a, 0x0000007c, + 0xbf8cc07f, 0x80708470, + 0xbefc007e, 0xbefe007c, + 0xbefc0070, 0xc0611b7a, + 0x0000007c, 0xbf8cc07f, + 0x80708470, 0xbefc007e, + 0xbefe007c, 0xbefc0070, + 0xc0611bba, 0x0000007c, + 0xbf8cc07f, 0x80708470, + 0xbefc007e, 0xbefe007c, + 0xbefc0070, 0xc0611bfa, + 0x0000007c, 0xbf8cc07f, + 0x80708470, 0xbefc007e, + 0xbefe007c, 0xbefc0070, + 0xc0611e3a, 0x0000007c, + 0xbf8cc07f, 0x80708470, + 0xbefc007e, 0xb8fbf803, + 0xbefe007c, 0xbefc0070, + 0xc0611efa, 0x0000007c, + 0xbf8cc07f, 0x80708470, + 0xbefc007e, 0xbefe007c, + 0xbefc0070, 0xc0611a3a, + 0x0000007c, 0xbf8cc07f, + 0x80708470, 0xbefc007e, + 0xbefe007c, 0xbefc0070, + 0xc0611a7a, 0x0000007c, + 0xbf8cc07f, 0x80708470, + 0xbefc007e, 0xb8f1f801, + 0xbefe007c, 0xbefc0070, + 0xc0611c7a, 0x0000007c, + 0xbf8cc07f, 0x80708470, + 0xbefc007e, 0xbf108080, + 0x867aff7f, 0x04000000, + 0xbeef0080, 0x876f6f7a, + 0xb8f02985, 0x80708170, + 0x8e708a70, 0x8e708170, + 0xb8fb1605, 0x807b817b, + 0x8e7b847b, 0x8e76827b, + 0xbef600ff, 0x01000000, + 0xbef20174, 0x80747074, + 0x82758075, 0xbefc0080, + 0xbf800000, 0xbe802b00, + 0xbe822b02, 0xbe842b04, + 0xbe862b06, 0xbe882b08, + 0xbe8a2b0a, 0xbe8c2b0c, + 0xbe8e2b0e, 0xc06b003a, + 0x00000000, 0xbf8cc07f, + 0xc06b013a, 0x00000010, + 0xbf8cc07f, 0xc06b023a, + 0x00000020, 0xbf8cc07f, + 0xc06b033a, 0x00000030, + 0xbf8cc07f, 0x8074c074, + 0x82758075, 0x807c907c, + 0xbf0a7b7c, 0xbf85ffe7, + 0xbef40172, 0xbef00080, + 0xbefe00c1, 0xbeff00c1, + 0xbee80080, 0xbee90080, + 0xbef600ff, 0x01000000, + 0x867aff78, 0x00400000, + 0xbf850003, 0xb8faf803, + 0x897a7aff, 0x10000000, + 0xbf85004d, 0xbe840080, + 0xd2890000, 0x00000900, + 0x80048104, 0xd2890001, + 0x00000900, 0x80048104, + 0xd2890002, 0x00000900, + 0x80048104, 0xd2890003, + 0x00000900, 0x80048104, + 0xc069003a, 0x00000070, + 0xbf8cc07f, 0x80709070, + 0xbf06c004, 0xbf84ffee, + 0xbe840080, 0xd2890000, + 0x00000901, 0x80048104, + 0xd2890001, 0x00000901, + 0x80048104, 0xd2890002, + 0x00000901, 0x80048104, + 0xd2890003, 0x00000901, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0xbe840080, + 0xd2890000, 0x00000902, + 0x80048104, 0xd2890001, + 0x00000902, 0x80048104, + 0xd2890002, 0x00000902, + 0x80048104, 0xd2890003, + 0x00000902, 0x80048104, + 0xc069003a, 0x00000070, + 0xbf8cc07f, 0x80709070, + 0xbf06c004, 0xbf84ffee, + 0xbe840080, 0xd2890000, + 0x00000903, 0x80048104, + 0xd2890001, 0x00000903, + 0x80048104, 0xd2890002, + 0x00000903, 0x80048104, + 0xd2890003, 0x00000903, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0xbf820008, + 0xe0724000, 0x701d0000, + 0xe0724100, 0x701d0100, + 0xe0724200, 0x701d0200, + 0xe0724300, 0x701d0300, + 0xbefe00c1, 0xbeff00c1, + 0xb8fb5306, 0x867bc17b, + 0xbf840052, 0xbf8a0000, + 0x867aff6f, 0x04000000, + 0xbf84004e, 0x8e7b867b, + 0x8e7b827b, 0xbef6007b, + 0xb8f02985, 0x80708170, + 0x8e708a70, 0x8e708170, + 0xb8fa1605, 0x807a817a, + 0x8e7a867a, 0x80707a70, + 0x8070ff70, 0x00000080, + 0xbef600ff, 0x01000000, + 0xbefc0080, 0xd28c0002, + 0x000100c1, 0xd28d0003, + 0x000204c1, 0x867aff78, + 0x00400000, 0xbf850003, + 0xb8faf803, 0x897a7aff, + 0x10000000, 0xbf85001d, + 0x24040682, 0xd86c0000, + 0x00000002, 0xbf8cc07f, + 0xbe840080, 0xd2890000, + 0x00000900, 0x80048104, + 0xd2890001, 0x00000900, + 0x80048104, 0xd2890002, + 0x00000900, 0x80048104, + 0xd2890003, 0x00000900, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0x680404ff, + 0x00000100, 0xd0c9006a, + 0x0000f702, 0xbf87ffe5, + 0xbf820016, 0xd1060002, + 0x00011103, 0x7e0602ff, + 0x00000200, 0xbefc00ff, + 0x00010000, 0xbe800077, + 0x8677ff77, 0xff7fffff, + 0x8777ff77, 0x00058000, + 0xd8ec0000, 0x00000002, + 0xbf8cc07f, 0xe0765000, + 0x701d0002, 0x68040702, + 0xd0c9006a, 0x0000f702, + 0xbefe016a, 0xbf87fff6, + 0xbef70000, 0xbef000ff, + 0x00000400, 0xbefe00c1, + 0xbeff00c1, 0xb8fb2b05, + 0x807b817b, 0x8e7b827b, + 0xbef600ff, 0x01000000, + 0xbefc0084, 0xbf0a7b7c, + 0xbf84006d, 0xbf11017c, 0x807bff7b, 0x00001000, - 0xbefc0080, 0xbf11017c, 0x867aff78, 0x00400000, 0xbf850003, 0xb8faf803, 0x897a7aff, 0x10000000, - 0xbf850059, 0xd3d84000, - 0x18000100, 0xd3d84001, - 0x18000101, 0xd3d84002, - 0x18000102, 0xd3d84003, - 0x18000103, 0xbe840080, + 0xbf850051, 0xbe840080, 0xd2890000, 0x00000900, 0x80048104, 0xd2890001, 0x00000900, 0x80048104, @@ -3304,137 +4453,204 @@ static const uint32_t cwsr_trap_gfx9_4_3_hex[] = { 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, 0xbf84ffee, 0x807c847c, - 0xbf0a7b7c, 0xbf85ffa9, - 0xbf9c0000, 0xbf820016, - 0xd3d84000, 0x18000100, - 0xd3d84001, 0x18000101, - 0xd3d84002, 0x18000102, - 0xd3d84003, 0x18000103, + 0xbf0a7b7c, 0xbf85ffb1, + 0xbf9c0000, 0xbf820012, + 0x7e000300, 0x7e020301, + 0x7e040302, 0x7e060303, 0xe0724000, 0x701d0000, 0xe0724100, 0x701d0100, 0xe0724200, 0x701d0200, 0xe0724300, 0x701d0300, 0x807c847c, 0x8070ff70, 0x00000400, 0xbf0a7b7c, - 0xbf85ffeb, 0xbf9c0000, - 0xbf8200ee, 0xbef4007e, - 0x8675ff7f, 0x0000ffff, - 0x8775ff75, 0x00040000, - 0xbef60080, 0xbef700ff, - 0x00807fac, 0x866eff7f, - 0x04000000, 0xbf84001f, + 0xbf85ffef, 0xbf9c0000, + 0xb8fb2985, 0x807b817b, + 0x8e7b837b, 0xb8fa2b05, + 0x807a817a, 0x8e7a827a, + 0x80fb7a7b, 0x867b7b7b, + 0xbf84007a, 0x807bff7b, + 0x00001000, 0xbefc0080, + 0xbf11017c, 0x867aff78, + 0x00400000, 0xbf850003, + 0xb8faf803, 0x897a7aff, + 0x10000000, 0xbf850059, + 0xd3d84000, 0x18000100, + 0xd3d84001, 0x18000101, + 0xd3d84002, 0x18000102, + 0xd3d84003, 0x18000103, + 0xbe840080, 0xd2890000, + 0x00000900, 0x80048104, + 0xd2890001, 0x00000900, + 0x80048104, 0xd2890002, + 0x00000900, 0x80048104, + 0xd2890003, 0x00000900, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0xbe840080, + 0xd2890000, 0x00000901, + 0x80048104, 0xd2890001, + 0x00000901, 0x80048104, + 0xd2890002, 0x00000901, + 0x80048104, 0xd2890003, + 0x00000901, 0x80048104, + 0xc069003a, 0x00000070, + 0xbf8cc07f, 0x80709070, + 0xbf06c004, 0xbf84ffee, + 0xbe840080, 0xd2890000, + 0x00000902, 0x80048104, + 0xd2890001, 0x00000902, + 0x80048104, 0xd2890002, + 0x00000902, 0x80048104, + 0xd2890003, 0x00000902, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0xbe840080, + 0xd2890000, 0x00000903, + 0x80048104, 0xd2890001, + 0x00000903, 0x80048104, + 0xd2890002, 0x00000903, + 0x80048104, 0xd2890003, + 0x00000903, 0x80048104, + 0xc069003a, 0x00000070, + 0xbf8cc07f, 0x80709070, + 0xbf06c004, 0xbf84ffee, + 0x807c847c, 0xbf0a7b7c, + 0xbf85ffa9, 0xbf9c0000, + 0xbf820016, 0xd3d84000, + 0x18000100, 0xd3d84001, + 0x18000101, 0xd3d84002, + 0x18000102, 0xd3d84003, + 0x18000103, 0xe0724000, + 0x701d0000, 0xe0724100, + 0x701d0100, 0xe0724200, + 0x701d0200, 0xe0724300, + 0x701d0300, 0x807c847c, + 0x8070ff70, 0x00000400, + 0xbf0a7b7c, 0xbf85ffeb, + 0xbf9c0000, 0xbf8200f4, + 0xbef4007e, 0x8675ff7f, + 0x0000ffff, 0x8775ff75, + 0x00040000, 0xbef60080, + 0xbef700ff, 0x00807fac, + 0x866eff7f, 0x04000000, + 0xbf840025, 0xbefe00c1, + 0xbeff00c1, 0xb8ef5306, + 0x866fc16f, 0xbf840020, + 0x8e6f866f, 0x8e6f826f, + 0xbef6006f, 0xb8f82985, + 0x80788178, 0x8e788a78, + 0x8e788178, 0xb8ee1605, + 0x806e816e, 0x8e6e866e, + 0x80786e78, 0x8078ff78, + 0x00000080, 0xbef600ff, + 0x01000000, 0xbefc0080, + 0xe0510000, 0x781d0000, + 0xe0510100, 0x781d0000, + 0xe0510200, 0x781d0000, + 0xe0510300, 0x781d0000, + 0xe0510400, 0x781d0000, + 0x807cff7c, 0x00000500, + 0x8078ff78, 0x00000500, + 0xbf0a6f7c, 0xbf85fff0, 0xbefe00c1, 0xbeff00c1, - 0xb8ef4306, 0x866fc16f, - 0xbf84001a, 0x8e6f866f, - 0x8e6f826f, 0xbef6006f, - 0xb8f82985, 0x80788178, - 0x8e788a78, 0x8e788178, - 0xb8ee1605, 0x806e816e, - 0x8e6e866e, 0x80786e78, - 0x8078ff78, 0x00000080, 0xbef600ff, 0x01000000, - 0xbefc0080, 0xe0510000, - 0x781d0000, 0xe0510100, - 0x781d0000, 0x807cff7c, - 0x00000200, 0x8078ff78, - 0x00000200, 0xbf0a6f7c, - 0xbf85fff6, 0xbefe00c1, - 0xbeff00c1, 0xbef600ff, - 0x01000000, 0xb8ef2b05, - 0x806f816f, 0x8e6f826f, - 0x806fff6f, 0x00008000, - 0xbef80080, 0xbeee0078, - 0x8078ff78, 0x00000400, - 0xbefc0084, 0xbf11087c, - 0xe0524000, 0x781d0000, - 0xe0524100, 0x781d0100, - 0xe0524200, 0x781d0200, - 0xe0524300, 0x781d0300, - 0xbf8c0f70, 0x7e000300, - 0x7e020301, 0x7e040302, - 0x7e060303, 0x807c847c, - 0x8078ff78, 0x00000400, - 0xbf0a6f7c, 0xbf85ffee, - 0xb8ef2985, 0x806f816f, - 0x8e6f836f, 0xb8f92b05, - 0x80798179, 0x8e798279, - 0x80ef796f, 0x866f6f6f, - 0xbf84001a, 0x806fff6f, - 0x00008000, 0xbefc0080, + 0xb8ef2b05, 0x806f816f, + 0x8e6f826f, 0x806fff6f, + 0x00008000, 0xbef80080, + 0xbeee0078, 0x8078ff78, + 0x00000400, 0xbefc0084, 0xbf11087c, 0xe0524000, 0x781d0000, 0xe0524100, 0x781d0100, 0xe0524200, 0x781d0200, 0xe0524300, 0x781d0300, 0xbf8c0f70, - 0xd3d94000, 0x18000100, - 0xd3d94001, 0x18000101, - 0xd3d94002, 0x18000102, - 0xd3d94003, 0x18000103, + 0x7e000300, 0x7e020301, + 0x7e040302, 0x7e060303, 0x807c847c, 0x8078ff78, 0x00000400, 0xbf0a6f7c, - 0xbf85ffea, 0xbf9c0000, - 0xe0524000, 0x6e1d0000, - 0xe0524100, 0x6e1d0100, - 0xe0524200, 0x6e1d0200, - 0xe0524300, 0x6e1d0300, - 0xbf8c0f70, 0xb8f82985, - 0x80788178, 0x8e788a78, - 0x8e788178, 0xb8ee1605, - 0x806e816e, 0x8e6e866e, - 0x80786e78, 0x80f8c078, - 0xb8ef1605, 0x806f816f, - 0x8e6f846f, 0x8e76826f, - 0xbef600ff, 0x01000000, - 0xbefc006f, 0xc031003a, - 0x00000078, 0x80f8c078, - 0xbf8cc07f, 0x80fc907c, - 0xbf800000, 0xbe802d00, - 0xbe822d02, 0xbe842d04, - 0xbe862d06, 0xbe882d08, - 0xbe8a2d0a, 0xbe8c2d0c, - 0xbe8e2d0e, 0xbf06807c, - 0xbf84fff0, 0xb8f82985, - 0x80788178, 0x8e788a78, - 0x8e788178, 0xb8ee1605, - 0x806e816e, 0x8e6e866e, - 0x80786e78, 0xbef60084, - 0xbef600ff, 0x01000000, - 0xc0211bfa, 0x00000078, - 0x80788478, 0xc0211b3a, + 0xbf85ffee, 0xb8ef2985, + 0x806f816f, 0x8e6f836f, + 0xb8f92b05, 0x80798179, + 0x8e798279, 0x80ef796f, + 0x866f6f6f, 0xbf84001a, + 0x806fff6f, 0x00008000, + 0xbefc0080, 0xbf11087c, + 0xe0524000, 0x781d0000, + 0xe0524100, 0x781d0100, + 0xe0524200, 0x781d0200, + 0xe0524300, 0x781d0300, + 0xbf8c0f70, 0xd3d94000, + 0x18000100, 0xd3d94001, + 0x18000101, 0xd3d94002, + 0x18000102, 0xd3d94003, + 0x18000103, 0x807c847c, + 0x8078ff78, 0x00000400, + 0xbf0a6f7c, 0xbf85ffea, + 0xbf9c0000, 0xe0524000, + 0x6e1d0000, 0xe0524100, + 0x6e1d0100, 0xe0524200, + 0x6e1d0200, 0xe0524300, + 0x6e1d0300, 0xbf8c0f70, + 0xb8f82985, 0x80788178, + 0x8e788a78, 0x8e788178, + 0xb8ee1605, 0x806e816e, + 0x8e6e866e, 0x80786e78, + 0x80f8c078, 0xb8ef1605, + 0x806f816f, 0x8e6f846f, + 0x8e76826f, 0xbef600ff, + 0x01000000, 0xbefc006f, + 0xc031003a, 0x00000078, + 0x80f8c078, 0xbf8cc07f, + 0x80fc907c, 0xbf800000, + 0xbe802d00, 0xbe822d02, + 0xbe842d04, 0xbe862d06, + 0xbe882d08, 0xbe8a2d0a, + 0xbe8c2d0c, 0xbe8e2d0e, + 0xbf06807c, 0xbf84fff0, + 0xb8f82985, 0x80788178, + 0x8e788a78, 0x8e788178, + 0xb8ee1605, 0x806e816e, + 0x8e6e866e, 0x80786e78, + 0xbef60084, 0xbef600ff, + 0x01000000, 0xc0211bfa, 0x00000078, 0x80788478, - 0xc0211b7a, 0x00000078, - 0x80788478, 0xc0211c3a, + 0xc0211b3a, 0x00000078, + 0x80788478, 0xc0211b7a, 0x00000078, 0x80788478, - 0xc0211c7a, 0x00000078, - 0x80788478, 0xc0211eba, + 0xc0211c3a, 0x00000078, + 0x80788478, 0xc0211c7a, 0x00000078, 0x80788478, - 0xc0211efa, 0x00000078, - 0x80788478, 0xc0211a3a, + 0xc0211eba, 0x00000078, + 0x80788478, 0xc0211efa, 0x00000078, 0x80788478, - 0xc0211a7a, 0x00000078, - 0x80788478, 0xc0211cfa, + 0xc0211a3a, 0x00000078, + 0x80788478, 0xc0211a7a, 0x00000078, 0x80788478, - 0xbf8cc07f, 0xbefc006f, - 0xbefe0070, 0xbeff0071, - 0x866f7bff, 0x000003ff, - 0xb96f4803, 0x866f7bff, - 0xfffff800, 0x8f6f8b6f, - 0xb96fa2c3, 0xb973f801, - 0xb8ee2985, 0x806e816e, - 0x8e6e8a6e, 0x8e6e816e, - 0xb8ef1605, 0x806f816f, - 0x8e6f866f, 0x806e6f6e, - 0x806e746e, 0x826f8075, - 0x866fff6f, 0x0000ffff, - 0xc00b1c37, 0x00000050, - 0xc00b1d37, 0x00000060, - 0xc0031e77, 0x00000074, - 0xbf8cc07f, 0x8f6e8b79, - 0x866eff6e, 0x001f8000, - 0xb96ef807, 0x866dff6d, - 0x0000ffff, 0x86fe7e7e, - 0x86ea6a6a, 0x8f6e837a, - 0xb96ee0c2, 0xbf800002, - 0xb97a0002, 0xbf8a0000, - 0xbe801f6c, 0xbf9b0000, + 0xc0211cfa, 0x00000078, + 0x80788478, 0xbf8cc07f, + 0xbefc006f, 0xbefe0070, + 0xbeff0071, 0x866f7bff, + 0x000003ff, 0xb96f4803, + 0x866f7bff, 0xfffff800, + 0x8f6f8b6f, 0xb96fa2c3, + 0xb973f801, 0xb8ee2985, + 0x806e816e, 0x8e6e8a6e, + 0x8e6e816e, 0xb8ef1605, + 0x806f816f, 0x8e6f866f, + 0x806e6f6e, 0x806e746e, + 0x826f8075, 0x866fff6f, + 0x0000ffff, 0xc00b1c37, + 0x00000050, 0xc00b1d37, + 0x00000060, 0xc0031e77, + 0x00000074, 0xbf8cc07f, + 0x8f6e8b79, 0x866eff6e, + 0x001f8000, 0xb96ef807, + 0x866dff6d, 0x0000ffff, + 0x86fe7e7e, 0x86ea6a6a, + 0x8f6e837a, 0xb96ee0c2, + 0xbf800002, 0xb97a0002, + 0xbf8a0000, 0xbe801f6c, + 0xbf9b0000, 0x00000000, }; diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm index 71b3dc0c7363..96fbb16ceb21 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm @@ -33,6 +33,7 @@ * gfx11: * cpp -DASIC_FAMILY=CHIP_PLUM_BONITO cwsr_trap_handler_gfx10.asm -P -o gfx11.sp3 * sp3 gfx11.sp3 -hex gfx11.hex + * */ #define CHIP_NAVI10 26 @@ -43,22 +44,33 @@ #define HAVE_XNACK (ASIC_FAMILY < CHIP_SIENNA_CICHLID) #define HAVE_SENDMSG_RTN (ASIC_FAMILY >= CHIP_PLUM_BONITO) #define HAVE_BUFFER_LDS_LOAD (ASIC_FAMILY < CHIP_PLUM_BONITO) -#define SW_SA_TRAP (ASIC_FAMILY >= CHIP_PLUM_BONITO) +#define SW_SA_TRAP (ASIC_FAMILY == CHIP_PLUM_BONITO) +#define SAVE_AFTER_XNACK_ERROR (HAVE_XNACK && !NO_SQC_STORE) // workaround for TCP store failure after XNACK error when ALLOW_REPLAY=0, for debugger +#define SINGLE_STEP_MISSED_WORKAROUND 1 //workaround for lost MODE.DEBUG_EN exception when SAVECTX raised -var SINGLE_STEP_MISSED_WORKAROUND = 1 //workaround for lost MODE.DEBUG_EN exception when SAVECTX raised +#define S_COHERENCE glc:1 +#define V_COHERENCE slc:1 glc:1 +#define S_WAITCNT_0 s_waitcnt 0 var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006 var SQ_WAVE_STATUS_HALT_MASK = 0x2000 var SQ_WAVE_STATUS_ECC_ERR_MASK = 0x20000 var SQ_WAVE_STATUS_TRAP_EN_SHIFT = 6 +var SQ_WAVE_IB_STS2_WAVE64_SHIFT = 11 +var SQ_WAVE_IB_STS2_WAVE64_SIZE = 1 +var SQ_WAVE_LDS_ALLOC_GRANULARITY = 8 +var S_STATUS_HWREG = HW_REG_STATUS +var S_STATUS_ALWAYS_CLEAR_MASK = SQ_WAVE_STATUS_SPI_PRIO_MASK|SQ_WAVE_STATUS_ECC_ERR_MASK +var S_STATUS_HALT_MASK = SQ_WAVE_STATUS_HALT_MASK +var S_SAVE_PC_HI_TRAP_ID_MASK = 0x00FF0000 +var S_SAVE_PC_HI_HT_MASK = 0x01000000 +var SQ_WAVE_STATUS_NO_VGPRS_SHIFT = 24 var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9 var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 8 var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT = 24 var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE = 4 -var SQ_WAVE_IB_STS2_WAVE64_SHIFT = 11 -var SQ_WAVE_IB_STS2_WAVE64_SIZE = 1 #if ASIC_FAMILY < CHIP_PLUM_BONITO var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8 @@ -73,14 +85,17 @@ var SQ_WAVE_TRAPSTS_ADDR_WATCH_MASK = 0x80 var SQ_WAVE_TRAPSTS_ADDR_WATCH_SHIFT = 7 var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100 var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8 -var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF -var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0 -var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10 -var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800 -var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11 -var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21 var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK = 0x800 +var SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT = 11 var SQ_WAVE_TRAPSTS_EXCP_HI_MASK = 0x7000 +#if ASIC_FAMILY >= CHIP_PLUM_BONITO +var SQ_WAVE_TRAPSTS_HOST_TRAP_SHIFT = 16 +var SQ_WAVE_TRAPSTS_WAVE_START_MASK = 0x20000 +var SQ_WAVE_TRAPSTS_WAVE_START_SHIFT = 17 +var SQ_WAVE_TRAPSTS_WAVE_END_MASK = 0x40000 +var SQ_WAVE_TRAPSTS_TRAP_AFTER_INST_MASK = 0x100000 +#endif +var SQ_WAVE_TRAPSTS_XNACK_ERROR_MASK = 0x10000000 var SQ_WAVE_MODE_EXCP_EN_SHIFT = 12 var SQ_WAVE_MODE_EXCP_EN_ADDR_WATCH_SHIFT = 19 @@ -92,6 +107,28 @@ var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK = 0x003F8000 var SQ_WAVE_MODE_DEBUG_EN_MASK = 0x800 +var S_TRAPSTS_RESTORE_PART_1_SIZE = SQ_WAVE_TRAPSTS_SAVECTX_SHIFT +var S_TRAPSTS_RESTORE_PART_2_SHIFT = SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT + +#if ASIC_FAMILY < CHIP_PLUM_BONITO +var S_TRAPSTS_NON_MASKABLE_EXCP_MASK = SQ_WAVE_TRAPSTS_MEM_VIOL_MASK|SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK +var S_TRAPSTS_RESTORE_PART_2_SIZE = 32 - S_TRAPSTS_RESTORE_PART_2_SHIFT +var S_TRAPSTS_RESTORE_PART_3_SHIFT = 0 +var S_TRAPSTS_RESTORE_PART_3_SIZE = 0 +#else +var S_TRAPSTS_NON_MASKABLE_EXCP_MASK = SQ_WAVE_TRAPSTS_MEM_VIOL_MASK |\ + SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK |\ + SQ_WAVE_TRAPSTS_WAVE_START_MASK |\ + SQ_WAVE_TRAPSTS_WAVE_END_MASK |\ + SQ_WAVE_TRAPSTS_TRAP_AFTER_INST_MASK +var S_TRAPSTS_RESTORE_PART_2_SIZE = SQ_WAVE_TRAPSTS_HOST_TRAP_SHIFT - SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT +var S_TRAPSTS_RESTORE_PART_3_SHIFT = SQ_WAVE_TRAPSTS_WAVE_START_SHIFT +var S_TRAPSTS_RESTORE_PART_3_SIZE = 32 - S_TRAPSTS_RESTORE_PART_3_SHIFT +#endif +var S_TRAPSTS_HWREG = HW_REG_TRAPSTS +var S_TRAPSTS_SAVE_CONTEXT_MASK = SQ_WAVE_TRAPSTS_SAVECTX_MASK +var S_TRAPSTS_SAVE_CONTEXT_SHIFT = SQ_WAVE_TRAPSTS_SAVECTX_SHIFT + // bits [31:24] unused by SPI debug data var TTMP11_SAVE_REPLAY_W64H_SHIFT = 31 var TTMP11_SAVE_REPLAY_W64H_MASK = 0x80000000 @@ -104,8 +141,6 @@ var TTMP11_DEBUG_TRAP_ENABLED_MASK = 0x800000 // when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 var S_SAVE_BUF_RSRC_WORD3_MISC = 0x10807FAC -var S_SAVE_PC_HI_TRAP_ID_MASK = 0x00FF0000 -var S_SAVE_PC_HI_HT_MASK = 0x01000000 var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26 @@ -165,6 +200,7 @@ var s_restore_buf_rsrc3 = ttmp11 var s_restore_size = ttmp6 var s_restore_ttmps_lo = s_restore_tmp var s_restore_ttmps_hi = s_restore_alloc_size +var s_restore_spi_init_hi_save = s_restore_exec_hi shader main asic(DEFAULT) @@ -177,13 +213,13 @@ L_JUMP_TO_RESTORE: s_branch L_RESTORE L_SKIP_RESTORE: - s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC + s_getreg_b32 s_save_status, hwreg(S_STATUS_HWREG) //save STATUS since we will change SCC // Clear SPI_PRIO: do not save with elevated priority. // Clear ECC_ERR: prevents SQC store and triggers FATAL_HALT if setreg'd. - s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK|SQ_WAVE_STATUS_ECC_ERR_MASK + s_andn2_b32 s_save_status, s_save_status, S_STATUS_ALWAYS_CLEAR_MASK - s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + s_getreg_b32 s_save_trapsts, hwreg(S_TRAPSTS_HWREG) #if SW_SA_TRAP // If ttmp1[30] is set then issue s_barrier to unblock dependent waves. @@ -198,7 +234,7 @@ L_TRAP_NO_BARRIER: s_cbranch_scc1 L_CHECK_SAVE #endif - s_and_b32 ttmp2, s_save_status, SQ_WAVE_STATUS_HALT_MASK + s_and_b32 ttmp2, s_save_status, S_STATUS_HALT_MASK s_cbranch_scc0 L_NOT_HALTED L_HALTED: @@ -207,14 +243,14 @@ L_HALTED: s_cbranch_scc1 L_FETCH_2ND_TRAP L_CHECK_SAVE: - s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK + s_and_b32 ttmp2, s_save_trapsts, S_TRAPSTS_SAVE_CONTEXT_MASK s_cbranch_scc1 L_SAVE // Wave is halted but neither host trap nor SAVECTX is raised. // Caused by instruction fetch memory violation. // Spin wait until context saved to prevent interrupt storm. s_sleep 0x10 - s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + s_getreg_b32 s_save_trapsts, hwreg(S_TRAPSTS_HWREG) s_branch L_CHECK_SAVE L_NOT_HALTED: @@ -224,7 +260,7 @@ L_NOT_HALTED: // Check non-maskable exceptions. memory_violation, illegal_instruction // and xnack_error exceptions always cause the wave to enter the trap // handler. - s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK|SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK + s_and_b32 ttmp2, s_save_trapsts, S_TRAPSTS_NON_MASKABLE_EXCP_MASK s_cbranch_scc1 L_FETCH_2ND_TRAP // Check for maskable exceptions in trapsts.excp and trapsts.excp_hi. @@ -248,15 +284,15 @@ L_CHECK_TRAP_ID: s_and_b32 ttmp2, s_save_pc_hi, S_SAVE_PC_HI_TRAP_ID_MASK s_cbranch_scc1 L_FETCH_2ND_TRAP -if SINGLE_STEP_MISSED_WORKAROUND +#if SINGLE_STEP_MISSED_WORKAROUND // Prioritize single step exception over context save. // Second-level trap will halt wave and RFE, re-entering for SAVECTX. s_getreg_b32 ttmp2, hwreg(HW_REG_MODE) s_and_b32 ttmp2, ttmp2, SQ_WAVE_MODE_DEBUG_EN_MASK s_cbranch_scc1 L_FETCH_2ND_TRAP -end +#endif - s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK + s_and_b32 ttmp2, s_save_trapsts, S_TRAPSTS_SAVE_CONTEXT_MASK s_cbranch_scc1 L_SAVE L_FETCH_2ND_TRAP: @@ -269,7 +305,7 @@ L_FETCH_2ND_TRAP: // ttmp12 holds SQ_WAVE_STATUS #if HAVE_SENDMSG_RTN s_sendmsg_rtn_b64 [ttmp14, ttmp15], sendmsg(MSG_RTN_GET_TMA) - s_waitcnt lgkmcnt(0) + S_WAITCNT_0 #else s_getreg_b32 ttmp14, hwreg(HW_REG_SHADER_TMA_LO) s_getreg_b32 ttmp15, hwreg(HW_REG_SHADER_TMA_HI) @@ -281,16 +317,16 @@ L_FETCH_2ND_TRAP: s_or_b32 ttmp15, ttmp15, 0xFFFF0000 L_NO_SIGN_EXTEND_TMA: - s_load_dword ttmp2, [ttmp14, ttmp15], 0x10 glc:1 // debug trap enabled flag - s_waitcnt lgkmcnt(0) + s_load_dword ttmp2, [ttmp14, ttmp15], 0x10 S_COHERENCE // debug trap enabled flag + S_WAITCNT_0 s_lshl_b32 ttmp2, ttmp2, TTMP11_DEBUG_TRAP_ENABLED_SHIFT s_andn2_b32 ttmp11, ttmp11, TTMP11_DEBUG_TRAP_ENABLED_MASK s_or_b32 ttmp11, ttmp11, ttmp2 - s_load_dwordx2 [ttmp2, ttmp3], [ttmp14, ttmp15], 0x0 glc:1 // second-level TBA - s_waitcnt lgkmcnt(0) - s_load_dwordx2 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 glc:1 // second-level TMA - s_waitcnt lgkmcnt(0) + s_load_dwordx2 [ttmp2, ttmp3], [ttmp14, ttmp15], 0x0 S_COHERENCE // second-level TBA + S_WAITCNT_0 + s_load_dwordx2 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 S_COHERENCE // second-level TMA + S_WAITCNT_0 s_and_b64 [ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3] s_cbranch_scc0 L_NO_NEXT_TRAP // second-level trap handler not been set @@ -298,9 +334,13 @@ L_NO_SIGN_EXTEND_TMA: L_NO_NEXT_TRAP: // If not caused by trap then halt wave to prevent re-entry. - s_and_b32 ttmp2, s_save_pc_hi, (S_SAVE_PC_HI_TRAP_ID_MASK|S_SAVE_PC_HI_HT_MASK) + s_and_b32 ttmp2, s_save_pc_hi, S_SAVE_PC_HI_TRAP_ID_MASK s_cbranch_scc1 L_TRAP_CASE - s_or_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_HALT_MASK + + // Host trap will not cause trap re-entry. + s_and_b32 ttmp2, s_save_pc_hi, S_SAVE_PC_HI_HT_MASK + s_cbranch_scc1 L_EXIT_TRAP + s_or_b32 s_save_status, s_save_status, S_STATUS_HALT_MASK // If the PC points to S_ENDPGM then context save will fail if STATUS.HALT is set. // Rewind the PC to prevent this from occurring. @@ -310,10 +350,6 @@ L_NO_NEXT_TRAP: s_branch L_EXIT_TRAP L_TRAP_CASE: - // Host trap will not cause trap re-entry. - s_and_b32 ttmp2, s_save_pc_hi, S_SAVE_PC_HI_HT_MASK - s_cbranch_scc1 L_EXIT_TRAP - // Advance past trap instruction to prevent re-entry. s_add_u32 ttmp0, ttmp0, 0x4 s_addc_u32 ttmp1, ttmp1, 0x0 @@ -328,14 +364,22 @@ L_EXIT_TRAP: // Restore SQ_WAVE_STATUS. s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 - s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status + s_setreg_b32 hwreg(S_STATUS_HWREG), s_save_status s_rfe_b64 [ttmp0, ttmp1] L_SAVE: + // If VGPRs have been deallocated then terminate the wavefront. + // It has no remaining program to run and cannot save without VGPRs. +#if ASIC_FAMILY == CHIP_PLUM_BONITO + s_bitcmp1_b32 s_save_status, SQ_WAVE_STATUS_NO_VGPRS_SHIFT + s_cbranch_scc0 L_HAVE_VGPRS + s_endpgm +L_HAVE_VGPRS: +#endif s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] s_mov_b32 s_save_tmp, 0 - s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit + s_setreg_b32 hwreg(S_TRAPSTS_HWREG, S_TRAPSTS_SAVE_CONTEXT_SHIFT, 1), s_save_tmp //clear saveCtx bit #if HAVE_XNACK save_and_clear_ib_sts(s_save_tmp, s_save_trapsts) @@ -360,7 +404,7 @@ L_SLEEP: s_sleep 0x2 s_cbranch_execz L_SLEEP #else - s_waitcnt lgkmcnt(0) + S_WAITCNT_0 #endif // Save first_wave flag so we can clear high bits of save address. @@ -382,7 +426,7 @@ L_SLEEP: s_and_b32 s_save_ttmps_hi, exec_hi, 0xFFFF s_mov_b32 exec_lo, 0xFFFFFFFF s_mov_b32 exec_hi, 0xFFFFFFFF - global_store_dword_addtid v0, [s_save_ttmps_lo, s_save_ttmps_hi] slc:1 glc:1 + global_store_dword_addtid v0, [s_save_ttmps_lo, s_save_ttmps_hi] V_COHERENCE v_mov_b32 v0, 0x0 s_mov_b32 exec_lo, s_save_ttmps_lo s_mov_b32 exec_hi, s_save_ttmps_hi @@ -390,7 +434,7 @@ L_SLEEP: // Save trap temporaries 4-11, 13 initialized by SPI debug dispatch logic // ttmp SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)+0x40 - get_wave_size(s_save_ttmps_hi) + get_wave_size2(s_save_ttmps_hi) get_vgpr_size_bytes(s_save_ttmps_lo, s_save_ttmps_hi) get_svgpr_size_bytes(s_save_ttmps_hi) s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_ttmps_hi @@ -414,15 +458,15 @@ L_SLEEP: s_mov_b32 exec_lo, 0x3FFF s_mov_b32 exec_hi, 0x0 - global_store_dword_addtid v0, [s_save_ttmps_lo, s_save_ttmps_hi] inst_offset:0x40 slc:1 glc:1 + global_store_dword_addtid v0, [s_save_ttmps_lo, s_save_ttmps_hi] inst_offset:0x40 V_COHERENCE v_readlane_b32 ttmp14, v0, 0xE v_readlane_b32 ttmp15, v0, 0xF s_mov_b32 exec_lo, ttmp14 s_mov_b32 exec_hi, ttmp15 #else - s_store_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x50 glc:1 - s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x60 glc:1 - s_store_dword ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x74 glc:1 + s_store_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x50 S_COHERENCE + s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x60 S_COHERENCE + s_store_dword ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x74 S_COHERENCE #endif /* setup Resource Contants */ @@ -436,7 +480,7 @@ L_SLEEP: /* global mem offset */ s_mov_b32 s_save_mem_offset, 0x0 - get_wave_size(s_wave_size) + get_wave_size2(s_wave_size) #if HAVE_XNACK // Save and clear vector XNACK state late to free up SGPRs. @@ -460,12 +504,22 @@ L_SAVE_4VGPR_WAVE32: // VGPR Allocated in 4-GPR granularity +#if SAVE_AFTER_XNACK_ERROR + check_if_tcp_store_ok() + s_cbranch_scc1 L_SAVE_FIRST_VGPRS32_WITH_TCP + + write_vgprs_to_mem_with_sqc_w32(v0, 4, s_save_buf_rsrc0, s_save_mem_offset) + s_branch L_SAVE_HWREG + +L_SAVE_FIRST_VGPRS32_WITH_TCP: +#endif + #if !NO_SQC_STORE - buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE #endif - buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128 - buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*2 - buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*3 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128*3 s_branch L_SAVE_HWREG L_SAVE_4VGPR_WAVE64: @@ -473,12 +527,22 @@ L_SAVE_4VGPR_WAVE64: // VGPR Allocated in 4-GPR granularity +#if SAVE_AFTER_XNACK_ERROR + check_if_tcp_store_ok() + s_cbranch_scc1 L_SAVE_FIRST_VGPRS64_WITH_TCP + + write_vgprs_to_mem_with_sqc_w64(v0, 4, s_save_buf_rsrc0, s_save_mem_offset) + s_branch L_SAVE_HWREG + +L_SAVE_FIRST_VGPRS64_WITH_TCP: +#endif + #if !NO_SQC_STORE - buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE #endif - buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 - buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 - buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256*3 /* save HW registers */ @@ -506,7 +570,7 @@ L_SAVE_HWREG: write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset) write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset) - s_getreg_b32 s_save_tmp, hwreg(HW_REG_TRAPSTS) + s_getreg_b32 s_save_tmp, hwreg(S_TRAPSTS_HWREG) write_hwreg_to_mem(s_save_tmp, s_save_buf_rsrc0, s_save_mem_offset) // Not used on Sienna_Cichlid but keep layout same for debugger. @@ -525,7 +589,7 @@ L_SAVE_HWREG: // Write HWREGs with 16 VGPR lanes. TTMPs occupy space after this. s_mov_b32 exec_lo, 0xFFFF s_mov_b32 exec_hi, 0x0 - buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE // Write SGPRs with 32 VGPR lanes. This works in wave32 and wave64 mode. s_mov_b32 exec_lo, 0xFFFFFFFF @@ -568,7 +632,7 @@ L_SAVE_SGPR_LOOP: s_cmp_eq_u32 ttmp13, 0x20 //have 32 VGPR lanes filled? s_cbranch_scc0 L_SAVE_SGPR_SKIP_TCP_STORE - buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE s_add_u32 s_save_mem_offset, s_save_mem_offset, 0x80 s_mov_b32 ttmp13, 0x0 v_mov_b32 v2, 0x0 @@ -589,7 +653,7 @@ L_SAVE_SGPR_SKIP_TCP_STORE: write_12sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) #if NO_SQC_STORE - buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE #else // restore s_save_buf_rsrc0,1 s_mov_b32 s_save_buf_rsrc0, s_save_xnack_mask @@ -619,8 +683,7 @@ L_SAVE_LDS_NORMAL: // first wave do LDS save; - s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw - s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, SQ_WAVE_LDS_ALLOC_GRANULARITY s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes // LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG) @@ -645,16 +708,36 @@ L_SAVE_LDS_NORMAL: s_cbranch_scc1 L_SAVE_LDS_W64 L_SAVE_LDS_W32: +#if SAVE_AFTER_XNACK_ERROR + check_if_tcp_store_ok() + s_cbranch_scc1 L_SAVE_LDS_WITH_TCP_W32 + +L_SAVE_LDS_LOOP_SQC_W32: + ds_read_b32 v1, v0 + S_WAITCNT_0 + + write_vgprs_to_mem_with_sqc_w32(v1, 1, s_save_buf_rsrc0, s_save_mem_offset) + + s_add_u32 m0, m0, 128 //every buffer_store_lds does 128 bytes + v_add_nc_u32 v0, v0, 128 //mem offset increased by 128 bytes + s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_LDS_LOOP_SQC_W32 //LDS save is complete? + + s_branch L_SAVE_LDS_DONE + +L_SAVE_LDS_WITH_TCP_W32: +#endif + s_mov_b32 s3, 128 s_nop 0 s_nop 0 s_nop 0 L_SAVE_LDS_LOOP_W32: ds_read_b32 v1, v0 - s_waitcnt 0 - buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + S_WAITCNT_0 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE - s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes + s_add_u32 m0, m0, s3 //every buffer_store_lds does 128 bytes s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 v_add_nc_u32 v0, v0, 128 //mem offset increased by 128 bytes s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0 @@ -663,14 +746,34 @@ L_SAVE_LDS_LOOP_W32: s_branch L_SAVE_LDS_DONE L_SAVE_LDS_W64: +#if SAVE_AFTER_XNACK_ERROR + check_if_tcp_store_ok() + s_cbranch_scc1 L_SAVE_LDS_WITH_TCP_W64 + +L_SAVE_LDS_LOOP_SQC_W64: + ds_read_b32 v1, v0 + S_WAITCNT_0 + + write_vgprs_to_mem_with_sqc_w64(v1, 1, s_save_buf_rsrc0, s_save_mem_offset) + + s_add_u32 m0, m0, 256 //every buffer_store_lds does 256 bytes + v_add_nc_u32 v0, v0, 256 //mem offset increased by 256 bytes + s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_LDS_LOOP_SQC_W64 //LDS save is complete? + + s_branch L_SAVE_LDS_DONE + +L_SAVE_LDS_WITH_TCP_W64: +#endif + s_mov_b32 s3, 256 s_nop 0 s_nop 0 s_nop 0 L_SAVE_LDS_LOOP_W64: ds_read_b32 v1, v0 - s_waitcnt 0 - buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + S_WAITCNT_0 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 @@ -712,16 +815,35 @@ L_SAVE_VGPR_NORMAL: s_cmp_lt_u32 m0, s_save_alloc_size s_cbranch_scc0 L_SAVE_VGPR_END +#if SAVE_AFTER_XNACK_ERROR + check_if_tcp_store_ok() + s_cbranch_scc1 L_SAVE_VGPR_W32_LOOP + +L_SAVE_VGPR_LOOP_SQC_W32: + v_movrels_b32 v0, v0 //v0 = v[0+m0] + v_movrels_b32 v1, v1 //v1 = v[1+m0] + v_movrels_b32 v2, v2 //v2 = v[2+m0] + v_movrels_b32 v3, v3 //v3 = v[3+m0] + + write_vgprs_to_mem_with_sqc_w32(v0, 4, s_save_buf_rsrc0, s_save_mem_offset) + + s_add_u32 m0, m0, 4 + s_cmp_lt_u32 m0, s_save_alloc_size + s_cbranch_scc1 L_SAVE_VGPR_LOOP_SQC_W32 + + s_branch L_SAVE_VGPR_END +#endif + L_SAVE_VGPR_W32_LOOP: v_movrels_b32 v0, v0 //v0 = v[0+m0] v_movrels_b32 v1, v1 //v1 = v[1+m0] v_movrels_b32 v2, v2 //v2 = v[2+m0] v_movrels_b32 v3, v3 //v3 = v[3+m0] - buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 - buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128 - buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*2 - buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*3 + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128*3 s_add_u32 m0, m0, 4 //next vgpr index s_add_u32 s_save_mem_offset, s_save_mem_offset, 128*4 //every buffer_store_dword does 128 bytes @@ -738,16 +860,35 @@ L_SAVE_VGPR_WAVE64: s_cmp_lt_u32 m0, s_save_alloc_size s_cbranch_scc0 L_SAVE_SHARED_VGPR +#if SAVE_AFTER_XNACK_ERROR + check_if_tcp_store_ok() + s_cbranch_scc1 L_SAVE_VGPR_W64_LOOP + +L_SAVE_VGPR_LOOP_SQC_W64: + v_movrels_b32 v0, v0 //v0 = v[0+m0] + v_movrels_b32 v1, v1 //v1 = v[1+m0] + v_movrels_b32 v2, v2 //v2 = v[2+m0] + v_movrels_b32 v3, v3 //v3 = v[3+m0] + + write_vgprs_to_mem_with_sqc_w64(v0, 4, s_save_buf_rsrc0, s_save_mem_offset) + + s_add_u32 m0, m0, 4 + s_cmp_lt_u32 m0, s_save_alloc_size + s_cbranch_scc1 L_SAVE_VGPR_LOOP_SQC_W64 + + s_branch L_SAVE_VGPR_END +#endif + L_SAVE_VGPR_W64_LOOP: v_movrels_b32 v0, v0 //v0 = v[0+m0] v_movrels_b32 v1, v1 //v1 = v[1+m0] v_movrels_b32 v2, v2 //v2 = v[2+m0] v_movrels_b32 v3, v3 //v3 = v[3+m0] - buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 - buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 - buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 - buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256*3 s_add_u32 m0, m0, 4 //next vgpr index s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes @@ -765,9 +906,26 @@ L_SAVE_SHARED_VGPR: s_add_u32 s_save_alloc_size, s_save_alloc_size, m0 s_mov_b32 exec_lo, 0xFFFFFFFF s_mov_b32 exec_hi, 0x00000000 + +#if SAVE_AFTER_XNACK_ERROR + check_if_tcp_store_ok() + s_cbranch_scc1 L_SAVE_SHARED_VGPR_WAVE64_LOOP + +L_SAVE_SHARED_VGPR_WAVE64_LOOP_SQC: + v_movrels_b32 v0, v0 + + write_vgprs_to_mem_with_sqc_w64(v0, 1, s_save_buf_rsrc0, s_save_mem_offset) + + s_add_u32 m0, m0, 1 + s_cmp_lt_u32 m0, s_save_alloc_size + s_cbranch_scc1 L_SAVE_SHARED_VGPR_WAVE64_LOOP_SQC + + s_branch L_SAVE_VGPR_END +#endif + L_SAVE_SHARED_VGPR_WAVE64_LOOP: v_movrels_b32 v0, v0 //v0 = v[0+m0] - buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE s_add_u32 m0, m0, 1 //next vgpr index s_add_u32 s_save_mem_offset, s_save_mem_offset, 128 s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 @@ -785,7 +943,7 @@ L_RESTORE: s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC //determine it is wave32 or wave64 - get_wave_size(s_restore_size) + get_wave_size2(s_restore_size) s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK s_cbranch_scc0 L_RESTORE_VGPR @@ -805,8 +963,7 @@ L_RESTORE_LDS_NORMAL: s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero? s_cbranch_scc0 L_RESTORE_VGPR //no lds used? jump to L_RESTORE_VGPR - s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw - s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, SQ_WAVE_LDS_ALLOC_GRANULARITY s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes // LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG) @@ -830,7 +987,7 @@ L_RESTORE_LDS_LOOP_W32: buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW #else buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset - s_waitcnt vmcnt(0) + S_WAITCNT_0 ds_store_addtid_b32 v0 #endif s_add_u32 m0, m0, 128 // 128 DW @@ -844,7 +1001,7 @@ L_RESTORE_LDS_LOOP_W64: buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW #else buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset - s_waitcnt vmcnt(0) + S_WAITCNT_0 ds_store_addtid_b32 v0 #endif s_add_u32 m0, m0, 256 // 256 DW @@ -885,11 +1042,11 @@ L_RESTORE_VGPR_NORMAL: s_cbranch_scc0 L_RESTORE_SGPR L_RESTORE_VGPR_WAVE32_LOOP: - buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 - buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128 - buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128*2 - buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128*3 - s_waitcnt vmcnt(0) + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:128 + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:128*2 + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:128*3 + S_WAITCNT_0 v_movreld_b32 v0, v0 //v[0+m0] = v0 v_movreld_b32 v1, v1 v_movreld_b32 v2, v2 @@ -900,11 +1057,11 @@ L_RESTORE_VGPR_WAVE32_LOOP: s_cbranch_scc1 L_RESTORE_VGPR_WAVE32_LOOP //VGPR restore (except v0) is complete? /* VGPR restore on v0 */ - buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 - buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128 - buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128*2 - buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128*3 - s_waitcnt vmcnt(0) + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:128 + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:128*2 + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:128*3 + S_WAITCNT_0 s_branch L_RESTORE_SGPR @@ -919,11 +1076,11 @@ L_RESTORE_VGPR_WAVE64: s_cbranch_scc0 L_RESTORE_SHARED_VGPR L_RESTORE_VGPR_WAVE64_LOOP: - buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 - buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256 - buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2 - buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3 - s_waitcnt vmcnt(0) + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:256 + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:256*2 + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:256*3 + S_WAITCNT_0 v_movreld_b32 v0, v0 //v[0+m0] = v0 v_movreld_b32 v1, v1 v_movreld_b32 v2, v2 @@ -945,8 +1102,8 @@ L_RESTORE_SHARED_VGPR: s_mov_b32 exec_lo, 0xFFFFFFFF s_mov_b32 exec_hi, 0x00000000 L_RESTORE_SHARED_VGPR_WAVE64_LOOP: - buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 - s_waitcnt vmcnt(0) + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE + S_WAITCNT_0 v_movreld_b32 v0, v0 //v[0+m0] = v0 s_add_u32 m0, m0, 1 //next vgpr index s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128 @@ -957,11 +1114,11 @@ L_RESTORE_SHARED_VGPR_WAVE64_LOOP: /* VGPR restore on v0 */ L_RESTORE_V0: - buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 - buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256 - buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2 - buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3 - s_waitcnt vmcnt(0) + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:256 + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:256*2 + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:256*3 + S_WAITCNT_0 /* restore SGPRs */ //will be 2+8+16*6 @@ -978,7 +1135,7 @@ L_RESTORE_SGPR: s_mov_b32 m0, s_sgpr_save_num read_4sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) - s_waitcnt lgkmcnt(0) + S_WAITCNT_0 s_sub_u32 m0, m0, 4 // Restore from S[0] to S[104] s_nop 0 // hazard SALU M0=> S_MOVREL @@ -987,7 +1144,7 @@ L_RESTORE_SGPR: s_movreld_b64 s2, s2 read_8sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) - s_waitcnt lgkmcnt(0) + S_WAITCNT_0 s_sub_u32 m0, m0, 8 // Restore from S[0] to S[96] s_nop 0 // hazard SALU M0=> S_MOVREL @@ -999,7 +1156,7 @@ L_RESTORE_SGPR: L_RESTORE_SGPR_LOOP: read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) - s_waitcnt lgkmcnt(0) + S_WAITCNT_0 s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0] s_nop 0 // hazard SALU M0=> S_MOVREL @@ -1041,12 +1198,12 @@ L_RESTORE_HWREG: read_hwreg_from_mem(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset) read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset) read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset) - s_waitcnt lgkmcnt(0) + S_WAITCNT_0 s_setreg_b32 hwreg(HW_REG_SHADER_FLAT_SCRATCH_LO), s_restore_flat_scratch read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset) - s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS + S_WAITCNT_0 s_setreg_b32 hwreg(HW_REG_SHADER_FLAT_SCRATCH_HI), s_restore_flat_scratch @@ -1054,16 +1211,21 @@ L_RESTORE_HWREG: s_mov_b32 exec_lo, s_restore_exec_lo s_mov_b32 exec_hi, s_restore_exec_hi - s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts - s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0 - #if HAVE_XNACK s_setreg_b32 hwreg(HW_REG_SHADER_XNACK_MASK), s_restore_xnack_mask #endif - s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts - s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT - s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0 + // {TRAPSTS/EXCP_FLAG_PRIV}.SAVE_CONTEXT and HOST_TRAP may have changed. + // Only restore the other fields to avoid clobbering them. + s_setreg_b32 hwreg(S_TRAPSTS_HWREG, 0, S_TRAPSTS_RESTORE_PART_1_SIZE), s_restore_trapsts + s_lshr_b32 s_restore_trapsts, s_restore_trapsts, S_TRAPSTS_RESTORE_PART_2_SHIFT + s_setreg_b32 hwreg(S_TRAPSTS_HWREG, S_TRAPSTS_RESTORE_PART_2_SHIFT, S_TRAPSTS_RESTORE_PART_2_SIZE), s_restore_trapsts + +if S_TRAPSTS_RESTORE_PART_3_SIZE > 0 + s_lshr_b32 s_restore_trapsts, s_restore_trapsts, S_TRAPSTS_RESTORE_PART_3_SHIFT - S_TRAPSTS_RESTORE_PART_2_SHIFT + s_setreg_b32 hwreg(S_TRAPSTS_HWREG, S_TRAPSTS_RESTORE_PART_3_SHIFT, S_TRAPSTS_RESTORE_PART_3_SIZE), s_restore_trapsts +end + s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode // Restore trap temporaries 4-11, 13 initialized by SPI debug dispatch logic @@ -1075,10 +1237,10 @@ L_RESTORE_HWREG: s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_buf_rsrc0 s_addc_u32 s_restore_ttmps_hi, s_restore_buf_rsrc1, 0x0 s_and_b32 s_restore_ttmps_hi, s_restore_ttmps_hi, 0xFFFF - s_load_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x50 glc:1 - s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x60 glc:1 - s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x74 glc:1 - s_waitcnt lgkmcnt(0) + s_load_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x50 S_COHERENCE + s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x60 S_COHERENCE + s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x74 S_COHERENCE + S_WAITCNT_0 #if HAVE_XNACK restore_ib_sts(s_restore_tmp, s_restore_m0) @@ -1100,7 +1262,8 @@ L_RESTORE_HWREG: L_RETURN_WITHOUT_PRIV: #endif - s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu + s_setreg_b32 hwreg(S_STATUS_HWREG), s_restore_status // SCC is included, which is changed by previous salu + s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution L_END_PGM: @@ -1115,7 +1278,7 @@ function write_hwreg_to_mem(s, s_rsrc, s_mem_offset) #else s_mov_b32 exec_lo, m0 s_mov_b32 m0, s_mem_offset - s_buffer_store_dword s, s_rsrc, m0 glc:1 + s_buffer_store_dword s, s_rsrc, m0 S_COHERENCE s_add_u32 s_mem_offset, s_mem_offset, 4 s_mov_b32 m0, exec_lo #endif @@ -1130,10 +1293,10 @@ function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset) s_add_u32 ttmp13, ttmp13, 0x1 end #else - s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1 - s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1 - s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1 - s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1 + s_buffer_store_dwordx4 s[0], s_rsrc, 0 S_COHERENCE + s_buffer_store_dwordx4 s[4], s_rsrc, 16 S_COHERENCE + s_buffer_store_dwordx4 s[8], s_rsrc, 32 S_COHERENCE + s_buffer_store_dwordx4 s[12], s_rsrc, 48 S_COHERENCE s_add_u32 s_rsrc[0], s_rsrc[0], 4*16 s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 #endif @@ -1147,40 +1310,72 @@ function write_12sgpr_to_mem(s, s_rsrc, s_mem_offset) s_add_u32 ttmp13, ttmp13, 0x1 end #else - s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1 - s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1 - s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1 + s_buffer_store_dwordx4 s[0], s_rsrc, 0 S_COHERENCE + s_buffer_store_dwordx4 s[4], s_rsrc, 16 S_COHERENCE + s_buffer_store_dwordx4 s[8], s_rsrc, 32 S_COHERENCE s_add_u32 s_rsrc[0], s_rsrc[0], 4*12 s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 #endif end function read_hwreg_from_mem(s, s_rsrc, s_mem_offset) - s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1 + s_buffer_load_dword s, s_rsrc, s_mem_offset S_COHERENCE s_add_u32 s_mem_offset, s_mem_offset, 4 end function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset) s_sub_u32 s_mem_offset, s_mem_offset, 4*16 - s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset glc:1 + s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset S_COHERENCE end function read_8sgpr_from_mem(s, s_rsrc, s_mem_offset) s_sub_u32 s_mem_offset, s_mem_offset, 4*8 - s_buffer_load_dwordx8 s, s_rsrc, s_mem_offset glc:1 + s_buffer_load_dwordx8 s, s_rsrc, s_mem_offset S_COHERENCE end function read_4sgpr_from_mem(s, s_rsrc, s_mem_offset) s_sub_u32 s_mem_offset, s_mem_offset, 4*4 - s_buffer_load_dwordx4 s, s_rsrc, s_mem_offset glc:1 + s_buffer_load_dwordx4 s, s_rsrc, s_mem_offset S_COHERENCE +end + +#if SAVE_AFTER_XNACK_ERROR +function check_if_tcp_store_ok + // If TRAPSTS.XNACK_ERROR=1 then TCP stores will fail. + s_getreg_b32 s_save_tmp, hwreg(HW_REG_TRAPSTS) + s_andn2_b32 s_save_tmp, SQ_WAVE_TRAPSTS_XNACK_ERROR_MASK, s_save_tmp + +L_TCP_STORE_CHECK_DONE: end +function write_vgpr_to_mem_with_sqc(vgpr, n_lanes, s_rsrc, s_mem_offset) + s_mov_b32 s4, 0 + +L_WRITE_VGPR_LANE_LOOP: + for var lane = 0; lane < 4; ++lane + v_readlane_b32 s[lane], vgpr, s4 + s_add_u32 s4, s4, 1 + end + + s_buffer_store_dwordx4 s[0:3], s_rsrc, s_mem_offset glc:1 -function get_lds_size_bytes(s_lds_size_byte) - s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) - s_lshl_b32 s_lds_size_byte, s_lds_size_byte, 8 //LDS size in dwords = lds_size * 64 *4Bytes // granularity 64DW + s_add_u32 s_mem_offset, s_mem_offset, 0x10 + s_cmp_eq_u32 s4, n_lanes + s_cbranch_scc0 L_WRITE_VGPR_LANE_LOOP +end + +function write_vgprs_to_mem_with_sqc_w32(vgpr0, n_vgprs, s_rsrc, s_mem_offset) + for var vgpr = 0; vgpr < n_vgprs; ++vgpr + write_vgpr_to_mem_with_sqc(vgpr0[vgpr], 32, s_rsrc, s_mem_offset) + end end +function write_vgprs_to_mem_with_sqc_w64(vgpr0, n_vgprs, s_rsrc, s_mem_offset) + for var vgpr = 0; vgpr < n_vgprs; ++vgpr + write_vgpr_to_mem_with_sqc(vgpr0[vgpr], 64, s_rsrc, s_mem_offset) + end +end +#endif + function get_vgpr_size_bytes(s_vgpr_size_byte, s_size) s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1 @@ -1206,11 +1401,12 @@ function get_hwreg_size_bytes return 128 end -function get_wave_size(s_reg) +function get_wave_size2(s_reg) s_getreg_b32 s_reg, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE) s_lshl_b32 s_reg, s_reg, S_WAVE_SIZE end +#if HAVE_XNACK function save_and_clear_ib_sts(tmp1, tmp2) // Preserve and clear scalar XNACK state before issuing scalar loads. // Save IB_STS.REPLAY_W64H[25], RCNT[21:16], FIRST_REPLAY[15] into @@ -1235,3 +1431,4 @@ function restore_ib_sts(tmp1, tmp2) s_or_b32 tmp1, tmp1, tmp2 s_setreg_b32 hwreg(HW_REG_IB_STS), tmp1 end +#endif diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm new file mode 100644 index 000000000000..5a1a1b1f897f --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm @@ -0,0 +1,1136 @@ +/* + * Copyright 2018 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* To compile this assembly code: + * + * gfx12: + * cpp -DASIC_FAMILY=CHIP_GFX12 cwsr_trap_handler_gfx12.asm -P -o gfx12.sp3 + * sp3 gfx12.sp3 -hex gfx12.hex + */ + +#define CHIP_GFX12 37 + +#define SINGLE_STEP_MISSED_WORKAROUND 1 //workaround for lost TRAP_AFTER_INST exception when SAVECTX raised +#define HAVE_VALU_SGPR_HAZARD (ASIC_FAMILY == CHIP_GFX12) + +var SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK = 0x4 +var SQ_WAVE_STATE_PRIV_SCC_SHIFT = 9 +var SQ_WAVE_STATE_PRIV_SYS_PRIO_MASK = 0xC00 +var SQ_WAVE_STATE_PRIV_HALT_MASK = 0x4000 +var SQ_WAVE_STATE_PRIV_POISON_ERR_MASK = 0x8000 +var SQ_WAVE_STATE_PRIV_POISON_ERR_SHIFT = 15 +var SQ_WAVE_STATUS_WAVE64_SHIFT = 29 +var SQ_WAVE_STATUS_WAVE64_SIZE = 1 +var SQ_WAVE_STATUS_NO_VGPRS_SHIFT = 24 +var SQ_WAVE_STATE_PRIV_ALWAYS_CLEAR_MASK = SQ_WAVE_STATE_PRIV_SYS_PRIO_MASK|SQ_WAVE_STATE_PRIV_POISON_ERR_MASK +var S_SAVE_PC_HI_TRAP_ID_MASK = 0xF0000000 + +var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 +var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9 +var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 8 +var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 12 +var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT = 24 +var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE = 4 +var SQ_WAVE_LDS_ALLOC_GRANULARITY = 9 + +var SQ_WAVE_EXCP_FLAG_PRIV_ADDR_WATCH_MASK = 0xF +var SQ_WAVE_EXCP_FLAG_PRIV_MEM_VIOL_MASK = 0x10 +var SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_SHIFT = 5 +var SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_MASK = 0x20 +var SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_MASK = 0x40 +var SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT = 6 +var SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK = 0x80 +var SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_SHIFT = 7 +var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_MASK = 0x100 +var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT = 8 +var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_END_MASK = 0x200 +var SQ_WAVE_EXCP_FLAG_PRIV_TRAP_AFTER_INST_MASK = 0x800 +var SQ_WAVE_TRAP_CTRL_ADDR_WATCH_MASK = 0x80 +var SQ_WAVE_TRAP_CTRL_TRAP_AFTER_INST_MASK = 0x200 + +var SQ_WAVE_EXCP_FLAG_PRIV_NON_MASKABLE_EXCP_MASK= SQ_WAVE_EXCP_FLAG_PRIV_MEM_VIOL_MASK |\ + SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_MASK |\ + SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK |\ + SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_MASK |\ + SQ_WAVE_EXCP_FLAG_PRIV_WAVE_END_MASK |\ + SQ_WAVE_EXCP_FLAG_PRIV_TRAP_AFTER_INST_MASK +var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_1_SIZE = SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_SHIFT +var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SHIFT = SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT +var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SIZE = SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_SHIFT - SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT +var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SHIFT = SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT +var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SIZE = 32 - SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SHIFT +var BARRIER_STATE_SIGNAL_OFFSET = 16 +var BARRIER_STATE_VALID_OFFSET = 0 + +var TTMP11_DEBUG_TRAP_ENABLED_SHIFT = 23 +var TTMP11_DEBUG_TRAP_ENABLED_MASK = 0x800000 + +// SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] +// when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE +var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 +var S_SAVE_BUF_RSRC_WORD3_MISC = 0x10807FAC +var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 +var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26 + +var S_SAVE_PC_HI_FIRST_WAVE_MASK = 0x80000000 +var S_SAVE_PC_HI_FIRST_WAVE_SHIFT = 31 + +var s_sgpr_save_num = 108 + +var s_save_spi_init_lo = exec_lo +var s_save_spi_init_hi = exec_hi +var s_save_pc_lo = ttmp0 +var s_save_pc_hi = ttmp1 +var s_save_exec_lo = ttmp2 +var s_save_exec_hi = ttmp3 +var s_save_state_priv = ttmp12 +var s_save_excp_flag_priv = ttmp15 +var s_save_xnack_mask = s_save_excp_flag_priv +var s_wave_size = ttmp7 +var s_save_buf_rsrc0 = ttmp8 +var s_save_buf_rsrc1 = ttmp9 +var s_save_buf_rsrc2 = ttmp10 +var s_save_buf_rsrc3 = ttmp11 +var s_save_mem_offset = ttmp4 +var s_save_alloc_size = s_save_excp_flag_priv +var s_save_tmp = ttmp14 +var s_save_m0 = ttmp5 +var s_save_ttmps_lo = s_save_tmp +var s_save_ttmps_hi = s_save_excp_flag_priv + +var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE +var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC + +var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 +var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26 +var S_WAVE_SIZE = 25 + +var s_restore_spi_init_lo = exec_lo +var s_restore_spi_init_hi = exec_hi +var s_restore_mem_offset = ttmp12 +var s_restore_alloc_size = ttmp3 +var s_restore_tmp = ttmp2 +var s_restore_mem_offset_save = s_restore_tmp +var s_restore_m0 = s_restore_alloc_size +var s_restore_mode = ttmp7 +var s_restore_flat_scratch = s_restore_tmp +var s_restore_pc_lo = ttmp0 +var s_restore_pc_hi = ttmp1 +var s_restore_exec_lo = ttmp4 +var s_restore_exec_hi = ttmp5 +var s_restore_state_priv = ttmp14 +var s_restore_excp_flag_priv = ttmp15 +var s_restore_xnack_mask = ttmp13 +var s_restore_buf_rsrc0 = ttmp8 +var s_restore_buf_rsrc1 = ttmp9 +var s_restore_buf_rsrc2 = ttmp10 +var s_restore_buf_rsrc3 = ttmp11 +var s_restore_size = ttmp6 +var s_restore_ttmps_lo = s_restore_tmp +var s_restore_ttmps_hi = s_restore_alloc_size +var s_restore_spi_init_hi_save = s_restore_exec_hi + +shader main + asic(DEFAULT) + type(CS) + wave_size(32) + + s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save + +L_JUMP_TO_RESTORE: + s_branch L_RESTORE + +L_SKIP_RESTORE: + s_getreg_b32 s_save_state_priv, hwreg(HW_REG_WAVE_STATE_PRIV) //save STATUS since we will change SCC + + // Clear SPI_PRIO: do not save with elevated priority. + // Clear ECC_ERR: prevents SQC store and triggers FATAL_HALT if setreg'd. + s_andn2_b32 s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_ALWAYS_CLEAR_MASK + + s_getreg_b32 s_save_excp_flag_priv, hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV) + + s_and_b32 ttmp2, s_save_state_priv, SQ_WAVE_STATE_PRIV_HALT_MASK + s_cbranch_scc0 L_NOT_HALTED + +L_HALTED: + // Host trap may occur while wave is halted. + s_and_b32 ttmp2, s_save_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK + s_cbranch_scc1 L_FETCH_2ND_TRAP + +L_CHECK_SAVE: + s_and_b32 ttmp2, s_save_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_MASK + s_cbranch_scc1 L_SAVE + + // Wave is halted but neither host trap nor SAVECTX is raised. + // Caused by instruction fetch memory violation. + // Spin wait until context saved to prevent interrupt storm. + s_sleep 0x10 + s_getreg_b32 s_save_excp_flag_priv, hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV) + s_branch L_CHECK_SAVE + +L_NOT_HALTED: + // Let second-level handle non-SAVECTX exception or trap. + // Any concurrent SAVECTX will be handled upon re-entry once halted. + + // Check non-maskable exceptions. memory_violation, illegal_instruction + // and xnack_error exceptions always cause the wave to enter the trap + // handler. + s_and_b32 ttmp2, s_save_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_NON_MASKABLE_EXCP_MASK + s_cbranch_scc1 L_FETCH_2ND_TRAP + + // Check for maskable exceptions in trapsts.excp and trapsts.excp_hi. + // Maskable exceptions only cause the wave to enter the trap handler if + // their respective bit in mode.excp_en is set. + s_getreg_b32 ttmp2, hwreg(HW_REG_WAVE_EXCP_FLAG_USER) + s_and_b32 ttmp3, s_save_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_ADDR_WATCH_MASK + s_cbranch_scc0 L_NOT_ADDR_WATCH + s_or_b32 ttmp2, ttmp2, SQ_WAVE_TRAP_CTRL_ADDR_WATCH_MASK + +L_NOT_ADDR_WATCH: + s_getreg_b32 ttmp3, hwreg(HW_REG_WAVE_TRAP_CTRL) + s_and_b32 ttmp2, ttmp3, ttmp2 + s_cbranch_scc1 L_FETCH_2ND_TRAP + +L_CHECK_TRAP_ID: + // Check trap_id != 0 + s_and_b32 ttmp2, s_save_pc_hi, S_SAVE_PC_HI_TRAP_ID_MASK + s_cbranch_scc1 L_FETCH_2ND_TRAP + +#if SINGLE_STEP_MISSED_WORKAROUND + // Prioritize single step exception over context save. + // Second-level trap will halt wave and RFE, re-entering for SAVECTX. + // WAVE_TRAP_CTRL is already in ttmp3. + s_and_b32 ttmp3, ttmp3, SQ_WAVE_TRAP_CTRL_TRAP_AFTER_INST_MASK + s_cbranch_scc1 L_FETCH_2ND_TRAP +#endif + + s_and_b32 ttmp2, s_save_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_MASK + s_cbranch_scc1 L_SAVE + +L_FETCH_2ND_TRAP: + // Read second-level TBA/TMA from first-level TMA and jump if available. + // ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data) + // ttmp12 holds SQ_WAVE_STATUS + s_sendmsg_rtn_b64 [ttmp14, ttmp15], sendmsg(MSG_RTN_GET_TMA) + s_wait_idle + s_lshl_b64 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 + + s_bitcmp1_b32 ttmp15, 0xF + s_cbranch_scc0 L_NO_SIGN_EXTEND_TMA + s_or_b32 ttmp15, ttmp15, 0xFFFF0000 +L_NO_SIGN_EXTEND_TMA: + + s_load_dword ttmp2, [ttmp14, ttmp15], 0x10 scope:SCOPE_SYS // debug trap enabled flag + s_wait_idle + s_lshl_b32 ttmp2, ttmp2, TTMP11_DEBUG_TRAP_ENABLED_SHIFT + s_andn2_b32 ttmp11, ttmp11, TTMP11_DEBUG_TRAP_ENABLED_MASK + s_or_b32 ttmp11, ttmp11, ttmp2 + + s_load_dwordx2 [ttmp2, ttmp3], [ttmp14, ttmp15], 0x0 scope:SCOPE_SYS // second-level TBA + s_wait_idle + s_load_dwordx2 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 scope:SCOPE_SYS // second-level TMA + s_wait_idle + + s_and_b64 [ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3] + s_cbranch_scc0 L_NO_NEXT_TRAP // second-level trap handler not been set + s_setpc_b64 [ttmp2, ttmp3] // jump to second-level trap handler + +L_NO_NEXT_TRAP: + // If not caused by trap then halt wave to prevent re-entry. + s_and_b32 ttmp2, s_save_pc_hi, S_SAVE_PC_HI_TRAP_ID_MASK + s_cbranch_scc1 L_TRAP_CASE + + // Host trap will not cause trap re-entry. + s_getreg_b32 ttmp2, hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV) + s_and_b32 ttmp2, ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK + s_cbranch_scc1 L_EXIT_TRAP + s_or_b32 s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_HALT_MASK + + // If the PC points to S_ENDPGM then context save will fail if STATE_PRIV.HALT is set. + // Rewind the PC to prevent this from occurring. + s_sub_u32 ttmp0, ttmp0, 0x8 + s_subb_u32 ttmp1, ttmp1, 0x0 + + s_branch L_EXIT_TRAP + +L_TRAP_CASE: + // Advance past trap instruction to prevent re-entry. + s_add_u32 ttmp0, ttmp0, 0x4 + s_addc_u32 ttmp1, ttmp1, 0x0 + +L_EXIT_TRAP: + s_and_b32 ttmp1, ttmp1, 0xFFFF + + // Restore SQ_WAVE_STATUS. + s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 + s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 + + // STATE_PRIV.BARRIER_COMPLETE may have changed since we read it. + // Only restore fields which the trap handler changes. + s_lshr_b32 s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_SCC_SHIFT + s_setreg_b32 hwreg(HW_REG_WAVE_STATE_PRIV, SQ_WAVE_STATE_PRIV_SCC_SHIFT, \ + SQ_WAVE_STATE_PRIV_POISON_ERR_SHIFT - SQ_WAVE_STATE_PRIV_SCC_SHIFT + 1), s_save_state_priv + + s_rfe_b64 [ttmp0, ttmp1] + +L_SAVE: + // If VGPRs have been deallocated then terminate the wavefront. + // It has no remaining program to run and cannot save without VGPRs. + s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_STATUS) + s_bitcmp1_b32 s_save_tmp, SQ_WAVE_STATUS_NO_VGPRS_SHIFT + s_cbranch_scc0 L_HAVE_VGPRS + s_endpgm +L_HAVE_VGPRS: + + s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] + s_mov_b32 s_save_tmp, 0 + s_setreg_b32 hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV, SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_SHIFT, 1), s_save_tmp //clear saveCtx bit + + /* inform SPI the readiness and wait for SPI's go signal */ + s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI + s_mov_b32 s_save_exec_hi, exec_hi + s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive + + s_sendmsg_rtn_b64 [exec_lo, exec_hi], sendmsg(MSG_RTN_SAVE_WAVE) + s_wait_idle + + // Save first_wave flag so we can clear high bits of save address. + s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK + s_lshl_b32 s_save_tmp, s_save_tmp, (S_SAVE_PC_HI_FIRST_WAVE_SHIFT - S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT) + s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp + + // Trap temporaries must be saved via VGPR but all VGPRs are in use. + // There is no ttmp space to hold the resource constant for VGPR save. + // Save v0 by itself since it requires only two SGPRs. + s_mov_b32 s_save_ttmps_lo, exec_lo + s_and_b32 s_save_ttmps_hi, exec_hi, 0xFFFF + s_mov_b32 exec_lo, 0xFFFFFFFF + s_mov_b32 exec_hi, 0xFFFFFFFF + global_store_dword_addtid v0, [s_save_ttmps_lo, s_save_ttmps_hi] scope:SCOPE_SYS + v_mov_b32 v0, 0x0 + s_mov_b32 exec_lo, s_save_ttmps_lo + s_mov_b32 exec_hi, s_save_ttmps_hi + + // Save trap temporaries 4-11, 13 initialized by SPI debug dispatch logic + // ttmp SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)+0x40 + get_wave_size2(s_save_ttmps_hi) + get_vgpr_size_bytes(s_save_ttmps_lo, s_save_ttmps_hi) + get_svgpr_size_bytes(s_save_ttmps_hi) + s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_ttmps_hi + s_and_b32 s_save_ttmps_hi, s_save_spi_init_hi, 0xFFFF + s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, get_sgpr_size_bytes() + s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_spi_init_lo + s_addc_u32 s_save_ttmps_hi, s_save_ttmps_hi, 0x0 + + v_writelane_b32 v0, ttmp4, 0x4 + v_writelane_b32 v0, ttmp5, 0x5 + v_writelane_b32 v0, ttmp6, 0x6 + v_writelane_b32 v0, ttmp7, 0x7 + v_writelane_b32 v0, ttmp8, 0x8 + v_writelane_b32 v0, ttmp9, 0x9 + v_writelane_b32 v0, ttmp10, 0xA + v_writelane_b32 v0, ttmp11, 0xB + v_writelane_b32 v0, ttmp13, 0xD + v_writelane_b32 v0, exec_lo, 0xE + v_writelane_b32 v0, exec_hi, 0xF + valu_sgpr_hazard() + + s_mov_b32 exec_lo, 0x3FFF + s_mov_b32 exec_hi, 0x0 + global_store_dword_addtid v0, [s_save_ttmps_lo, s_save_ttmps_hi] offset:0x40 scope:SCOPE_SYS + v_readlane_b32 ttmp14, v0, 0xE + v_readlane_b32 ttmp15, v0, 0xF + s_mov_b32 exec_lo, ttmp14 + s_mov_b32 exec_hi, ttmp15 + + /* setup Resource Contants */ + s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo + s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE + s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not neccessarily inited + s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC + + s_mov_b32 s_save_m0, m0 + + /* global mem offset */ + s_mov_b32 s_save_mem_offset, 0x0 + get_wave_size2(s_wave_size) + + /* save first 4 VGPRs, needed for SGPR save */ + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on + s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE + s_and_b32 m0, m0, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_ENABLE_SAVE_4VGPR_EXEC_HI + s_mov_b32 exec_hi, 0x00000000 + s_branch L_SAVE_4VGPR_WAVE32 +L_ENABLE_SAVE_4VGPR_EXEC_HI: + s_mov_b32 exec_hi, 0xFFFFFFFF + s_branch L_SAVE_4VGPR_WAVE64 +L_SAVE_4VGPR_WAVE32: + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + + // VGPR Allocated in 4-GPR granularity + + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:128 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:128*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:128*3 + s_branch L_SAVE_HWREG + +L_SAVE_4VGPR_WAVE64: + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + + // VGPR Allocated in 4-GPR granularity + + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:256 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:256*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:256*3 + + /* save HW registers */ + +L_SAVE_HWREG: + // HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR) + get_vgpr_size_bytes(s_save_mem_offset, s_wave_size) + get_svgpr_size_bytes(s_save_tmp) + s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp + s_add_u32 s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes() + + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + + v_mov_b32 v0, 0x0 //Offset[31:0] from buffer resource + v_mov_b32 v1, 0x0 //Offset[63:32] from buffer resource + v_mov_b32 v2, 0x0 //Set of SGPRs for TCP store + + // Ensure no further changes to barrier or LDS state. + // STATE_PRIV.BARRIER_COMPLETE may change up to this point. + s_barrier_signal -2 + s_barrier_wait -2 + + // Re-read final state of BARRIER_COMPLETE field for save. + s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_STATE_PRIV) + s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK + s_andn2_b32 s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK + s_or_b32 s_save_state_priv, s_save_state_priv, s_save_tmp + + s_andn2_b32 s_save_tmp, s_save_pc_hi, S_SAVE_PC_HI_FIRST_WAVE_MASK + v_writelane_b32 v2, s_save_m0, 0x0 + v_writelane_b32 v2, s_save_pc_lo, 0x1 + v_writelane_b32 v2, s_save_tmp, 0x2 + v_writelane_b32 v2, s_save_exec_lo, 0x3 + v_writelane_b32 v2, s_save_exec_hi, 0x4 + v_writelane_b32 v2, s_save_state_priv, 0x5 + v_writelane_b32 v2, s_save_xnack_mask, 0x7 + valu_sgpr_hazard() + + s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV) + v_writelane_b32 v2, s_save_tmp, 0x6 + + s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_MODE) + v_writelane_b32 v2, s_save_tmp, 0x8 + + s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_SCRATCH_BASE_LO) + v_writelane_b32 v2, s_save_tmp, 0x9 + + s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_SCRATCH_BASE_HI) + v_writelane_b32 v2, s_save_tmp, 0xA + + s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_EXCP_FLAG_USER) + v_writelane_b32 v2, s_save_tmp, 0xB + + s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_TRAP_CTRL) + v_writelane_b32 v2, s_save_tmp, 0xC + + s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_STATUS) + v_writelane_b32 v2, s_save_tmp, 0xD + + s_get_barrier_state s_save_tmp, -1 + s_wait_kmcnt (0) + v_writelane_b32 v2, s_save_tmp, 0xE + valu_sgpr_hazard() + + // Write HWREGs with 16 VGPR lanes. TTMPs occupy space after this. + s_mov_b32 exec_lo, 0xFFFF + s_mov_b32 exec_hi, 0x0 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS + + // Write SGPRs with 32 VGPR lanes. This works in wave32 and wave64 mode. + s_mov_b32 exec_lo, 0xFFFFFFFF + + /* save SGPRs */ + // Save SGPR before LDS save, then the s0 to s4 can be used during LDS save... + + // SGPR SR memory offset : size(VGPR)+size(SVGPR) + get_vgpr_size_bytes(s_save_mem_offset, s_wave_size) + get_svgpr_size_bytes(s_save_tmp) + s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + + s_mov_b32 ttmp13, 0x0 //next VGPR lane to copy SGPR into + + s_mov_b32 m0, 0x0 //SGPR initial index value =0 + s_nop 0x0 //Manually inserted wait states +L_SAVE_SGPR_LOOP: + // SGPR is allocated in 16 SGPR granularity + s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0] + s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0] + s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0] + s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0] + s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0] + s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0] + s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0] + s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0] + + s_cmp_eq_u32 ttmp13, 0x0 + s_cbranch_scc0 L_WRITE_V2_SECOND_HALF + write_16sgpr_to_v2(s0, 0x0) + s_branch L_SAVE_SGPR_SKIP_TCP_STORE +L_WRITE_V2_SECOND_HALF: + write_16sgpr_to_v2(s0, 0x10) + + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS + s_add_u32 s_save_mem_offset, s_save_mem_offset, 0x80 + s_mov_b32 ttmp13, 0x0 + v_mov_b32 v2, 0x0 +L_SAVE_SGPR_SKIP_TCP_STORE: + + s_add_u32 m0, m0, 16 //next sgpr index + s_cmp_lt_u32 m0, 96 //scc = (m0 < first 96 SGPR) ? 1 : 0 + s_cbranch_scc1 L_SAVE_SGPR_LOOP //first 96 SGPR save is complete? + + //save the rest 12 SGPR + s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0] + s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0] + s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0] + s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0] + s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0] + s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0] + write_12sgpr_to_v2(s0) + + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS + + /* save LDS */ + +L_SAVE_LDS: + // Change EXEC to all threads... + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on + s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE + s_and_b32 m0, m0, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_ENABLE_SAVE_LDS_EXEC_HI + s_mov_b32 exec_hi, 0x00000000 + s_branch L_SAVE_LDS_NORMAL +L_ENABLE_SAVE_LDS_EXEC_HI: + s_mov_b32 exec_hi, 0xFFFFFFFF +L_SAVE_LDS_NORMAL: + s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_WAVE_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) + s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero? + s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_DONE + + s_and_b32 s_save_tmp, s_save_pc_hi, S_SAVE_PC_HI_FIRST_WAVE_MASK + s_cbranch_scc0 L_SAVE_LDS_DONE + + // first wave do LDS save; + + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, SQ_WAVE_LDS_ALLOC_GRANULARITY + s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes + + // LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG) + // + get_vgpr_size_bytes(s_save_mem_offset, s_wave_size) + get_svgpr_size_bytes(s_save_tmp) + s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp + s_add_u32 s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes() + s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes() + + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + + //load 0~63*4(byte address) to vgpr v0 + v_mbcnt_lo_u32_b32 v0, -1, 0 + v_mbcnt_hi_u32_b32 v0, -1, v0 + v_mul_u32_u24 v0, 4, v0 + + s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE + s_and_b32 m0, m0, 1 + s_cmp_eq_u32 m0, 1 + s_mov_b32 m0, 0x0 + s_cbranch_scc1 L_SAVE_LDS_W64 + +L_SAVE_LDS_W32: + s_mov_b32 s3, 128 + s_nop 0 + s_nop 0 + s_nop 0 +L_SAVE_LDS_LOOP_W32: + ds_read_b32 v1, v0 + s_wait_idle + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS + + s_add_u32 m0, m0, s3 //every buffer_store_lds does 128 bytes + s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 + v_add_nc_u32 v0, v0, 128 //mem offset increased by 128 bytes + s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_LDS_LOOP_W32 //LDS save is complete? + + s_branch L_SAVE_LDS_DONE + +L_SAVE_LDS_W64: + s_mov_b32 s3, 256 + s_nop 0 + s_nop 0 + s_nop 0 +L_SAVE_LDS_LOOP_W64: + ds_read_b32 v1, v0 + s_wait_idle + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS + + s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes + s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 + v_add_nc_u32 v0, v0, 256 //mem offset increased by 256 bytes + s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_LDS_LOOP_W64 //LDS save is complete? + +L_SAVE_LDS_DONE: + /* save VGPRs - set the Rest VGPRs */ +L_SAVE_VGPR: + // VGPR SR memory offset: 0 + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on + s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE + s_and_b32 m0, m0, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_ENABLE_SAVE_VGPR_EXEC_HI + s_mov_b32 s_save_mem_offset, (0+128*4) // for the rest VGPRs + s_mov_b32 exec_hi, 0x00000000 + s_branch L_SAVE_VGPR_NORMAL +L_ENABLE_SAVE_VGPR_EXEC_HI: + s_mov_b32 s_save_mem_offset, (0+256*4) // for the rest VGPRs + s_mov_b32 exec_hi, 0xFFFFFFFF +L_SAVE_VGPR_NORMAL: + s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_WAVE_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) + s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) + //determine it is wave32 or wave64 + s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE + s_and_b32 m0, m0, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_SAVE_VGPR_WAVE64 + + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + + // VGPR Allocated in 4-GPR granularity + + // VGPR store using dw burst + s_mov_b32 m0, 0x4 //VGPR initial index value =4 + s_cmp_lt_u32 m0, s_save_alloc_size + s_cbranch_scc0 L_SAVE_VGPR_END + +L_SAVE_VGPR_W32_LOOP: + v_movrels_b32 v0, v0 //v0 = v[0+m0] + v_movrels_b32 v1, v1 //v1 = v[1+m0] + v_movrels_b32 v2, v2 //v2 = v[2+m0] + v_movrels_b32 v3, v3 //v3 = v[3+m0] + + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:128 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:128*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:128*3 + + s_add_u32 m0, m0, 4 //next vgpr index + s_add_u32 s_save_mem_offset, s_save_mem_offset, 128*4 //every buffer_store_dword does 128 bytes + s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_VGPR_W32_LOOP //VGPR save is complete? + + s_branch L_SAVE_VGPR_END + +L_SAVE_VGPR_WAVE64: + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + + // VGPR store using dw burst + s_mov_b32 m0, 0x4 //VGPR initial index value =4 + s_cmp_lt_u32 m0, s_save_alloc_size + s_cbranch_scc0 L_SAVE_SHARED_VGPR + +L_SAVE_VGPR_W64_LOOP: + v_movrels_b32 v0, v0 //v0 = v[0+m0] + v_movrels_b32 v1, v1 //v1 = v[1+m0] + v_movrels_b32 v2, v2 //v2 = v[2+m0] + v_movrels_b32 v3, v3 //v3 = v[3+m0] + + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:256 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:256*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:256*3 + + s_add_u32 m0, m0, 4 //next vgpr index + s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes + s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_VGPR_W64_LOOP //VGPR save is complete? + +L_SAVE_SHARED_VGPR: + s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_WAVE_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE) + s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //shared_vgpr_size is zero? + s_cbranch_scc0 L_SAVE_VGPR_END //no shared_vgpr used? jump to L_SAVE_LDS + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 3 //Number of SHARED_VGPRs = shared_vgpr_size * 8 (non-zero value) + //m0 now has the value of normal vgpr count, just add the m0 with shared_vgpr count to get the total count. + //save shared_vgpr will start from the index of m0 + s_add_u32 s_save_alloc_size, s_save_alloc_size, m0 + s_mov_b32 exec_lo, 0xFFFFFFFF + s_mov_b32 exec_hi, 0x00000000 + +L_SAVE_SHARED_VGPR_WAVE64_LOOP: + v_movrels_b32 v0, v0 //v0 = v[0+m0] + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS + s_add_u32 m0, m0, 1 //next vgpr index + s_add_u32 s_save_mem_offset, s_save_mem_offset, 128 + s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_SHARED_VGPR_WAVE64_LOOP //SHARED_VGPR save is complete? + +L_SAVE_VGPR_END: + s_branch L_END_PGM + +L_RESTORE: + /* Setup Resource Contants */ + s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo + s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi + s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE + s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) + s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC + + // Save s_restore_spi_init_hi for later use. + s_mov_b32 s_restore_spi_init_hi_save, s_restore_spi_init_hi + + //determine it is wave32 or wave64 + get_wave_size2(s_restore_size) + + s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK + s_cbranch_scc0 L_RESTORE_VGPR + + /* restore LDS */ +L_RESTORE_LDS: + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on + s_lshr_b32 m0, s_restore_size, S_WAVE_SIZE + s_and_b32 m0, m0, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_ENABLE_RESTORE_LDS_EXEC_HI + s_mov_b32 exec_hi, 0x00000000 + s_branch L_RESTORE_LDS_NORMAL +L_ENABLE_RESTORE_LDS_EXEC_HI: + s_mov_b32 exec_hi, 0xFFFFFFFF +L_RESTORE_LDS_NORMAL: + s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_WAVE_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) + s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero? + s_cbranch_scc0 L_RESTORE_VGPR //no lds used? jump to L_RESTORE_VGPR + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, SQ_WAVE_LDS_ALLOC_GRANULARITY + s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes + + // LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG) + // + get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size) + get_svgpr_size_bytes(s_restore_tmp) + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes() + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes() + + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + + s_lshr_b32 m0, s_restore_size, S_WAVE_SIZE + s_and_b32 m0, m0, 1 + s_cmp_eq_u32 m0, 1 + s_mov_b32 m0, 0x0 + s_cbranch_scc1 L_RESTORE_LDS_LOOP_W64 + +L_RESTORE_LDS_LOOP_W32: + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset + s_wait_idle + ds_store_addtid_b32 v0 + s_add_u32 m0, m0, 128 // 128 DW + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128 //mem offset increased by 128DW + s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_RESTORE_LDS_LOOP_W32 //LDS restore is complete? + s_branch L_RESTORE_VGPR + +L_RESTORE_LDS_LOOP_W64: + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset + s_wait_idle + ds_store_addtid_b32 v0 + s_add_u32 m0, m0, 256 // 256 DW + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256 //mem offset increased by 256DW + s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_RESTORE_LDS_LOOP_W64 //LDS restore is complete? + + /* restore VGPRs */ +L_RESTORE_VGPR: + // VGPR SR memory offset : 0 + s_mov_b32 s_restore_mem_offset, 0x0 + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on + s_lshr_b32 m0, s_restore_size, S_WAVE_SIZE + s_and_b32 m0, m0, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_ENABLE_RESTORE_VGPR_EXEC_HI + s_mov_b32 exec_hi, 0x00000000 + s_branch L_RESTORE_VGPR_NORMAL +L_ENABLE_RESTORE_VGPR_EXEC_HI: + s_mov_b32 exec_hi, 0xFFFFFFFF +L_RESTORE_VGPR_NORMAL: + s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_WAVE_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) + s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) + //determine it is wave32 or wave64 + s_lshr_b32 m0, s_restore_size, S_WAVE_SIZE + s_and_b32 m0, m0, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_RESTORE_VGPR_WAVE64 + + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + + // VGPR load using dw burst + s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128*4 + s_mov_b32 m0, 4 //VGPR initial index value = 4 + s_cmp_lt_u32 m0, s_restore_alloc_size + s_cbranch_scc0 L_RESTORE_SGPR + +L_RESTORE_VGPR_WAVE32_LOOP: + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS offset:128 + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS offset:128*2 + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS offset:128*3 + s_wait_idle + v_movreld_b32 v0, v0 //v[0+m0] = v0 + v_movreld_b32 v1, v1 + v_movreld_b32 v2, v2 + v_movreld_b32 v3, v3 + s_add_u32 m0, m0, 4 //next vgpr index + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128*4 //every buffer_load_dword does 128 bytes + s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_RESTORE_VGPR_WAVE32_LOOP //VGPR restore (except v0) is complete? + + /* VGPR restore on v0 */ + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS offset:128 + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS offset:128*2 + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS offset:128*3 + s_wait_idle + + s_branch L_RESTORE_SGPR + +L_RESTORE_VGPR_WAVE64: + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + + // VGPR load using dw burst + s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v4, v0 will be the last + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 + s_mov_b32 m0, 4 //VGPR initial index value = 4 + s_cmp_lt_u32 m0, s_restore_alloc_size + s_cbranch_scc0 L_RESTORE_SHARED_VGPR + +L_RESTORE_VGPR_WAVE64_LOOP: + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS offset:256 + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS offset:256*2 + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS offset:256*3 + s_wait_idle + v_movreld_b32 v0, v0 //v[0+m0] = v0 + v_movreld_b32 v1, v1 + v_movreld_b32 v2, v2 + v_movreld_b32 v3, v3 + s_add_u32 m0, m0, 4 //next vgpr index + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 //every buffer_load_dword does 256 bytes + s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_RESTORE_VGPR_WAVE64_LOOP //VGPR restore (except v0) is complete? + +L_RESTORE_SHARED_VGPR: + s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_WAVE_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE) //shared_vgpr_size + s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //shared_vgpr_size is zero? + s_cbranch_scc0 L_RESTORE_V0 //no shared_vgpr used? + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 3 //Number of SHARED_VGPRs = shared_vgpr_size * 8 (non-zero value) + //m0 now has the value of normal vgpr count, just add the m0 with shared_vgpr count to get the total count. + //restore shared_vgpr will start from the index of m0 + s_add_u32 s_restore_alloc_size, s_restore_alloc_size, m0 + s_mov_b32 exec_lo, 0xFFFFFFFF + s_mov_b32 exec_hi, 0x00000000 +L_RESTORE_SHARED_VGPR_WAVE64_LOOP: + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS + s_wait_idle + v_movreld_b32 v0, v0 //v[0+m0] = v0 + s_add_u32 m0, m0, 1 //next vgpr index + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128 + s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_RESTORE_SHARED_VGPR_WAVE64_LOOP //VGPR restore (except v0) is complete? + + s_mov_b32 exec_hi, 0xFFFFFFFF //restore back exec_hi before restoring V0!! + + /* VGPR restore on v0 */ +L_RESTORE_V0: + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS offset:256 + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS offset:256*2 + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS offset:256*3 + s_wait_idle + + /* restore SGPRs */ + //will be 2+8+16*6 + // SGPR SR memory offset : size(VGPR)+size(SVGPR) +L_RESTORE_SGPR: + get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size) + get_svgpr_size_bytes(s_restore_tmp) + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes() + s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 20*4 //s108~s127 is not saved + + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + + s_mov_b32 m0, s_sgpr_save_num + + read_4sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) + s_wait_idle + + s_sub_u32 m0, m0, 4 // Restore from S[0] to S[104] + s_nop 0 // hazard SALU M0=> S_MOVREL + + s_movreld_b64 s0, s0 //s[0+m0] = s0 + s_movreld_b64 s2, s2 + + read_8sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) + s_wait_idle + + s_sub_u32 m0, m0, 8 // Restore from S[0] to S[96] + s_nop 0 // hazard SALU M0=> S_MOVREL + + s_movreld_b64 s0, s0 //s[0+m0] = s0 + s_movreld_b64 s2, s2 + s_movreld_b64 s4, s4 + s_movreld_b64 s6, s6 + + L_RESTORE_SGPR_LOOP: + read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) + s_wait_idle + + s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0] + s_nop 0 // hazard SALU M0=> S_MOVREL + + s_movreld_b64 s0, s0 //s[0+m0] = s0 + s_movreld_b64 s2, s2 + s_movreld_b64 s4, s4 + s_movreld_b64 s6, s6 + s_movreld_b64 s8, s8 + s_movreld_b64 s10, s10 + s_movreld_b64 s12, s12 + s_movreld_b64 s14, s14 + + s_cmp_eq_u32 m0, 0 //scc = (m0 < s_sgpr_save_num) ? 1 : 0 + s_cbranch_scc0 L_RESTORE_SGPR_LOOP + + // s_barrier with STATE_PRIV.TRAP_AFTER_INST=1, STATUS.PRIV=1 incorrectly asserts debug exception. + // Clear DEBUG_EN before and restore MODE after the barrier. + s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE), 0 + + /* restore HW registers */ +L_RESTORE_HWREG: + // HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR) + get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size) + get_svgpr_size_bytes(s_restore_tmp) + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes() + + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + + // Restore s_restore_spi_init_hi before the saved value gets clobbered. + s_mov_b32 s_restore_spi_init_hi, s_restore_spi_init_hi_save + + read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_state_priv, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_excp_flag_priv, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset) + s_wait_idle + + s_setreg_b32 hwreg(HW_REG_WAVE_SCRATCH_BASE_LO), s_restore_flat_scratch + + read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset) + s_wait_idle + + s_setreg_b32 hwreg(HW_REG_WAVE_SCRATCH_BASE_HI), s_restore_flat_scratch + + read_hwreg_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset) + s_wait_idle + s_setreg_b32 hwreg(HW_REG_WAVE_EXCP_FLAG_USER), s_restore_tmp + + read_hwreg_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset) + s_wait_idle + s_setreg_b32 hwreg(HW_REG_WAVE_TRAP_CTRL), s_restore_tmp + + // Only the first wave needs to restore the workgroup barrier. + s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK + s_cbranch_scc0 L_SKIP_BARRIER_RESTORE + + // Skip over WAVE_STATUS, since there is no state to restore from it + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 4 + + read_hwreg_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset) + s_wait_idle + + s_bitcmp1_b32 s_restore_tmp, BARRIER_STATE_VALID_OFFSET + s_cbranch_scc0 L_SKIP_BARRIER_RESTORE + + // extract the saved signal count from s_restore_tmp + s_lshr_b32 s_restore_tmp, s_restore_tmp, BARRIER_STATE_SIGNAL_OFFSET + + // We need to call s_barrier_signal repeatedly to restore the signal + // count of the work group barrier. The member count is already + // initialized with the number of waves in the work group. +L_BARRIER_RESTORE_LOOP: + s_and_b32 s_restore_tmp, s_restore_tmp, s_restore_tmp + s_cbranch_scc0 L_SKIP_BARRIER_RESTORE + s_barrier_signal -1 + s_add_i32 s_restore_tmp, s_restore_tmp, -1 + s_branch L_BARRIER_RESTORE_LOOP + +L_SKIP_BARRIER_RESTORE: + + s_mov_b32 m0, s_restore_m0 + s_mov_b32 exec_lo, s_restore_exec_lo + s_mov_b32 exec_hi, s_restore_exec_hi + + // EXCP_FLAG_PRIV.SAVE_CONTEXT and HOST_TRAP may have changed. + // Only restore the other fields to avoid clobbering them. + s_setreg_b32 hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV, 0, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_1_SIZE), s_restore_excp_flag_priv + s_lshr_b32 s_restore_excp_flag_priv, s_restore_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SHIFT + s_setreg_b32 hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SHIFT, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SIZE), s_restore_excp_flag_priv + s_lshr_b32 s_restore_excp_flag_priv, s_restore_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SHIFT - SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SHIFT + s_setreg_b32 hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SHIFT, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SIZE), s_restore_excp_flag_priv + + s_setreg_b32 hwreg(HW_REG_WAVE_MODE), s_restore_mode + + // Restore trap temporaries 4-11, 13 initialized by SPI debug dispatch logic + // ttmp SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)+0x40 + get_vgpr_size_bytes(s_restore_ttmps_lo, s_restore_size) + get_svgpr_size_bytes(s_restore_ttmps_hi) + s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_ttmps_hi + s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, get_sgpr_size_bytes() + s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_buf_rsrc0 + s_addc_u32 s_restore_ttmps_hi, s_restore_buf_rsrc1, 0x0 + s_and_b32 s_restore_ttmps_hi, s_restore_ttmps_hi, 0xFFFF + s_load_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x50 scope:SCOPE_SYS + s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x60 scope:SCOPE_SYS + s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x74 scope:SCOPE_SYS + s_wait_idle + + s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS + s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 + s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 + + s_setreg_b32 hwreg(HW_REG_WAVE_STATE_PRIV), s_restore_state_priv // SCC is included, which is changed by previous salu + + // Make barrier and LDS state visible to all waves in the group. + // STATE_PRIV.BARRIER_COMPLETE may change after this point. + s_barrier_signal -2 + s_barrier_wait -2 + + s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution + +L_END_PGM: + // Make sure that no wave of the workgroup can exit the trap handler + // before the workgroup barrier state is saved. + s_barrier_signal -2 + s_barrier_wait -2 + s_endpgm_saved +end + +function write_16sgpr_to_v2(s, lane_offset) + // Copy into VGPR for later TCP store. + for var sgpr_idx = 0; sgpr_idx < 16; sgpr_idx ++ + v_writelane_b32 v2, s[sgpr_idx], sgpr_idx + lane_offset + end + valu_sgpr_hazard() + s_add_u32 ttmp13, ttmp13, 0x10 +end + +function write_12sgpr_to_v2(s) + // Copy into VGPR for later TCP store. + for var sgpr_idx = 0; sgpr_idx < 12; sgpr_idx ++ + v_writelane_b32 v2, s[sgpr_idx], sgpr_idx + end + valu_sgpr_hazard() +end + +function read_hwreg_from_mem(s, s_rsrc, s_mem_offset) + s_buffer_load_dword s, s_rsrc, s_mem_offset scope:SCOPE_SYS + s_add_u32 s_mem_offset, s_mem_offset, 4 +end + +function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset) + s_sub_u32 s_mem_offset, s_mem_offset, 4*16 + s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset scope:SCOPE_SYS +end + +function read_8sgpr_from_mem(s, s_rsrc, s_mem_offset) + s_sub_u32 s_mem_offset, s_mem_offset, 4*8 + s_buffer_load_dwordx8 s, s_rsrc, s_mem_offset scope:SCOPE_SYS +end + +function read_4sgpr_from_mem(s, s_rsrc, s_mem_offset) + s_sub_u32 s_mem_offset, s_mem_offset, 4*4 + s_buffer_load_dwordx4 s, s_rsrc, s_mem_offset scope:SCOPE_SYS +end + +function get_vgpr_size_bytes(s_vgpr_size_byte, s_size) + s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_WAVE_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) + s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1 + s_bitcmp1_b32 s_size, S_WAVE_SIZE + s_cbranch_scc1 L_ENABLE_SHIFT_W64 + s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+7) //Number of VGPRs = (vgpr_size + 1) * 4 * 32 * 4 (non-zero value) + s_branch L_SHIFT_DONE +L_ENABLE_SHIFT_W64: + s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4 (non-zero value) +L_SHIFT_DONE: +end + +function get_svgpr_size_bytes(s_svgpr_size_byte) + s_getreg_b32 s_svgpr_size_byte, hwreg(HW_REG_WAVE_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE) + s_lshl_b32 s_svgpr_size_byte, s_svgpr_size_byte, (3+7) +end + +function get_sgpr_size_bytes + return 512 +end + +function get_hwreg_size_bytes + return 128 +end + +function get_wave_size2(s_reg) + s_getreg_b32 s_reg, hwreg(HW_REG_WAVE_STATUS,SQ_WAVE_STATUS_WAVE64_SHIFT,SQ_WAVE_STATUS_WAVE64_SIZE) + s_lshl_b32 s_reg, s_reg, S_WAVE_SIZE +end + +function valu_sgpr_hazard +#if HAVE_VALU_SGPR_HAZARD + for var rep = 0; rep < 8; rep ++ + ds_nop + end +#endif +end diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm index bb26338204f4..6869e07a2fff 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm @@ -37,17 +37,28 @@ * gc_9_4_3: * cpp -DASIC_FAMILY=GC_9_4_3 cwsr_trap_handler_gfx9.asm -P -o gc_9_4_3.sp3 * sp3 gc_9_4_3.sp3 -hex gc_9_4_3.hex + * + * gc_9_5_0: + * cpp -DASIC_FAMILY=GC_9_5_0 cwsr_trap_handler_gfx9.asm -P -o gc_9_5_0.sp3 + * sp3 gc_9_5_0.sp3 -hex gc_9_5_0.hex */ #define CHIP_VEGAM 18 #define CHIP_ARCTURUS 23 #define CHIP_ALDEBARAN 25 #define CHIP_GC_9_4_3 26 +#define CHIP_GC_9_5_0 27 var ACK_SQC_STORE = 1 //workaround for suspected SQC store bug causing incorrect stores under concurrency var SAVE_AFTER_XNACK_ERROR = 1 //workaround for TCP store failure after XNACK error when ALLOW_REPLAY=0, for debugger var SINGLE_STEP_MISSED_WORKAROUND = (ASIC_FAMILY <= CHIP_ALDEBARAN) //workaround for lost MODE.DEBUG_EN exception when SAVECTX raised +#if ASIC_FAMILY < CHIP_GC_9_4_3 +#define VMEM_MODIFIERS slc:1 glc:1 +#else +#define VMEM_MODIFIERS sc0:1 nt:1 +#endif + /**************************************************************************/ /* variables */ /**************************************************************************/ @@ -62,7 +73,13 @@ var SQ_WAVE_STATUS_ALLOW_REPLAY_MASK = 0x400000 var SQ_WAVE_STATUS_ECC_ERR_MASK = 0x20000 var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 +#if ASIC_FAMILY >= CHIP_GC_9_5_0 +var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 11 +var LDS_RESTORE_GRANULARITY_BYTES = 1280 +#else var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9 +var LDS_RESTORE_GRANULARITY_BYTES = 512 +#endif var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6 var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24 @@ -430,7 +447,9 @@ L_SAVE: s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) - + // Clear VSKIP state now that MODE.VSKIP has been saved. + // If user shader set it then vector instructions would be skipped. + s_setvskip 0,0 /* the first wave in the threadgroup */ s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK // extract fisrt wave bit @@ -557,12 +576,21 @@ if SAVE_AFTER_XNACK_ERROR v_lshlrev_b32 v2, 2, v3 L_SAVE_LDS_LOOP_SQC: +#if ASIC_FAMILY < CHIP_GC_9_5_0 ds_read2_b32 v[0:1], v2 offset0:0 offset1:0x40 s_waitcnt lgkmcnt(0) - write_vgprs_to_mem_with_sqc(v0, 2, s_save_buf_rsrc0, s_save_mem_offset) v_add_u32 v2, 0x200, v2 +#else + // gfx950 needs to save in multiple of 256 bytes. + ds_read_b32 v0, v2 + s_waitcnt lgkmcnt(0) + write_vgprs_to_mem_with_sqc(v0, 1, s_save_buf_rsrc0, s_save_mem_offset) + + v_add_u32 v2, 0x100, v2 +#endif + v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size s_cbranch_vccnz L_SAVE_LDS_LOOP_SQC @@ -581,11 +609,14 @@ end L_SAVE_LDS_LOOP_VECTOR: ds_read_b64 v[0:1], v2 //x =LDS[a], byte address s_waitcnt lgkmcnt(0) - buffer_store_dwordx2 v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset offen:1 glc:1 slc:1 + buffer_store_dwordx2 v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset VMEM_MODIFIERS offen:1 // s_waitcnt vmcnt(0) // v_add_u32 v2, vcc[0:1], v2, v3 v_add_u32 v2, v2, v3 v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size +#if ASIC_FAMILY >= CHIP_GC_9_5_0 + s_mov_b64 exec, vcc +#endif s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR // restore rsrc3 @@ -748,8 +779,13 @@ L_RESTORE: L_RESTORE_LDS_LOOP: buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256 // second 64DW - s_add_u32 m0, m0, 256*2 // 128 DW - s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW +#if ASIC_FAMILY >= CHIP_GC_9_5_0 + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:512 // third 64DW + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:768 // forth 64DW + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:1024 // fifth 64DW +#endif + s_add_u32 m0, m0, LDS_RESTORE_GRANULARITY_BYTES // 128/320 DW + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, LDS_RESTORE_GRANULARITY_BYTES //mem offset increased by 128/320 DW s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0 s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete? @@ -979,17 +1015,17 @@ L_TCP_STORE_CHECK_DONE: end function write_4vgprs_to_mem(s_rsrc, s_mem_offset) - buffer_store_dword v0, v0, s_rsrc, s_mem_offset slc:1 glc:1 - buffer_store_dword v1, v0, s_rsrc, s_mem_offset slc:1 glc:1 offset:256 - buffer_store_dword v2, v0, s_rsrc, s_mem_offset slc:1 glc:1 offset:256*2 - buffer_store_dword v3, v0, s_rsrc, s_mem_offset slc:1 glc:1 offset:256*3 + buffer_store_dword v0, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS + buffer_store_dword v1, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS offset:256 + buffer_store_dword v2, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS offset:256*2 + buffer_store_dword v3, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS offset:256*3 end function read_4vgprs_from_mem(s_rsrc, s_mem_offset) - buffer_load_dword v0, v0, s_rsrc, s_mem_offset slc:1 glc:1 - buffer_load_dword v1, v0, s_rsrc, s_mem_offset slc:1 glc:1 offset:256 - buffer_load_dword v2, v0, s_rsrc, s_mem_offset slc:1 glc:1 offset:256*2 - buffer_load_dword v3, v0, s_rsrc, s_mem_offset slc:1 glc:1 offset:256*3 + buffer_load_dword v0, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS + buffer_load_dword v1, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS offset:256 + buffer_load_dword v2, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS offset:256*2 + buffer_load_dword v3, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS offset:256*3 s_waitcnt vmcnt(0) end diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 80e90fdef291..065d87841459 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -36,7 +36,6 @@ #include <linux/mman.h> #include <linux/ptrace.h> #include <linux/dma-buf.h> -#include <linux/fdtable.h> #include <linux/processor.h> #include "kfd_priv.h" #include "kfd_device_queue_manager.h" @@ -63,8 +62,10 @@ static const struct file_operations kfd_fops = { }; static int kfd_char_dev_major = -1; -static struct class *kfd_class; struct device *kfd_device; +static const struct class kfd_class = { + .name = kfd_dev_name, +}; static inline struct kfd_process_device *kfd_lock_pdd_by_id(struct kfd_process *p, __u32 gpu_id) { @@ -94,14 +95,13 @@ int kfd_chardev_init(void) if (err < 0) goto err_register_chrdev; - kfd_class = class_create(kfd_dev_name); - err = PTR_ERR(kfd_class); - if (IS_ERR(kfd_class)) + err = class_register(&kfd_class); + if (err) goto err_class_create; - kfd_device = device_create(kfd_class, NULL, - MKDEV(kfd_char_dev_major, 0), - NULL, kfd_dev_name); + kfd_device = device_create(&kfd_class, NULL, + MKDEV(kfd_char_dev_major, 0), + NULL, kfd_dev_name); err = PTR_ERR(kfd_device); if (IS_ERR(kfd_device)) goto err_device_create; @@ -109,7 +109,7 @@ int kfd_chardev_init(void) return 0; err_device_create: - class_destroy(kfd_class); + class_unregister(&kfd_class); err_class_create: unregister_chrdev(kfd_char_dev_major, kfd_dev_name); err_register_chrdev: @@ -118,8 +118,8 @@ err_register_chrdev: void kfd_chardev_exit(void) { - device_destroy(kfd_class, MKDEV(kfd_char_dev_major, 0)); - class_destroy(kfd_class); + device_destroy(&kfd_class, MKDEV(kfd_char_dev_major, 0)); + class_unregister(&kfd_class); unregister_chrdev(kfd_char_dev_major, kfd_dev_name); kfd_device = NULL; } @@ -246,14 +246,15 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, q_properties->priority = args->queue_priority; q_properties->queue_address = args->ring_base_address; q_properties->queue_size = args->ring_size; - q_properties->read_ptr = (uint32_t *) args->read_pointer_address; - q_properties->write_ptr = (uint32_t *) args->write_pointer_address; + q_properties->read_ptr = (void __user *)args->read_pointer_address; + q_properties->write_ptr = (void __user *)args->write_pointer_address; q_properties->eop_ring_buffer_address = args->eop_buffer_address; q_properties->eop_ring_buffer_size = args->eop_buffer_size; q_properties->ctx_save_restore_area_address = args->ctx_save_restore_address; q_properties->ctx_save_restore_area_size = args->ctx_save_restore_size; q_properties->ctl_stack_size = args->ctl_stack_size; + q_properties->sdma_engine_id = args->sdma_engine_id; if (args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE || args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL) q_properties->type = KFD_QUEUE_TYPE_COMPUTE; @@ -261,6 +262,8 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, q_properties->type = KFD_QUEUE_TYPE_SDMA; else if (args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA_XGMI) q_properties->type = KFD_QUEUE_TYPE_SDMA_XGMI; + else if (args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA_BY_ENG_ID) + q_properties->type = KFD_QUEUE_TYPE_SDMA_BY_ENG_ID; else return -ENOTSUPP; @@ -305,7 +308,6 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, struct kfd_process_device *pdd; struct queue_properties q_properties; uint32_t doorbell_offset_in_process = 0; - struct amdgpu_bo *wptr_bo = NULL; memset(&q_properties, 0, sizeof(struct queue_properties)); @@ -333,6 +335,18 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, goto err_bind_process; } + if (q_properties.type == KFD_QUEUE_TYPE_SDMA_BY_ENG_ID) { + int max_sdma_eng_id = kfd_get_num_sdma_engines(dev) + + kfd_get_num_xgmi_sdma_engines(dev) - 1; + + if (q_properties.sdma_engine_id > max_sdma_eng_id) { + err = -EINVAL; + pr_err("sdma_engine_id %i exceeds maximum id of %i\n", + q_properties.sdma_engine_id, max_sdma_eng_id); + goto err_sdma_engine_id; + } + } + if (!pdd->qpd.proc_doorbells) { err = kfd_alloc_process_doorbells(dev->kfd, pdd); if (err) { @@ -341,48 +355,17 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, } } - /* Starting with GFX11, wptr BOs must be mapped to GART for MES to determine work - * on unmapped queues for usermode queue oversubscription (no aggregated doorbell) - */ - if (dev->kfd->shared_resources.enable_mes && - ((dev->adev->mes.sched_version & AMDGPU_MES_API_VERSION_MASK) - >> AMDGPU_MES_API_VERSION_SHIFT) >= 2) { - struct amdgpu_bo_va_mapping *wptr_mapping; - struct amdgpu_vm *wptr_vm; - - wptr_vm = drm_priv_to_vm(pdd->drm_priv); - err = amdgpu_bo_reserve(wptr_vm->root.bo, false); - if (err) - goto err_wptr_map_gart; - - wptr_mapping = amdgpu_vm_bo_lookup_mapping( - wptr_vm, args->write_pointer_address >> PAGE_SHIFT); - amdgpu_bo_unreserve(wptr_vm->root.bo); - if (!wptr_mapping) { - pr_err("Failed to lookup wptr bo\n"); - err = -EINVAL; - goto err_wptr_map_gart; - } - - wptr_bo = wptr_mapping->bo_va->base.bo; - if (wptr_bo->tbo.base.size > PAGE_SIZE) { - pr_err("Requested GART mapping for wptr bo larger than one page\n"); - err = -EINVAL; - goto err_wptr_map_gart; - } - - err = amdgpu_amdkfd_map_gtt_bo_to_gart(dev->adev, wptr_bo); - if (err) { - pr_err("Failed to map wptr bo to GART\n"); - goto err_wptr_map_gart; - } + err = kfd_queue_acquire_buffers(pdd, &q_properties); + if (err) { + pr_debug("failed to acquire user queue buffers\n"); + goto err_acquire_queue_buf; } pr_debug("Creating queue for PASID 0x%x on gpu 0x%x\n", p->pasid, dev->id); - err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, &queue_id, wptr_bo, + err = pqm_create_queue(&p->pqm, dev, &q_properties, &queue_id, NULL, NULL, NULL, &doorbell_offset_in_process); if (err != 0) goto err_create_queue; @@ -416,9 +399,10 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, return 0; err_create_queue: - if (wptr_bo) - amdgpu_amdkfd_free_gtt_mem(dev->adev, wptr_bo); -err_wptr_map_gart: + kfd_queue_unref_bo_vas(pdd, &q_properties); + kfd_queue_release_buffers(pdd, &q_properties); +err_acquire_queue_buf: +err_sdma_engine_id: err_bind_process: err_pdd: mutex_unlock(&p->mutex); @@ -778,8 +762,8 @@ static int kfd_ioctl_get_process_apertures_new(struct file *filp, * nodes, but not more than args->num_of_nodes as that is * the amount of memory allocated by user */ - pa = kzalloc((sizeof(struct kfd_process_device_apertures) * - args->num_of_nodes), GFP_KERNEL); + pa = kcalloc(args->num_of_nodes, sizeof(struct kfd_process_device_apertures), + GFP_KERNEL); if (!pa) return -ENOMEM; @@ -1138,7 +1122,7 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep, goto err_unlock; } offset = dev->adev->rmmio_remap.bus_addr; - if (!offset) { + if (!offset || (PAGE_SIZE > 4096)) { err = -ENOMEM; goto err_unlock; } @@ -1164,7 +1148,7 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep, if (flags & KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM) size >>= 1; - WRITE_ONCE(pdd->vram_usage, pdd->vram_usage + PAGE_ALIGN(size)); + atomic64_add(PAGE_ALIGN(size), &pdd->vram_usage); } mutex_unlock(&p->mutex); @@ -1235,7 +1219,7 @@ static int kfd_ioctl_free_memory_of_gpu(struct file *filep, kfd_process_device_remove_obj_handle( pdd, GET_IDR_HANDLE(args->handle)); - WRITE_ONCE(pdd->vram_usage, pdd->vram_usage - size); + atomic64_sub(size, &pdd->vram_usage); err_unlock: err_pdd: @@ -1416,8 +1400,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, err = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu( peer_pdd->dev->adev, (struct kgd_mem *)mem, peer_pdd->drm_priv); if (err) { - pr_err("Failed to unmap from gpu %d/%d\n", - i, args->n_devices); + pr_debug("Failed to unmap from gpu %d/%d\n", i, args->n_devices); goto unmap_memory_from_gpu_failed; } args->n_success = i+1; @@ -1522,7 +1505,7 @@ static int kfd_ioctl_get_dmabuf_info(struct file *filep, /* Find a KFD GPU device that supports the get_dmabuf_info query */ for (i = 0; kfd_topology_enum_kfd_devices(i, &dev) == 0; i++) - if (dev) + if (dev && !kfd_devcgroup_check_permission(dev)) break; if (!dev) return -EINVAL; @@ -1544,7 +1527,7 @@ static int kfd_ioctl_get_dmabuf_info(struct file *filep, if (xcp_id >= 0) args->gpu_id = dmabuf_adev->kfd.dev->nodes[xcp_id]->id; else - args->gpu_id = dmabuf_adev->kfd.dev->nodes[0]->id; + args->gpu_id = dev->id; args->flags = flags; /* Copy metadata buffer to user mode */ @@ -1851,7 +1834,8 @@ static uint32_t get_process_num_bos(struct kfd_process *p) } static int criu_get_prime_handle(struct kgd_mem *mem, - int flags, u32 *shared_fd) + int flags, u32 *shared_fd, + struct file **file) { struct dma_buf *dmabuf; int ret; @@ -1862,13 +1846,14 @@ static int criu_get_prime_handle(struct kgd_mem *mem, return ret; } - ret = dma_buf_fd(dmabuf, flags); + ret = get_unused_fd_flags(flags); if (ret < 0) { pr_err("dmabuf create fd failed, ret:%d\n", ret); goto out_free_dmabuf; } *shared_fd = ret; + *file = dmabuf->file; return 0; out_free_dmabuf: @@ -1876,6 +1861,25 @@ out_free_dmabuf: return ret; } +static void commit_files(struct file **files, + struct kfd_criu_bo_bucket *bo_buckets, + unsigned int count, + int err) +{ + while (count--) { + struct file *file = files[count]; + + if (!file) + continue; + if (err) { + fput(file); + put_unused_fd(bo_buckets[count].dmabuf_fd); + } else { + fd_install(bo_buckets[count].dmabuf_fd, file); + } + } +} + static int criu_checkpoint_bos(struct kfd_process *p, uint32_t num_bos, uint8_t __user *user_bos, @@ -1884,6 +1888,7 @@ static int criu_checkpoint_bos(struct kfd_process *p, { struct kfd_criu_bo_bucket *bo_buckets; struct kfd_criu_bo_priv_data *bo_privs; + struct file **files = NULL; int ret = 0, pdd_index, bo_index = 0, id; void *mem; @@ -1897,6 +1902,12 @@ static int criu_checkpoint_bos(struct kfd_process *p, goto exit; } + files = kvzalloc(num_bos * sizeof(struct file *), GFP_KERNEL); + if (!files) { + ret = -ENOMEM; + goto exit; + } + for (pdd_index = 0; pdd_index < p->n_pdds; pdd_index++) { struct kfd_process_device *pdd = p->pdds[pdd_index]; struct amdgpu_bo *dumper_bo; @@ -1907,11 +1918,6 @@ static int criu_checkpoint_bos(struct kfd_process *p, struct kfd_criu_bo_priv_data *bo_priv; int i, dev_idx = 0; - if (!mem) { - ret = -ENOMEM; - goto exit; - } - kgd_mem = (struct kgd_mem *)mem; dumper_bo = kgd_mem->bo; @@ -1944,7 +1950,7 @@ static int criu_checkpoint_bos(struct kfd_process *p, ret = criu_get_prime_handle(kgd_mem, bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE ? DRM_RDWR : 0, - &bo_bucket->dmabuf_fd); + &bo_bucket->dmabuf_fd, &files[bo_index]); if (ret) goto exit; } else { @@ -1962,7 +1968,7 @@ static int criu_checkpoint_bos(struct kfd_process *p, bo_bucket->offset = amdgpu_bo_mmap_offset(dumper_bo); for (i = 0; i < p->n_pdds; i++) { - if (amdgpu_amdkfd_bo_mapped_to_dev(p->pdds[i]->dev->adev, kgd_mem)) + if (amdgpu_amdkfd_bo_mapped_to_dev(p->pdds[i]->drm_priv, kgd_mem)) bo_priv->mapped_gpuids[dev_idx++] = p->pdds[i]->user_gpu_id; } @@ -1995,12 +2001,8 @@ static int criu_checkpoint_bos(struct kfd_process *p, *priv_offset += num_bos * sizeof(*bo_privs); exit: - while (ret && bo_index--) { - if (bo_buckets[bo_index].alloc_flags - & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) - close_fd(bo_buckets[bo_index].dmabuf_fd); - } - + commit_files(files, bo_buckets, bo_index, ret); + kvfree(files); kvfree(bo_buckets); kvfree(bo_privs); return ret; @@ -2306,7 +2308,7 @@ static int criu_restore_memory_of_gpu(struct kfd_process_device *pdd, return -EINVAL; } offset = pdd->dev->adev->rmmio_remap.bus_addr; - if (!offset) { + if (!offset || (PAGE_SIZE > 4096)) { pr_err("amdgpu_amdkfd_get_mmio_remap_phys_addr failed\n"); return -ENOMEM; } @@ -2345,14 +2347,15 @@ static int criu_restore_memory_of_gpu(struct kfd_process_device *pdd, } else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) { bo_bucket->restored_offset = offset; /* Update the VRAM usage count */ - WRITE_ONCE(pdd->vram_usage, pdd->vram_usage + bo_bucket->size); + atomic64_add(bo_bucket->size, &pdd->vram_usage); } return 0; } static int criu_restore_bo(struct kfd_process *p, struct kfd_criu_bo_bucket *bo_bucket, - struct kfd_criu_bo_priv_data *bo_priv) + struct kfd_criu_bo_priv_data *bo_priv, + struct file **file) { struct kfd_process_device *pdd; struct kgd_mem *kgd_mem; @@ -2404,7 +2407,7 @@ static int criu_restore_bo(struct kfd_process *p, if (bo_bucket->alloc_flags & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) { ret = criu_get_prime_handle(kgd_mem, DRM_RDWR, - &bo_bucket->dmabuf_fd); + &bo_bucket->dmabuf_fd, file); if (ret) return ret; } else { @@ -2421,6 +2424,7 @@ static int criu_restore_bos(struct kfd_process *p, { struct kfd_criu_bo_bucket *bo_buckets = NULL; struct kfd_criu_bo_priv_data *bo_privs = NULL; + struct file **files = NULL; int ret = 0; uint32_t i = 0; @@ -2434,6 +2438,12 @@ static int criu_restore_bos(struct kfd_process *p, if (!bo_buckets) return -ENOMEM; + files = kvzalloc(args->num_bos * sizeof(struct file *), GFP_KERNEL); + if (!files) { + ret = -ENOMEM; + goto exit; + } + ret = copy_from_user(bo_buckets, (void __user *)args->bos, args->num_bos * sizeof(*bo_buckets)); if (ret) { @@ -2459,7 +2469,7 @@ static int criu_restore_bos(struct kfd_process *p, /* Create and map new BOs */ for (; i < args->num_bos; i++) { - ret = criu_restore_bo(p, &bo_buckets[i], &bo_privs[i]); + ret = criu_restore_bo(p, &bo_buckets[i], &bo_privs[i], &files[i]); if (ret) { pr_debug("Failed to restore BO[%d] ret%d\n", i, ret); goto exit; @@ -2474,11 +2484,8 @@ static int criu_restore_bos(struct kfd_process *p, ret = -EFAULT; exit: - while (ret && i--) { - if (bo_buckets[i].alloc_flags - & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) - close_fd(bo_buckets[i].dmabuf_fd); - } + commit_files(files, bo_buckets, i, ret); + kvfree(files); kvfree(bo_buckets); kvfree(bo_privs); return ret; @@ -2935,6 +2942,7 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v if (IS_ERR_OR_NULL(target)) { pr_debug("Cannot find process PID %i to debug\n", args->pid); r = target ? PTR_ERR(target) : -ESRCH; + target = NULL; goto out; } @@ -3347,6 +3355,9 @@ static int kfd_mmio_mmap(struct kfd_node *dev, struct kfd_process *process, if (vma->vm_end - vma->vm_start != PAGE_SIZE) return -EINVAL; + if (PAGE_SIZE > 4096) + return -EINVAL; + address = dev->adev->rmmio_remap.bus_addr; vm_flags_set(vma, VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE | diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c index cd8e459201f1..693469c18c60 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c @@ -28,6 +28,7 @@ #include "kfd_topology.h" #include "amdgpu.h" #include "amdgpu_amdkfd.h" +#include "amdgpu_xgmi.h" /* GPU Processor ID base for dGPUs for which VCRAT needs to be created. * GPU processor ID are expressed with Bit[31]=1. @@ -55,6 +56,7 @@ static struct kfd_gpu_cache_info kaveri_cache_info[] = { /* TCP L1 Cache per CU */ .cache_size = 16, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -64,6 +66,7 @@ static struct kfd_gpu_cache_info kaveri_cache_info[] = { /* Scalar L1 Instruction Cache (in SQC module) per bank */ .cache_size = 16, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_INST_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -73,6 +76,7 @@ static struct kfd_gpu_cache_info kaveri_cache_info[] = { /* Scalar L1 Data Cache (in SQC module) per bank */ .cache_size = 8, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -88,6 +92,7 @@ static struct kfd_gpu_cache_info carrizo_cache_info[] = { /* TCP L1 Cache per CU */ .cache_size = 16, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -95,8 +100,9 @@ static struct kfd_gpu_cache_info carrizo_cache_info[] = { }, { /* Scalar L1 Instruction Cache (in SQC module) per bank */ - .cache_size = 8, + .cache_size = 32, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_INST_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -104,8 +110,9 @@ static struct kfd_gpu_cache_info carrizo_cache_info[] = { }, { /* Scalar L1 Data Cache (in SQC module) per bank. */ - .cache_size = 4, + .cache_size = 16, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -135,6 +142,7 @@ static struct kfd_gpu_cache_info vega10_cache_info[] = { /* TCP L1 Cache per CU */ .cache_size = 16, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -144,6 +152,7 @@ static struct kfd_gpu_cache_info vega10_cache_info[] = { /* Scalar L1 Instruction Cache per SQC */ .cache_size = 32, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_INST_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -153,6 +162,7 @@ static struct kfd_gpu_cache_info vega10_cache_info[] = { /* Scalar L1 Data Cache per SQC */ .cache_size = 16, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -162,6 +172,7 @@ static struct kfd_gpu_cache_info vega10_cache_info[] = { /* L2 Data Cache per GPU (Total Tex Cache) */ .cache_size = 4096, .cache_level = 2, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -174,6 +185,7 @@ static struct kfd_gpu_cache_info raven_cache_info[] = { /* TCP L1 Cache per CU */ .cache_size = 16, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -183,6 +195,7 @@ static struct kfd_gpu_cache_info raven_cache_info[] = { /* Scalar L1 Instruction Cache per SQC */ .cache_size = 32, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_INST_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -192,6 +205,7 @@ static struct kfd_gpu_cache_info raven_cache_info[] = { /* Scalar L1 Data Cache per SQC */ .cache_size = 16, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -201,6 +215,7 @@ static struct kfd_gpu_cache_info raven_cache_info[] = { /* L2 Data Cache per GPU (Total Tex Cache) */ .cache_size = 1024, .cache_level = 2, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -213,6 +228,7 @@ static struct kfd_gpu_cache_info renoir_cache_info[] = { /* TCP L1 Cache per CU */ .cache_size = 16, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -222,6 +238,7 @@ static struct kfd_gpu_cache_info renoir_cache_info[] = { /* Scalar L1 Instruction Cache per SQC */ .cache_size = 32, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_INST_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -231,6 +248,7 @@ static struct kfd_gpu_cache_info renoir_cache_info[] = { /* Scalar L1 Data Cache per SQC */ .cache_size = 16, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -240,6 +258,7 @@ static struct kfd_gpu_cache_info renoir_cache_info[] = { /* L2 Data Cache per GPU (Total Tex Cache) */ .cache_size = 1024, .cache_level = 2, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -252,6 +271,7 @@ static struct kfd_gpu_cache_info vega12_cache_info[] = { /* TCP L1 Cache per CU */ .cache_size = 16, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -261,6 +281,7 @@ static struct kfd_gpu_cache_info vega12_cache_info[] = { /* Scalar L1 Instruction Cache per SQC */ .cache_size = 32, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_INST_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -270,6 +291,7 @@ static struct kfd_gpu_cache_info vega12_cache_info[] = { /* Scalar L1 Data Cache per SQC */ .cache_size = 16, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -279,6 +301,7 @@ static struct kfd_gpu_cache_info vega12_cache_info[] = { /* L2 Data Cache per GPU (Total Tex Cache) */ .cache_size = 2048, .cache_level = 2, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -291,6 +314,7 @@ static struct kfd_gpu_cache_info vega20_cache_info[] = { /* TCP L1 Cache per CU */ .cache_size = 16, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -300,6 +324,7 @@ static struct kfd_gpu_cache_info vega20_cache_info[] = { /* Scalar L1 Instruction Cache per SQC */ .cache_size = 32, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_INST_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -309,6 +334,7 @@ static struct kfd_gpu_cache_info vega20_cache_info[] = { /* Scalar L1 Data Cache per SQC */ .cache_size = 16, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -318,6 +344,7 @@ static struct kfd_gpu_cache_info vega20_cache_info[] = { /* L2 Data Cache per GPU (Total Tex Cache) */ .cache_size = 8192, .cache_level = 2, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -330,6 +357,7 @@ static struct kfd_gpu_cache_info aldebaran_cache_info[] = { /* TCP L1 Cache per CU */ .cache_size = 16, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -339,6 +367,7 @@ static struct kfd_gpu_cache_info aldebaran_cache_info[] = { /* Scalar L1 Instruction Cache per SQC */ .cache_size = 32, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_INST_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -348,6 +377,7 @@ static struct kfd_gpu_cache_info aldebaran_cache_info[] = { /* Scalar L1 Data Cache per SQC */ .cache_size = 16, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -357,6 +387,7 @@ static struct kfd_gpu_cache_info aldebaran_cache_info[] = { /* L2 Data Cache per GPU (Total Tex Cache) */ .cache_size = 8192, .cache_level = 2, + .cache_line_size = 128, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -369,6 +400,7 @@ static struct kfd_gpu_cache_info navi10_cache_info[] = { /* TCP L1 Cache per CU */ .cache_size = 16, .cache_level = 1, + .cache_line_size = 128, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -378,6 +410,7 @@ static struct kfd_gpu_cache_info navi10_cache_info[] = { /* Scalar L1 Instruction Cache per SQC */ .cache_size = 32, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_INST_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -387,6 +420,7 @@ static struct kfd_gpu_cache_info navi10_cache_info[] = { /* Scalar L1 Data Cache per SQC */ .cache_size = 16, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -396,6 +430,7 @@ static struct kfd_gpu_cache_info navi10_cache_info[] = { /* GL1 Data Cache per SA */ .cache_size = 128, .cache_level = 1, + .cache_line_size = 128, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -405,6 +440,7 @@ static struct kfd_gpu_cache_info navi10_cache_info[] = { /* L2 Data Cache per GPU (Total Tex Cache) */ .cache_size = 4096, .cache_level = 2, + .cache_line_size = 128, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -417,6 +453,7 @@ static struct kfd_gpu_cache_info vangogh_cache_info[] = { /* TCP L1 Cache per CU */ .cache_size = 16, .cache_level = 1, + .cache_line_size = 128, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -426,6 +463,7 @@ static struct kfd_gpu_cache_info vangogh_cache_info[] = { /* Scalar L1 Instruction Cache per SQC */ .cache_size = 32, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_INST_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -435,6 +473,7 @@ static struct kfd_gpu_cache_info vangogh_cache_info[] = { /* Scalar L1 Data Cache per SQC */ .cache_size = 16, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -444,6 +483,7 @@ static struct kfd_gpu_cache_info vangogh_cache_info[] = { /* GL1 Data Cache per SA */ .cache_size = 128, .cache_level = 1, + .cache_line_size = 128, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -453,6 +493,7 @@ static struct kfd_gpu_cache_info vangogh_cache_info[] = { /* L2 Data Cache per GPU (Total Tex Cache) */ .cache_size = 1024, .cache_level = 2, + .cache_line_size = 128, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -465,6 +506,7 @@ static struct kfd_gpu_cache_info navi14_cache_info[] = { /* TCP L1 Cache per CU */ .cache_size = 16, .cache_level = 1, + .cache_line_size = 128, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -474,6 +516,7 @@ static struct kfd_gpu_cache_info navi14_cache_info[] = { /* Scalar L1 Instruction Cache per SQC */ .cache_size = 32, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_INST_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -483,6 +526,7 @@ static struct kfd_gpu_cache_info navi14_cache_info[] = { /* Scalar L1 Data Cache per SQC */ .cache_size = 16, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -492,6 +536,7 @@ static struct kfd_gpu_cache_info navi14_cache_info[] = { /* GL1 Data Cache per SA */ .cache_size = 128, .cache_level = 1, + .cache_line_size = 128, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -501,6 +546,7 @@ static struct kfd_gpu_cache_info navi14_cache_info[] = { /* L2 Data Cache per GPU (Total Tex Cache) */ .cache_size = 2048, .cache_level = 2, + .cache_line_size = 128, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -513,6 +559,7 @@ static struct kfd_gpu_cache_info sienna_cichlid_cache_info[] = { /* TCP L1 Cache per CU */ .cache_size = 16, .cache_level = 1, + .cache_line_size = 128, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -522,6 +569,7 @@ static struct kfd_gpu_cache_info sienna_cichlid_cache_info[] = { /* Scalar L1 Instruction Cache per SQC */ .cache_size = 32, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_INST_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -531,6 +579,7 @@ static struct kfd_gpu_cache_info sienna_cichlid_cache_info[] = { /* Scalar L1 Data Cache per SQC */ .cache_size = 16, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -540,6 +589,7 @@ static struct kfd_gpu_cache_info sienna_cichlid_cache_info[] = { /* GL1 Data Cache per SA */ .cache_size = 128, .cache_level = 1, + .cache_line_size = 128, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -549,6 +599,7 @@ static struct kfd_gpu_cache_info sienna_cichlid_cache_info[] = { /* L2 Data Cache per GPU (Total Tex Cache) */ .cache_size = 4096, .cache_level = 2, + .cache_line_size = 128, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -558,6 +609,7 @@ static struct kfd_gpu_cache_info sienna_cichlid_cache_info[] = { /* L3 Data Cache per GPU */ .cache_size = 128*1024, .cache_level = 3, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -570,6 +622,7 @@ static struct kfd_gpu_cache_info navy_flounder_cache_info[] = { /* TCP L1 Cache per CU */ .cache_size = 16, .cache_level = 1, + .cache_line_size = 128, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -579,6 +632,7 @@ static struct kfd_gpu_cache_info navy_flounder_cache_info[] = { /* Scalar L1 Instruction Cache per SQC */ .cache_size = 32, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_INST_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -588,6 +642,7 @@ static struct kfd_gpu_cache_info navy_flounder_cache_info[] = { /* Scalar L1 Data Cache per SQC */ .cache_size = 16, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -597,6 +652,7 @@ static struct kfd_gpu_cache_info navy_flounder_cache_info[] = { /* GL1 Data Cache per SA */ .cache_size = 128, .cache_level = 1, + .cache_line_size = 128, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -606,6 +662,7 @@ static struct kfd_gpu_cache_info navy_flounder_cache_info[] = { /* L2 Data Cache per GPU (Total Tex Cache) */ .cache_size = 3072, .cache_level = 2, + .cache_line_size = 128, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -615,6 +672,7 @@ static struct kfd_gpu_cache_info navy_flounder_cache_info[] = { /* L3 Data Cache per GPU */ .cache_size = 96*1024, .cache_level = 3, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -627,6 +685,7 @@ static struct kfd_gpu_cache_info dimgrey_cavefish_cache_info[] = { /* TCP L1 Cache per CU */ .cache_size = 16, .cache_level = 1, + .cache_line_size = 128, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -636,6 +695,7 @@ static struct kfd_gpu_cache_info dimgrey_cavefish_cache_info[] = { /* Scalar L1 Instruction Cache per SQC */ .cache_size = 32, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_INST_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -645,6 +705,7 @@ static struct kfd_gpu_cache_info dimgrey_cavefish_cache_info[] = { /* Scalar L1 Data Cache per SQC */ .cache_size = 16, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -654,6 +715,7 @@ static struct kfd_gpu_cache_info dimgrey_cavefish_cache_info[] = { /* GL1 Data Cache per SA */ .cache_size = 128, .cache_level = 1, + .cache_line_size = 128, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -663,6 +725,7 @@ static struct kfd_gpu_cache_info dimgrey_cavefish_cache_info[] = { /* L2 Data Cache per GPU (Total Tex Cache) */ .cache_size = 2048, .cache_level = 2, + .cache_line_size = 128, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -672,6 +735,7 @@ static struct kfd_gpu_cache_info dimgrey_cavefish_cache_info[] = { /* L3 Data Cache per GPU */ .cache_size = 32*1024, .cache_level = 3, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -684,6 +748,7 @@ static struct kfd_gpu_cache_info beige_goby_cache_info[] = { /* TCP L1 Cache per CU */ .cache_size = 16, .cache_level = 1, + .cache_line_size = 128, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -693,6 +758,7 @@ static struct kfd_gpu_cache_info beige_goby_cache_info[] = { /* Scalar L1 Instruction Cache per SQC */ .cache_size = 32, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_INST_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -702,6 +768,7 @@ static struct kfd_gpu_cache_info beige_goby_cache_info[] = { /* Scalar L1 Data Cache per SQC */ .cache_size = 16, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -711,6 +778,7 @@ static struct kfd_gpu_cache_info beige_goby_cache_info[] = { /* GL1 Data Cache per SA */ .cache_size = 128, .cache_level = 1, + .cache_line_size = 128, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -720,6 +788,7 @@ static struct kfd_gpu_cache_info beige_goby_cache_info[] = { /* L2 Data Cache per GPU (Total Tex Cache) */ .cache_size = 1024, .cache_level = 2, + .cache_line_size = 128, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -729,6 +798,7 @@ static struct kfd_gpu_cache_info beige_goby_cache_info[] = { /* L3 Data Cache per GPU */ .cache_size = 16*1024, .cache_level = 3, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -741,6 +811,7 @@ static struct kfd_gpu_cache_info yellow_carp_cache_info[] = { /* TCP L1 Cache per CU */ .cache_size = 16, .cache_level = 1, + .cache_line_size = 128, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -750,6 +821,7 @@ static struct kfd_gpu_cache_info yellow_carp_cache_info[] = { /* Scalar L1 Instruction Cache per SQC */ .cache_size = 32, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_INST_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -759,6 +831,7 @@ static struct kfd_gpu_cache_info yellow_carp_cache_info[] = { /* Scalar L1 Data Cache per SQC */ .cache_size = 16, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -768,6 +841,7 @@ static struct kfd_gpu_cache_info yellow_carp_cache_info[] = { /* GL1 Data Cache per SA */ .cache_size = 128, .cache_level = 1, + .cache_line_size = 128, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -777,6 +851,7 @@ static struct kfd_gpu_cache_info yellow_carp_cache_info[] = { /* L2 Data Cache per GPU (Total Tex Cache) */ .cache_size = 2048, .cache_level = 2, + .cache_line_size = 128, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -789,6 +864,7 @@ static struct kfd_gpu_cache_info gfx1037_cache_info[] = { /* TCP L1 Cache per CU */ .cache_size = 16, .cache_level = 1, + .cache_line_size = 128, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -798,6 +874,7 @@ static struct kfd_gpu_cache_info gfx1037_cache_info[] = { /* Scalar L1 Instruction Cache per SQC */ .cache_size = 32, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_INST_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -807,6 +884,7 @@ static struct kfd_gpu_cache_info gfx1037_cache_info[] = { /* Scalar L1 Data Cache per SQC */ .cache_size = 16, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -816,6 +894,7 @@ static struct kfd_gpu_cache_info gfx1037_cache_info[] = { /* GL1 Data Cache per SA */ .cache_size = 128, .cache_level = 1, + .cache_line_size = 128, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -825,6 +904,7 @@ static struct kfd_gpu_cache_info gfx1037_cache_info[] = { /* L2 Data Cache per GPU (Total Tex Cache) */ .cache_size = 256, .cache_level = 2, + .cache_line_size = 128, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -837,6 +917,7 @@ static struct kfd_gpu_cache_info gc_10_3_6_cache_info[] = { /* TCP L1 Cache per CU */ .cache_size = 16, .cache_level = 1, + .cache_line_size = 128, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -846,6 +927,7 @@ static struct kfd_gpu_cache_info gc_10_3_6_cache_info[] = { /* Scalar L1 Instruction Cache per SQC */ .cache_size = 32, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_INST_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -855,6 +937,7 @@ static struct kfd_gpu_cache_info gc_10_3_6_cache_info[] = { /* Scalar L1 Data Cache per SQC */ .cache_size = 16, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -864,6 +947,7 @@ static struct kfd_gpu_cache_info gc_10_3_6_cache_info[] = { /* GL1 Data Cache per SA */ .cache_size = 128, .cache_level = 1, + .cache_line_size = 128, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -873,6 +957,7 @@ static struct kfd_gpu_cache_info gc_10_3_6_cache_info[] = { /* L2 Data Cache per GPU (Total Tex Cache) */ .cache_size = 256, .cache_level = 2, + .cache_line_size = 128, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -885,6 +970,7 @@ static struct kfd_gpu_cache_info dummy_cache_info[] = { /* TCP L1 Cache per CU */ .cache_size = 16, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -894,6 +980,7 @@ static struct kfd_gpu_cache_info dummy_cache_info[] = { /* Scalar L1 Instruction Cache per SQC */ .cache_size = 32, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_INST_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -903,6 +990,7 @@ static struct kfd_gpu_cache_info dummy_cache_info[] = { /* Scalar L1 Data Cache per SQC */ .cache_size = 16, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -912,6 +1000,7 @@ static struct kfd_gpu_cache_info dummy_cache_info[] = { /* GL1 Data Cache per SA */ .cache_size = 128, .cache_level = 1, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -921,6 +1010,7 @@ static struct kfd_gpu_cache_info dummy_cache_info[] = { /* L2 Data Cache per GPU (Total Tex Cache) */ .cache_size = 2048, .cache_level = 2, + .cache_line_size = 64, .flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE), @@ -1333,6 +1423,7 @@ err: static int kfd_fill_gpu_cache_info_from_gfx_config(struct kfd_dev *kdev, + bool cache_line_size_missing, struct kfd_gpu_cache_info *pcache_info) { struct amdgpu_device *adev = kdev->adev; @@ -1345,7 +1436,10 @@ static int kfd_fill_gpu_cache_info_from_gfx_config(struct kfd_dev *kdev, pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE); - pcache_info[0].num_cu_shared = adev->gfx.config.gc_num_tcp_per_wpg / 2; + pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_tcp_per_wpg / 2; + pcache_info[i].cache_line_size = adev->gfx.config.gc_tcp_cache_line_size; + if (cache_line_size_missing && !pcache_info[i].cache_line_size) + pcache_info[i].cache_line_size = 128; i++; } /* Scalar L1 Instruction Cache per SQC */ @@ -1357,6 +1451,9 @@ static int kfd_fill_gpu_cache_info_from_gfx_config(struct kfd_dev *kdev, CRAT_CACHE_FLAGS_INST_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE); pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_sqc_per_wgp * 2; + pcache_info[i].cache_line_size = adev->gfx.config.gc_instruction_cache_line_size; + if (cache_line_size_missing && !pcache_info[i].cache_line_size) + pcache_info[i].cache_line_size = 128; i++; } /* Scalar L1 Data Cache per SQC */ @@ -1367,6 +1464,9 @@ static int kfd_fill_gpu_cache_info_from_gfx_config(struct kfd_dev *kdev, CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE); pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_sqc_per_wgp * 2; + pcache_info[i].cache_line_size = adev->gfx.config.gc_scalar_data_cache_line_size; + if (cache_line_size_missing && !pcache_info[i].cache_line_size) + pcache_info[i].cache_line_size = 64; i++; } /* GL1 Data Cache per SA */ @@ -1379,6 +1479,8 @@ static int kfd_fill_gpu_cache_info_from_gfx_config(struct kfd_dev *kdev, CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE); pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh; + if (cache_line_size_missing) + pcache_info[i].cache_line_size = 128; i++; } /* L2 Data Cache per GPU (Total Tex Cache) */ @@ -1389,6 +1491,9 @@ static int kfd_fill_gpu_cache_info_from_gfx_config(struct kfd_dev *kdev, CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE); pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh; + pcache_info[i].cache_line_size = adev->gfx.config.gc_tcc_cache_line_size; + if (cache_line_size_missing && !pcache_info[i].cache_line_size) + pcache_info[i].cache_line_size = 128; i++; } /* L3 Data Cache per GPU */ @@ -1399,6 +1504,7 @@ static int kfd_fill_gpu_cache_info_from_gfx_config(struct kfd_dev *kdev, CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE); pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh; + pcache_info[i].cache_line_size = 64; i++; } return i; @@ -1414,6 +1520,8 @@ static int kfd_fill_gpu_cache_info_from_gfx_config_v2(struct kfd_dev *kdev, if (adev->gfx.config.gc_tcp_size_per_cu) { pcache_info[i].cache_size = adev->gfx.config.gc_tcp_size_per_cu; pcache_info[i].cache_level = 1; + /* Cacheline size not available in IP discovery for gc943,gc944 */ + pcache_info[i].cache_line_size = 128; pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE); @@ -1425,6 +1533,7 @@ static int kfd_fill_gpu_cache_info_from_gfx_config_v2(struct kfd_dev *kdev, pcache_info[i].cache_size = adev->gfx.config.gc_l1_instruction_cache_size_per_sqc; pcache_info[i].cache_level = 1; + pcache_info[i].cache_line_size = 64; pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_INST_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE); @@ -1435,6 +1544,7 @@ static int kfd_fill_gpu_cache_info_from_gfx_config_v2(struct kfd_dev *kdev, if (adev->gfx.config.gc_l1_data_cache_size_per_sqc) { pcache_info[i].cache_size = adev->gfx.config.gc_l1_data_cache_size_per_sqc; pcache_info[i].cache_level = 1; + pcache_info[i].cache_line_size = 64; pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE); @@ -1445,6 +1555,7 @@ static int kfd_fill_gpu_cache_info_from_gfx_config_v2(struct kfd_dev *kdev, if (adev->gfx.config.gc_tcc_size) { pcache_info[i].cache_size = adev->gfx.config.gc_tcc_size; pcache_info[i].cache_level = 2; + pcache_info[i].cache_line_size = 128; pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE); @@ -1455,6 +1566,7 @@ static int kfd_fill_gpu_cache_info_from_gfx_config_v2(struct kfd_dev *kdev, if (adev->gmc.mall_size) { pcache_info[i].cache_size = adev->gmc.mall_size / 1024; pcache_info[i].cache_level = 3; + pcache_info[i].cache_line_size = 64; pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE); @@ -1467,6 +1579,7 @@ static int kfd_fill_gpu_cache_info_from_gfx_config_v2(struct kfd_dev *kdev, int kfd_get_gpu_cache_info(struct kfd_node *kdev, struct kfd_gpu_cache_info **pcache_info) { int num_of_cache_types = 0; + bool cache_line_size_missing = false; switch (kdev->adev->asic_type) { case CHIP_KAVERI: @@ -1525,6 +1638,8 @@ int kfd_get_gpu_cache_info(struct kfd_node *kdev, struct kfd_gpu_cache_info **pc num_of_cache_types = ARRAY_SIZE(aldebaran_cache_info); break; case IP_VERSION(9, 4, 3): + case IP_VERSION(9, 4, 4): + case IP_VERSION(9, 5, 0): num_of_cache_types = kfd_fill_gpu_cache_info_from_gfx_config_v2(kdev->kfd, *pcache_info); @@ -1587,8 +1702,19 @@ int kfd_get_gpu_cache_info(struct kfd_node *kdev, struct kfd_gpu_cache_info **pc case IP_VERSION(11, 0, 3): case IP_VERSION(11, 0, 4): case IP_VERSION(11, 5, 0): + case IP_VERSION(11, 5, 1): + case IP_VERSION(11, 5, 2): + /* Cacheline size not available in IP discovery for gc11. + * kfd_fill_gpu_cache_info_from_gfx_config to hard code it + */ + cache_line_size_missing = true; + fallthrough; + case IP_VERSION(12, 0, 0): + case IP_VERSION(12, 0, 1): num_of_cache_types = - kfd_fill_gpu_cache_info_from_gfx_config(kdev->kfd, *pcache_info); + kfd_fill_gpu_cache_info_from_gfx_config(kdev->kfd, + cache_line_size_missing, + *pcache_info); break; default: *pcache_info = dummy_cache_info; @@ -2120,9 +2246,6 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image, * Modify length and total_entries as subunits are added. */ avail_size -= sizeof(struct crat_header); - if (avail_size < 0) - return -ENOMEM; - memset(crat_table, 0, sizeof(struct crat_header)); memcpy(&crat_table->signature, CRAT_SIGNATURE, @@ -2136,9 +2259,6 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image, * First fill in the sub type header and then sub type data */ avail_size -= sizeof(struct crat_subtype_computeunit); - if (avail_size < 0) - return -ENOMEM; - sub_type_hdr = (struct crat_subtype_generic *)(crat_table + 1); memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit)); @@ -2235,6 +2355,8 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image, continue; if (peer_dev->gpu->kfd->hive_id != kdev->kfd->hive_id) continue; + if (!amdgpu_xgmi_get_is_sharing_enabled(kdev->adev, peer_dev->gpu->adev)) + continue; sub_type_hdr = (typeof(sub_type_hdr))( (char *)sub_type_hdr + sizeof(struct crat_subtype_iolink)); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h index 74c2d7a0d628..a8ca7ecb6d27 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h @@ -42,8 +42,6 @@ #define CRAT_OEMTABLEID_LENGTH 8 #define CRAT_RESERVED_LENGTH 6 -#define CRAT_OEMID_64BIT_MASK ((1ULL << (CRAT_OEMID_LENGTH * 8)) - 1) - /* Compute Unit flags */ #define COMPUTE_UNIT_CPU (1 << 0) /* Create Virtual CRAT for CPU */ #define COMPUTE_UNIT_GPU (1 << 1) /* Create Virtual CRAT for GPU */ @@ -303,6 +301,7 @@ struct kfd_node; struct kfd_gpu_cache_info { uint32_t cache_size; uint32_t cache_level; + uint32_t cache_line_size; uint32_t flags; /* Indicates how many Compute Units share this cache * within a SA. Value = 1 indicates the cache is not shared diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c index 9ec750666382..a8abc3091801 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c @@ -25,6 +25,7 @@ #include "kfd_topology.h" #include <linux/file.h> #include <uapi/linux/kfd_ioctl.h> +#include <uapi/linux/kfd_sysfs.h> #define MAX_WATCH_ADDRESSES 4 @@ -103,7 +104,8 @@ void debug_event_write_work_handler(struct work_struct *work) struct kfd_process, debug_event_workarea); - kernel_write(process->dbg_ev_file, &write_data, 1, &pos); + if (process->debug_trap_enabled && process->dbg_ev_file) + kernel_write(process->dbg_ev_file, &write_data, 1, &pos); } /* update process/device/queue exception status, write to descriptor @@ -348,10 +350,27 @@ int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd, bool sq_trap_en) { uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode; uint32_t flags = pdd->process->dbg_flags; + struct amdgpu_device *adev = pdd->dev->adev; + int r; if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) return 0; + if (!pdd->proc_ctx_cpu_ptr) { + r = amdgpu_amdkfd_alloc_gtt_mem(adev, + AMDGPU_MES_PROC_CTX_SIZE, + &pdd->proc_ctx_bo, + &pdd->proc_ctx_gpu_addr, + &pdd->proc_ctx_cpu_ptr, + false); + if (r) { + dev_err(adev->dev, + "failed to allocate process context bo\n"); + return r; + } + memset(pdd->proc_ctx_cpu_ptr, 0, AMDGPU_MES_PROC_CTX_SIZE); + } + return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd->proc_ctx_gpu_addr, spi_dbg_cntl, pdd->watch_points, flags, sq_trap_en); } @@ -363,47 +382,47 @@ static int kfd_dbg_get_dev_watch_id(struct kfd_process_device *pdd, int *watch_i *watch_id = KFD_DEBUGGER_INVALID_WATCH_POINT_ID; - spin_lock(&pdd->dev->kfd->watch_points_lock); + spin_lock(&pdd->dev->watch_points_lock); for (i = 0; i < MAX_WATCH_ADDRESSES; i++) { /* device watchpoint in use so skip */ - if ((pdd->dev->kfd->alloc_watch_ids >> i) & 0x1) + if ((pdd->dev->alloc_watch_ids >> i) & 0x1) continue; pdd->alloc_watch_ids |= 0x1 << i; - pdd->dev->kfd->alloc_watch_ids |= 0x1 << i; + pdd->dev->alloc_watch_ids |= 0x1 << i; *watch_id = i; - spin_unlock(&pdd->dev->kfd->watch_points_lock); + spin_unlock(&pdd->dev->watch_points_lock); return 0; } - spin_unlock(&pdd->dev->kfd->watch_points_lock); + spin_unlock(&pdd->dev->watch_points_lock); return -ENOMEM; } static void kfd_dbg_clear_dev_watch_id(struct kfd_process_device *pdd, int watch_id) { - spin_lock(&pdd->dev->kfd->watch_points_lock); + spin_lock(&pdd->dev->watch_points_lock); /* process owns device watch point so safe to clear */ if ((pdd->alloc_watch_ids >> watch_id) & 0x1) { pdd->alloc_watch_ids &= ~(0x1 << watch_id); - pdd->dev->kfd->alloc_watch_ids &= ~(0x1 << watch_id); + pdd->dev->alloc_watch_ids &= ~(0x1 << watch_id); } - spin_unlock(&pdd->dev->kfd->watch_points_lock); + spin_unlock(&pdd->dev->watch_points_lock); } static bool kfd_dbg_owns_dev_watch_id(struct kfd_process_device *pdd, int watch_id) { bool owns_watch_id = false; - spin_lock(&pdd->dev->kfd->watch_points_lock); + spin_lock(&pdd->dev->watch_points_lock); owns_watch_id = watch_id < MAX_WATCH_ADDRESSES && ((pdd->alloc_watch_ids >> watch_id) & 0x1); - spin_unlock(&pdd->dev->kfd->watch_points_lock); + spin_unlock(&pdd->dev->watch_points_lock); return owns_watch_id; } @@ -497,14 +516,24 @@ int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags) int i, r = 0, rewind_count = 0; for (i = 0; i < target->n_pdds; i++) { - if (!kfd_dbg_is_per_vmid_supported(target->pdds[i]->dev) && + struct kfd_topology_device *topo_dev = + kfd_topology_device_by_id(target->pdds[i]->dev->id); + uint32_t caps = topo_dev->node_props.capability; + + if (!(caps & HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED) && (*flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) { *flags = prev_flags; return -EACCES; } + + if (!(caps & HSA_CAP_TRAP_DEBUG_PRECISE_ALU_OPERATIONS_SUPPORTED) && + (*flags & KFD_DBG_TRAP_FLAG_SINGLE_ALU_OP)) { + *flags = prev_flags; + return -EACCES; + } } - target->dbg_flags = *flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP; + target->dbg_flags = *flags; *flags = prev_flags; for (i = 0; i < target->n_pdds; i++) { struct kfd_process_device *pdd = target->pdds[i]; @@ -645,6 +674,7 @@ int kfd_dbg_trap_disable(struct kfd_process *target) else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED) target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED; + cancel_work_sync(&target->debug_event_workarea); fput(target->dbg_ev_file); target->dbg_ev_file = NULL; @@ -1018,12 +1048,14 @@ int kfd_dbg_trap_device_snapshot(struct kfd_process *target, uint32_t *entry_size) { struct kfd_dbg_device_info_entry device_info; - uint32_t tmp_entry_size = *entry_size, tmp_num_devices; + uint32_t tmp_entry_size, tmp_num_devices; int i, r = 0; if (!(target && user_info && number_of_device_infos && entry_size)) return -EINVAL; + tmp_entry_size = *entry_size; + tmp_num_devices = min_t(size_t, *number_of_device_infos, target->n_pdds); *number_of_device_infos = target->n_pdds; *entry_size = min_t(size_t, *entry_size, sizeof(device_info)); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h index fd0ff64d4184..27aa1a5b120f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h @@ -78,6 +78,8 @@ static inline bool kfd_dbg_is_per_vmid_supported(struct kfd_node *dev) { return (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 2) || KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 3) || + KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 4) || + KFD_GC_VERSION(dev) == IP_VERSION(9, 5, 0) || KFD_GC_VERSION(dev) >= IP_VERSION(11, 0, 0)); } @@ -134,6 +136,7 @@ static inline bool kfd_dbg_has_ttmps_always_setup(struct kfd_node *dev) KFD_GC_VERSION(dev) != IP_VERSION(9, 4, 2)) || (KFD_GC_VERSION(dev) >= IP_VERSION(11, 0, 0) && KFD_GC_VERSION(dev) < IP_VERSION(12, 0, 0) && - (dev->adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= 70); + (dev->adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= 70) || + (KFD_GC_VERSION(dev) >= IP_VERSION(12, 0, 0)); } #endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 0a9cf9dfc224..a29374c86405 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -56,6 +56,7 @@ extern const struct kfd2kgd_calls gc_9_4_3_kfd2kgd; extern const struct kfd2kgd_calls gfx_v10_kfd2kgd; extern const struct kfd2kgd_calls gfx_v10_3_kfd2kgd; extern const struct kfd2kgd_calls gfx_v11_kfd2kgd; +extern const struct kfd2kgd_calls gfx_v12_kfd2kgd; static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, unsigned int chunk_size); @@ -83,6 +84,8 @@ static void kfd_device_info_set_sdma_info(struct kfd_dev *kfd) case IP_VERSION(4, 2, 2):/* ARCTURUS */ case IP_VERSION(4, 4, 0):/* ALDEBARAN */ case IP_VERSION(4, 4, 2): + case IP_VERSION(4, 4, 5): + case IP_VERSION(4, 4, 4): case IP_VERSION(5, 0, 0):/* NAVI10 */ case IP_VERSION(5, 0, 1):/* CYAN_SKILLFISH */ case IP_VERSION(5, 0, 2):/* NAVI14 */ @@ -96,6 +99,10 @@ static void kfd_device_info_set_sdma_info(struct kfd_dev *kfd) case IP_VERSION(6, 0, 2): case IP_VERSION(6, 0, 3): case IP_VERSION(6, 1, 0): + case IP_VERSION(6, 1, 1): + case IP_VERSION(6, 1, 2): + case IP_VERSION(7, 0, 0): + case IP_VERSION(7, 0, 1): kfd->device_info.num_sdma_queues_per_engine = 8; break; default: @@ -113,6 +120,10 @@ static void kfd_device_info_set_sdma_info(struct kfd_dev *kfd) case IP_VERSION(6, 0, 2): case IP_VERSION(6, 0, 3): case IP_VERSION(6, 1, 0): + case IP_VERSION(6, 1, 1): + case IP_VERSION(6, 1, 2): + case IP_VERSION(7, 0, 0): + case IP_VERSION(7, 0, 1): /* Reserve 1 for paging and 1 for gfx */ kfd->device_info.num_reserved_sdma_queues_per_engine = 2; /* BIT(0)=engine-0 queue-0; BIT(1)=engine-1 queue-0; BIT(2)=engine-0 queue-1; ... */ @@ -141,6 +152,8 @@ static void kfd_device_info_set_event_interrupt_class(struct kfd_dev *kfd) kfd->device_info.event_interrupt_class = &event_interrupt_class_v9; break; case IP_VERSION(9, 4, 3): /* GC 9.4.3 */ + case IP_VERSION(9, 4, 4): /* GC 9.4.4 */ + case IP_VERSION(9, 5, 0): /* GC 9.5.0 */ kfd->device_info.event_interrupt_class = &event_interrupt_class_v9_4_3; break; @@ -165,6 +178,13 @@ static void kfd_device_info_set_event_interrupt_class(struct kfd_dev *kfd) case IP_VERSION(11, 0, 3): case IP_VERSION(11, 0, 4): case IP_VERSION(11, 5, 0): + case IP_VERSION(11, 5, 1): + case IP_VERSION(11, 5, 2): + kfd->device_info.event_interrupt_class = &event_interrupt_class_v11; + break; + case IP_VERSION(12, 0, 0): + case IP_VERSION(12, 0, 1): + /* GFX12_TODO: Change to v12 version. */ kfd->device_info.event_interrupt_class = &event_interrupt_class_v11; break; default: @@ -217,6 +237,11 @@ static void kfd_device_info_init(struct kfd_dev *kfd, */ kfd->device_info.needs_pci_atomics = true; kfd->device_info.no_atomic_fw_version = kfd->adev->gfx.rs64_enable ? 509 : 0; + } else if (gc_version < IP_VERSION(13, 0, 0)) { + kfd->device_info.needs_pci_atomics = true; + kfd->device_info.no_atomic_fw_version = 2090; + } else { + kfd->device_info.needs_pci_atomics = true; } } else { kfd->device_info.doorbell_size = 4; @@ -329,6 +354,14 @@ struct kfd_dev *kgd2kfd_probe(struct amdgpu_device *adev, bool vf) : 90401; f2g = &gc_9_4_3_kfd2kgd; break; + case IP_VERSION(9, 4, 4): + gfx_target_version = 90402; + f2g = &gc_9_4_3_kfd2kgd; + break; + case IP_VERSION(9, 5, 0): + gfx_target_version = 90500; + f2g = &gc_9_4_3_kfd2kgd; + break; /* Navi10 */ case IP_VERSION(10, 1, 10): gfx_target_version = 100100; @@ -405,21 +438,30 @@ struct kfd_dev *kgd2kfd_probe(struct amdgpu_device *adev, bool vf) f2g = &gfx_v11_kfd2kgd; break; case IP_VERSION(11, 0, 3): - if ((adev->pdev->device == 0x7460 && - adev->pdev->revision == 0x00) || - (adev->pdev->device == 0x7461 && - adev->pdev->revision == 0x00)) - /* Note: Compiler version is 11.0.5 while HW version is 11.0.3 */ - gfx_target_version = 110005; - else - /* Note: Compiler version is 11.0.1 while HW version is 11.0.3 */ - gfx_target_version = 110001; + /* Note: Compiler version is 11.0.1 while HW version is 11.0.3 */ + gfx_target_version = 110001; f2g = &gfx_v11_kfd2kgd; break; case IP_VERSION(11, 5, 0): gfx_target_version = 110500; f2g = &gfx_v11_kfd2kgd; break; + case IP_VERSION(11, 5, 1): + gfx_target_version = 110501; + f2g = &gfx_v11_kfd2kgd; + break; + case IP_VERSION(11, 5, 2): + gfx_target_version = 110502; + f2g = &gfx_v11_kfd2kgd; + break; + case IP_VERSION(12, 0, 0): + gfx_target_version = 120000; + f2g = &gfx_v12_kfd2kgd; + break; + case IP_VERSION(12, 0, 1): + gfx_target_version = 120001; + f2g = &gfx_v12_kfd2kgd; + break; default: break; } @@ -428,12 +470,12 @@ struct kfd_dev *kgd2kfd_probe(struct amdgpu_device *adev, bool vf) if (!f2g) { if (amdgpu_ip_version(adev, GC_HWIP, 0)) - dev_err(kfd_device, + dev_info(kfd_device, "GC IP %06x %s not supported in kfd\n", amdgpu_ip_version(adev, GC_HWIP, 0), vf ? "VF" : ""); else - dev_err(kfd_device, "%s %s not supported in kfd\n", + dev_info(kfd_device, "%s %s not supported in kfd\n", amdgpu_asic_name[adev->asic_type], vf ? "VF" : ""); return NULL; } @@ -459,37 +501,56 @@ static void kfd_cwsr_init(struct kfd_dev *kfd) { if (cwsr_enable && kfd->device_info.supports_cwsr) { if (KFD_GC_VERSION(kfd) < IP_VERSION(9, 0, 1)) { - BUILD_BUG_ON(sizeof(cwsr_trap_gfx8_hex) > PAGE_SIZE); + BUILD_BUG_ON(sizeof(cwsr_trap_gfx8_hex) + > KFD_CWSR_TMA_OFFSET); kfd->cwsr_isa = cwsr_trap_gfx8_hex; kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx8_hex); } else if (KFD_GC_VERSION(kfd) == IP_VERSION(9, 4, 1)) { - BUILD_BUG_ON(sizeof(cwsr_trap_arcturus_hex) > PAGE_SIZE); + BUILD_BUG_ON(sizeof(cwsr_trap_arcturus_hex) + > KFD_CWSR_TMA_OFFSET); kfd->cwsr_isa = cwsr_trap_arcturus_hex; kfd->cwsr_isa_size = sizeof(cwsr_trap_arcturus_hex); } else if (KFD_GC_VERSION(kfd) == IP_VERSION(9, 4, 2)) { - BUILD_BUG_ON(sizeof(cwsr_trap_aldebaran_hex) > PAGE_SIZE); + BUILD_BUG_ON(sizeof(cwsr_trap_aldebaran_hex) + > KFD_CWSR_TMA_OFFSET); kfd->cwsr_isa = cwsr_trap_aldebaran_hex; kfd->cwsr_isa_size = sizeof(cwsr_trap_aldebaran_hex); - } else if (KFD_GC_VERSION(kfd) == IP_VERSION(9, 4, 3)) { - BUILD_BUG_ON(sizeof(cwsr_trap_gfx9_4_3_hex) > PAGE_SIZE); + } else if (KFD_GC_VERSION(kfd) == IP_VERSION(9, 4, 3) || + KFD_GC_VERSION(kfd) == IP_VERSION(9, 4, 4)) { + BUILD_BUG_ON(sizeof(cwsr_trap_gfx9_4_3_hex) + > KFD_CWSR_TMA_OFFSET); kfd->cwsr_isa = cwsr_trap_gfx9_4_3_hex; kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx9_4_3_hex); + } else if (KFD_GC_VERSION(kfd) == IP_VERSION(9, 5, 0)) { + BUILD_BUG_ON(sizeof(cwsr_trap_gfx9_5_0_hex) > PAGE_SIZE); + kfd->cwsr_isa = cwsr_trap_gfx9_5_0_hex; + kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx9_5_0_hex); } else if (KFD_GC_VERSION(kfd) < IP_VERSION(10, 1, 1)) { - BUILD_BUG_ON(sizeof(cwsr_trap_gfx9_hex) > PAGE_SIZE); + BUILD_BUG_ON(sizeof(cwsr_trap_gfx9_hex) + > KFD_CWSR_TMA_OFFSET); kfd->cwsr_isa = cwsr_trap_gfx9_hex; kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx9_hex); } else if (KFD_GC_VERSION(kfd) < IP_VERSION(10, 3, 0)) { - BUILD_BUG_ON(sizeof(cwsr_trap_nv1x_hex) > PAGE_SIZE); + BUILD_BUG_ON(sizeof(cwsr_trap_nv1x_hex) + > KFD_CWSR_TMA_OFFSET); kfd->cwsr_isa = cwsr_trap_nv1x_hex; kfd->cwsr_isa_size = sizeof(cwsr_trap_nv1x_hex); } else if (KFD_GC_VERSION(kfd) < IP_VERSION(11, 0, 0)) { - BUILD_BUG_ON(sizeof(cwsr_trap_gfx10_hex) > PAGE_SIZE); + BUILD_BUG_ON(sizeof(cwsr_trap_gfx10_hex) + > KFD_CWSR_TMA_OFFSET); kfd->cwsr_isa = cwsr_trap_gfx10_hex; kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx10_hex); - } else { + } else if (KFD_GC_VERSION(kfd) < IP_VERSION(12, 0, 0)) { + /* The gfx11 cwsr trap handler must fit inside a single + page. */ BUILD_BUG_ON(sizeof(cwsr_trap_gfx11_hex) > PAGE_SIZE); kfd->cwsr_isa = cwsr_trap_gfx11_hex; kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx11_hex); + } else { + BUILD_BUG_ON(sizeof(cwsr_trap_gfx12_hex) + > KFD_CWSR_TMA_OFFSET); + kfd->cwsr_isa = cwsr_trap_gfx12_hex; + kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx12_hex); } kfd->cwsr_enabled = true; @@ -514,7 +575,9 @@ static int kfd_gws_init(struct kfd_node *node) && kfd->mec2_fw_version >= 0x30) || (KFD_GC_VERSION(node) == IP_VERSION(9, 4, 2) && kfd->mec2_fw_version >= 0x28) || - (KFD_GC_VERSION(node) == IP_VERSION(9, 4, 3)) || + (KFD_GC_VERSION(node) == IP_VERSION(9, 4, 3) || + KFD_GC_VERSION(node) == IP_VERSION(9, 4, 4)) || + (KFD_GC_VERSION(node) == IP_VERSION(9, 5, 0)) || (KFD_GC_VERSION(node) >= IP_VERSION(10, 3, 0) && KFD_GC_VERSION(node) < IP_VERSION(11, 0, 0) && kfd->mec2_fw_version >= 0x6b) || @@ -586,6 +649,14 @@ static void kfd_cleanup_nodes(struct kfd_dev *kfd, unsigned int num_nodes) struct kfd_node *knode; unsigned int i; + /* + * flush_work ensures that there are no outstanding + * work-queue items that will access interrupt_ring. New work items + * can't be created because we stopped interrupt handling above. + */ + flush_workqueue(kfd->ih_wq); + destroy_workqueue(kfd->ih_wq); + for (i = 0; i < num_nodes; i++) { knode = kfd->nodes[i]; device_queue_manager_uninit(knode->dqm); @@ -681,14 +752,14 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, last_vmid_kfd = fls(gpu_resources->compute_vmid_bitmap)-1; vmid_num_kfd = last_vmid_kfd - first_vmid_kfd + 1; - /* For GFX9.4.3, we need special handling for VMIDs depending on - * partition mode. + /* For multi-partition capable GPUs, we need special handling for VMIDs + * depending on partition mode. * In CPX mode, the VMID range needs to be shared between XCDs. * Additionally, there are 13 VMIDs (3-15) available for KFD. To * divide them equally, we change starting VMID to 4 and not use * VMID 3. - * If the VMID range changes for GFX9.4.3, then this code MUST be - * revisited. + * If the VMID range changes for multi-partition capable GPUs, then + * this code MUST be revisited. */ if (kfd->adev->xcp_mgr) { partition_mode = amdgpu_xcp_query_partition_mode(kfd->adev->xcp_mgr, @@ -753,11 +824,12 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, kfd->hive_id = kfd->adev->gmc.xgmi.hive_id; /* - * For GFX9.4.3, the KFD abstracts all partitions within a socket as - * xGMI connected in the topology so assign a unique hive id per - * device based on the pci device location if device is in PCIe mode. + * For multi-partition capable GPUs, the KFD abstracts all partitions + * within a socket as xGMI connected in the topology so assign a unique + * hive id per device based on the pci device location if device is in + * PCIe mode. */ - if (!kfd->hive_id && (KFD_GC_VERSION(kfd) == IP_VERSION(9, 4, 3)) && kfd->num_nodes > 1) + if (!kfd->hive_id && kfd->num_nodes > 1) kfd->hive_id = pci_dev_id(kfd->adev->pdev); kfd->noretry = kfd->adev->gmc.noretry; @@ -795,11 +867,11 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, KFD_XCP_MEMORY_SIZE(node->adev, node->node_id) >> 20); } - if (KFD_GC_VERSION(kfd) == IP_VERSION(9, 4, 3) && - partition_mode == AMDGPU_CPX_PARTITION_MODE && + if (partition_mode == AMDGPU_CPX_PARTITION_MODE && kfd->num_nodes != 1) { - /* For GFX9.4.3 and CPX mode, first XCD gets VMID range - * 4-9 and second XCD gets VMID range 10-15. + /* For multi-partition capable GPUs and CPX mode, first + * XCD gets VMID range 4-9 and second XCD gets VMID + * range 10-15. */ node->vm_info.first_vmid_kfd = (i%2 == 0) ? @@ -823,7 +895,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, amdgpu_amdkfd_get_local_mem_info(kfd->adev, &node->local_mem_info, node->xcp); - if (KFD_GC_VERSION(kfd) == IP_VERSION(9, 4, 3)) + if (kfd->adev->xcp_mgr) kfd_setup_interrupt_bitmap(node, i); /* Initialize the KFD node */ @@ -831,13 +903,14 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, dev_err(kfd_device, "Error initializing KFD node\n"); goto node_init_error; } + + spin_lock_init(&node->watch_points_lock); + kfd->nodes[i] = node; } svm_range_set_max_pages(kfd->adev); - spin_lock_init(&kfd->watch_points_lock); - kfd->init_complete = true; dev_info(kfd_device, "added device %x:%x\n", kfd->adev->pdev->vendor, kfd->adev->pdev->device); @@ -854,7 +927,7 @@ node_alloc_error: kfd_doorbell_error: kfd_gtt_sa_fini(kfd); kfd_gtt_sa_init_error: - amdgpu_amdkfd_free_gtt_mem(kfd->adev, kfd->gtt_mem); + amdgpu_amdkfd_free_gtt_mem(kfd->adev, &kfd->gtt_mem); alloc_gtt_mem_failure: dev_err(kfd_device, "device %x:%x NOT added due to errors\n", @@ -872,13 +945,14 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd) kfd_doorbell_fini(kfd); ida_destroy(&kfd->doorbell_ida); kfd_gtt_sa_fini(kfd); - amdgpu_amdkfd_free_gtt_mem(kfd->adev, kfd->gtt_mem); + amdgpu_amdkfd_free_gtt_mem(kfd->adev, &kfd->gtt_mem); } kfree(kfd); } -int kgd2kfd_pre_reset(struct kfd_dev *kfd) +int kgd2kfd_pre_reset(struct kfd_dev *kfd, + struct amdgpu_reset_context *reset_context) { struct kfd_node *node; int i; @@ -888,8 +962,7 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd) for (i = 0; i < kfd->num_nodes; i++) { node = kfd->nodes[i]; - kfd_smi_event_update_gpu_reset(node, false); - node->dqm->ops.pre_reset(node->dqm); + kfd_smi_event_update_gpu_reset(node, false, reset_context); } kgd2kfd_suspend(kfd, false); @@ -928,7 +1001,7 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd) for (i = 0; i < kfd->num_nodes; i++) { node = kfd->nodes[i]; atomic_set(&node->sram_ecc_flag, 0); - kfd_smi_event_update_gpu_reset(node, true); + kfd_smi_event_update_gpu_reset(node, true, NULL); } return 0; @@ -944,7 +1017,6 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm) { struct kfd_node *node; int i; - int count; if (!kfd->init_complete) return; @@ -952,12 +1024,10 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm) /* for runtime suspend, skip locking kfd */ if (!run_pm) { mutex_lock(&kfd_processes_mutex); - count = ++kfd_locked; - mutex_unlock(&kfd_processes_mutex); - /* For first KFD device suspend all the KFD processes */ - if (count == 1) + if (++kfd_locked == 1) kfd_suspend_all_processes(); + mutex_unlock(&kfd_processes_mutex); } for (i = 0; i < kfd->num_nodes; i++) { @@ -968,7 +1038,7 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm) int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm) { - int ret, count, i; + int ret, i; if (!kfd->init_complete) return 0; @@ -982,12 +1052,10 @@ int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm) /* for runtime resume, skip unlocking kfd */ if (!run_pm) { mutex_lock(&kfd_processes_mutex); - count = --kfd_locked; - mutex_unlock(&kfd_processes_mutex); - - WARN_ONCE(count < 0, "KFD suspend / resume ref. error"); - if (count == 0) + if (--kfd_locked == 0) ret = kfd_resume_all_processes(); + WARN_ONCE(kfd_locked < 0, "KFD suspend / resume ref. error"); + mutex_unlock(&kfd_processes_mutex); } return ret; @@ -1006,21 +1074,6 @@ static int kfd_resume(struct kfd_node *node) return err; } -static inline void kfd_queue_work(struct workqueue_struct *wq, - struct work_struct *work) -{ - int cpu, new_cpu; - - cpu = new_cpu = smp_processor_id(); - do { - new_cpu = cpumask_next(new_cpu, cpu_online_mask) % nr_cpu_ids; - if (cpu_to_node(new_cpu) == numa_node_id()) - break; - } while (cpu != new_cpu); - - queue_work_on(new_cpu, wq, work); -} - /* This is called directly from KGD at ISR. */ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry) { @@ -1046,7 +1099,7 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry) patched_ihre, &is_patched) && enqueue_ih_ring_entry(node, is_patched ? patched_ihre : ih_ring_entry)) { - kfd_queue_work(node->ih_wq, &node->interrupt_work); + queue_work(node->kfd->ih_wq, &node->interrupt_work); spin_unlock_irqrestore(&node->interrupt_lock, flags); return; } @@ -1343,6 +1396,13 @@ void kfd_dec_compute_active(struct kfd_node *node) WARN_ONCE(count < 0, "Compute profile ref. count error"); } +static bool kfd_compute_active(struct kfd_node *node) +{ + if (atomic_read(&node->kfd->compute_profile)) + return true; + return false; +} + void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint64_t throttle_bitmask) { /* @@ -1397,6 +1457,130 @@ void kgd2kfd_unlock_kfd(void) mutex_unlock(&kfd_processes_mutex); } +int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id) +{ + struct kfd_node *node; + int ret; + + if (!kfd->init_complete) + return 0; + + if (node_id >= kfd->num_nodes) { + dev_warn(kfd->adev->dev, "Invalid node ID: %u exceeds %u\n", + node_id, kfd->num_nodes - 1); + return -EINVAL; + } + node = kfd->nodes[node_id]; + + ret = node->dqm->ops.unhalt(node->dqm); + if (ret) + dev_err(kfd_device, "Error in starting scheduler\n"); + + return ret; +} + +int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id) +{ + struct kfd_node *node; + + if (!kfd->init_complete) + return 0; + + if (node_id >= kfd->num_nodes) { + dev_warn(kfd->adev->dev, "Invalid node ID: %u exceeds %u\n", + node_id, kfd->num_nodes - 1); + return -EINVAL; + } + + node = kfd->nodes[node_id]; + return node->dqm->ops.halt(node->dqm); +} + +bool kgd2kfd_compute_active(struct kfd_dev *kfd, uint32_t node_id) +{ + struct kfd_node *node; + + if (!kfd->init_complete) + return false; + + if (node_id >= kfd->num_nodes) { + dev_warn(kfd->adev->dev, "Invalid node ID: %u exceeds %u\n", + node_id, kfd->num_nodes - 1); + return false; + } + + node = kfd->nodes[node_id]; + + return kfd_compute_active(node); +} + +/** + * kgd2kfd_vmfault_fast_path() - KFD vm page fault interrupt handling fast path for gmc v9 + * @adev: amdgpu device + * @entry: vm fault interrupt vector + * @retry_fault: if this is retry fault + * + * retry fault - + * with CAM enabled, adev primary ring + * | gmc_v9_0_process_interrupt() + * adev soft_ring + * | gmc_v9_0_process_interrupt() worker failed to recover page fault + * KFD node ih_fifo + * | KFD interrupt_wq worker + * kfd_signal_vm_fault_event + * + * without CAM, adev primary ring1 + * | gmc_v9_0_process_interrupt worker failed to recvoer page fault + * KFD node ih_fifo + * | KFD interrupt_wq worker + * kfd_signal_vm_fault_event + * + * no-retry fault - + * adev primary ring + * | gmc_v9_0_process_interrupt() + * KFD node ih_fifo + * | KFD interrupt_wq worker + * kfd_signal_vm_fault_event + * + * fast path - After kfd_signal_vm_fault_event, gmc_v9_0_process_interrupt drop the page fault + * of same process, don't copy interrupt to KFD node ih_fifo. + * With gdb debugger enabled, need convert the retry fault to no-retry fault for + * debugger, cannot use the fast path. + * + * Return: + * true - use the fast path to handle this fault + * false - use normal path to handle it + */ +bool kgd2kfd_vmfault_fast_path(struct amdgpu_device *adev, struct amdgpu_iv_entry *entry, + bool retry_fault) +{ + struct kfd_process *p; + u32 cam_index; + + if (entry->ih == &adev->irq.ih_soft || entry->ih == &adev->irq.ih1) { + p = kfd_lookup_process_by_pasid(entry->pasid); + if (!p) + return true; + + if (p->gpu_page_fault && !p->debug_trap_enabled) { + if (retry_fault && adev->irq.retry_cam_enabled) { + cam_index = entry->src_data[2] & 0x3ff; + WDOORBELL32(adev->irq.retry_cam_doorbell_index, cam_index); + } + + kfd_unref_process(p); + return true; + } + + /* + * This is the first page fault, set flag and then signal user space + */ + p->gpu_page_fault = true; + kfd_unref_process(p); + } + return false; +} + #if defined(CONFIG_DEBUG_FS) /* This function will send a package to HIQ to hang the HWS diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index c0e71543389a..34c2c42c0f95 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -35,7 +35,8 @@ #include "cik_regs.h" #include "kfd_kernel_queue.h" #include "amdgpu_amdkfd.h" -#include "mes_api_def.h" +#include "amdgpu_reset.h" +#include "mes_v11_api_def.h" #include "kfd_debug.h" /* Size of the per-pipe EOP queue */ @@ -152,17 +153,24 @@ void program_sh_mem_settings(struct device_queue_manager *dqm, static void kfd_hws_hang(struct device_queue_manager *dqm) { + struct device_process_node *cur; + struct qcm_process_device *qpd; + struct queue *q; + + /* Mark all device queues as reset. */ + list_for_each_entry(cur, &dqm->queues, list) { + qpd = cur->qpd; + list_for_each_entry(q, &qpd->queues_list, list) { + struct kfd_process_device *pdd = qpd_to_pdd(qpd); + + pdd->has_reset_queue = true; + } + } + /* * Issue a GPU reset if HWS is unresponsive */ - dqm->is_hws_hang = true; - - /* It's possible we're detecting a HWS hang in the - * middle of a GPU reset. No need to schedule another - * reset in this case. - */ - if (!dqm->is_resetting) - schedule_work(&dqm->hw_exception_work); + schedule_work(&dqm->hw_exception_work); } static int convert_to_mes_queue_type(int queue_type) @@ -194,9 +202,26 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q, int r, queue_type; uint64_t wptr_addr_off; - if (dqm->is_hws_hang) + if (!dqm->sched_running || dqm->sched_halt) + return 0; + if (!down_read_trylock(&adev->reset_domain->sem)) return -EIO; + if (!pdd->proc_ctx_cpu_ptr) { + r = amdgpu_amdkfd_alloc_gtt_mem(adev, + AMDGPU_MES_PROC_CTX_SIZE, + &pdd->proc_ctx_bo, + &pdd->proc_ctx_gpu_addr, + &pdd->proc_ctx_cpu_ptr, + false); + if (r) { + dev_err(adev->dev, + "failed to allocate process context bo\n"); + return r; + } + memset(pdd->proc_ctx_cpu_ptr, 0, AMDGPU_MES_PROC_CTX_SIZE); + } + memset(&queue_input, 0x0, sizeof(struct mes_add_queue_input)); queue_input.process_id = qpd->pqm->process->pasid; queue_input.page_table_base_addr = qpd->page_table_base; @@ -214,10 +239,8 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q, queue_input.mqd_addr = q->gart_mqd_addr; queue_input.wptr_addr = (uint64_t)q->properties.write_ptr; - if (q->wptr_bo) { - wptr_addr_off = (uint64_t)q->properties.write_ptr & (PAGE_SIZE - 1); - queue_input.wptr_mc_addr = amdgpu_bo_gpu_offset(q->wptr_bo) + wptr_addr_off; - } + wptr_addr_off = (uint64_t)q->properties.write_ptr & (PAGE_SIZE - 1); + queue_input.wptr_mc_addr = amdgpu_bo_gpu_offset(q->properties.wptr_bo) + wptr_addr_off; queue_input.is_kfd_process = 1; queue_input.is_aql_queue = (q->properties.format == KFD_QUEUE_FORMAT_AQL); @@ -236,6 +259,7 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q, if (queue_type < 0) { dev_err(adev->dev, "Queue type not supported with MES, queue:%d\n", q->properties.type); + up_read(&adev->reset_domain->sem); return -EINVAL; } queue_input.queue_type = (uint32_t)queue_type; @@ -245,6 +269,7 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q, amdgpu_mes_lock(&adev->mes); r = adev->mes.funcs->add_hw_queue(&adev->mes, &queue_input); amdgpu_mes_unlock(&adev->mes); + up_read(&adev->reset_domain->sem); if (r) { dev_err(adev->dev, "failed to add hardware queue to MES, doorbell=0x%x\n", q->properties.doorbell_off); @@ -262,7 +287,9 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q, int r; struct mes_remove_queue_input queue_input; - if (dqm->is_hws_hang) + if (!dqm->sched_running || dqm->sched_halt) + return 0; + if (!down_read_trylock(&adev->reset_domain->sem)) return -EIO; memset(&queue_input, 0x0, sizeof(struct mes_remove_queue_input)); @@ -272,6 +299,7 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q, amdgpu_mes_lock(&adev->mes); r = adev->mes.funcs->remove_hw_queue(&adev->mes, &queue_input); amdgpu_mes_unlock(&adev->mes); + up_read(&adev->reset_domain->sem); if (r) { dev_err(adev->dev, "failed to remove hardware queue from MES, doorbell=0x%x\n", @@ -283,7 +311,7 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q, return r; } -static int remove_all_queues_mes(struct device_queue_manager *dqm) +static int remove_all_kfd_queues_mes(struct device_queue_manager *dqm) { struct device_process_node *cur; struct device *dev = dqm->dev->adev->dev; @@ -310,6 +338,73 @@ static int remove_all_queues_mes(struct device_queue_manager *dqm) return retval; } +static int add_all_kfd_queues_mes(struct device_queue_manager *dqm) +{ + struct device_process_node *cur; + struct device *dev = dqm->dev->adev->dev; + struct qcm_process_device *qpd; + struct queue *q; + int retval = 0; + + list_for_each_entry(cur, &dqm->queues, list) { + qpd = cur->qpd; + list_for_each_entry(q, &qpd->queues_list, list) { + if (!q->properties.is_active) + continue; + retval = add_queue_mes(dqm, q, qpd); + if (retval) { + dev_err(dev, "%s: Failed to add queue %d for dev %d", + __func__, + q->properties.queue_id, + dqm->dev->id); + return retval; + } + } + } + + return retval; +} + +static int suspend_all_queues_mes(struct device_queue_manager *dqm) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev; + int r = 0; + + if (!down_read_trylock(&adev->reset_domain->sem)) + return -EIO; + + r = amdgpu_mes_suspend(adev); + up_read(&adev->reset_domain->sem); + + if (r) { + dev_err(adev->dev, "failed to suspend gangs from MES\n"); + dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n"); + kfd_hws_hang(dqm); + } + + return r; +} + +static int resume_all_queues_mes(struct device_queue_manager *dqm) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev; + int r = 0; + + if (!down_read_trylock(&adev->reset_domain->sem)) + return -EIO; + + r = amdgpu_mes_resume(adev); + up_read(&adev->reset_domain->sem); + + if (r) { + dev_err(adev->dev, "failed to resume gangs from MES\n"); + dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n"); + kfd_hws_hang(dqm); + } + + return r; +} + static void increment_queue_count(struct device_queue_manager *dqm, struct qcm_process_device *qpd, struct queue *q) @@ -883,6 +978,12 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q, else if (prev_active) retval = remove_queue_mes(dqm, q, &pdd->qpd); + /* queue is reset so inaccessable */ + if (pdd->has_reset_queue) { + retval = -EACCES; + goto out_unlock; + } + if (retval) { dev_err(dev, "unmap queue failed\n"); goto out_unlock; @@ -1129,11 +1230,13 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm, decrement_queue_count(dqm, qpd, q); if (dqm->dev->kfd->shared_resources.enable_mes) { - retval = remove_queue_mes(dqm, q, qpd); - if (retval) { + int err; + + err = remove_queue_mes(dqm, q, qpd); + if (err) { dev_err(dev, "Failed to evict queue %d\n", q->properties.queue_id); - goto out; + retval = err; } } } @@ -1468,20 +1571,13 @@ static int stop_nocpsch(struct device_queue_manager *dqm) } if (dqm->dev->adev->asic_type == CHIP_HAWAII) - pm_uninit(&dqm->packet_mgr, false); + pm_uninit(&dqm->packet_mgr); dqm->sched_running = false; dqm_unlock(dqm); return 0; } -static void pre_reset(struct device_queue_manager *dqm) -{ - dqm_lock(dqm); - dqm->is_resetting = true; - dqm_unlock(dqm); -} - static int allocate_sdma_queue(struct device_queue_manager *dqm, struct queue *q, const uint32_t *restore_sdma_id) { @@ -1544,6 +1640,41 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm, q->sdma_id % kfd_get_num_xgmi_sdma_engines(dqm->dev); q->properties.sdma_queue_id = q->sdma_id / kfd_get_num_xgmi_sdma_engines(dqm->dev); + } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_BY_ENG_ID) { + int i, num_queues, num_engines, eng_offset = 0, start_engine; + bool free_bit_found = false, is_xgmi = false; + + if (q->properties.sdma_engine_id < kfd_get_num_sdma_engines(dqm->dev)) { + num_queues = get_num_sdma_queues(dqm); + num_engines = kfd_get_num_sdma_engines(dqm->dev); + q->properties.type = KFD_QUEUE_TYPE_SDMA; + } else { + num_queues = get_num_xgmi_sdma_queues(dqm); + num_engines = kfd_get_num_xgmi_sdma_engines(dqm->dev); + eng_offset = kfd_get_num_sdma_engines(dqm->dev); + q->properties.type = KFD_QUEUE_TYPE_SDMA_XGMI; + is_xgmi = true; + } + + /* Scan available bit based on target engine ID. */ + start_engine = q->properties.sdma_engine_id - eng_offset; + for (i = start_engine; i < num_queues; i += num_engines) { + + if (!test_bit(i, is_xgmi ? dqm->xgmi_sdma_bitmap : dqm->sdma_bitmap)) + continue; + + clear_bit(i, is_xgmi ? dqm->xgmi_sdma_bitmap : dqm->sdma_bitmap); + q->sdma_id = i; + q->properties.sdma_queue_id = q->sdma_id / num_engines; + free_bit_found = true; + break; + } + + if (!free_bit_found) { + dev_err(dev, "No more SDMA queue to allocate for target ID %i\n", + q->properties.sdma_engine_id); + return -ENOMEM; + } } pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id); @@ -1636,10 +1767,67 @@ static int initialize_cpsch(struct device_queue_manager *dqm) return 0; } +/* halt_cpsch: + * Unmap queues so the schedule doesn't continue remaining jobs in the queue. + * Then set dqm->sched_halt so queues don't map to runlist until unhalt_cpsch + * is called. + */ +static int halt_cpsch(struct device_queue_manager *dqm) +{ + int ret = 0; + + dqm_lock(dqm); + if (!dqm->sched_running) { + dqm_unlock(dqm); + return 0; + } + + WARN_ONCE(dqm->sched_halt, "Scheduling is already on halt\n"); + + if (!dqm->is_hws_hang) { + if (!dqm->dev->kfd->shared_resources.enable_mes) + ret = unmap_queues_cpsch(dqm, + KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, + USE_DEFAULT_GRACE_PERIOD, false); + else + ret = remove_all_kfd_queues_mes(dqm); + } + dqm->sched_halt = true; + dqm_unlock(dqm); + + return ret; +} + +/* unhalt_cpsch + * Unset dqm->sched_halt and map queues back to runlist + */ +static int unhalt_cpsch(struct device_queue_manager *dqm) +{ + int ret = 0; + + dqm_lock(dqm); + if (!dqm->sched_running || !dqm->sched_halt) { + WARN_ONCE(!dqm->sched_halt, "Scheduling is not on halt.\n"); + dqm_unlock(dqm); + return 0; + } + dqm->sched_halt = false; + if (!dqm->dev->kfd->shared_resources.enable_mes) + ret = execute_queues_cpsch(dqm, + KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, + 0, USE_DEFAULT_GRACE_PERIOD); + else + ret = add_all_kfd_queues_mes(dqm); + + dqm_unlock(dqm); + + return ret; +} + static int start_cpsch(struct device_queue_manager *dqm) { struct device *dev = dqm->dev->adev->dev; - int retval; + int retval, num_hw_queue_slots; retval = 0; @@ -1669,8 +1857,6 @@ static int start_cpsch(struct device_queue_manager *dqm) init_interrupts(dqm); /* clear hang status when driver try to start the hw scheduler */ - dqm->is_hws_hang = false; - dqm->is_resetting = false; dqm->sched_running = true; if (!dqm->dev->kfd->shared_resources.enable_mes) @@ -1694,13 +1880,28 @@ static int start_cpsch(struct device_queue_manager *dqm) &dqm->wait_times); } + /* setup per-queue reset detection buffer */ + num_hw_queue_slots = dqm->dev->kfd->shared_resources.num_queue_per_pipe * + dqm->dev->kfd->shared_resources.num_pipe_per_mec * + NUM_XCC(dqm->dev->xcc_mask); + + dqm->detect_hang_info_size = num_hw_queue_slots * sizeof(struct dqm_detect_hang_info); + dqm->detect_hang_info = kzalloc(dqm->detect_hang_info_size, GFP_KERNEL); + + if (!dqm->detect_hang_info) { + retval = -ENOMEM; + goto fail_detect_hang_buffer; + } + dqm_unlock(dqm); return 0; +fail_detect_hang_buffer: + kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); fail_allocate_vidmem: fail_set_sched_resources: if (!dqm->dev->kfd->shared_resources.enable_mes) - pm_uninit(&dqm->packet_mgr, false); + pm_uninit(&dqm->packet_mgr); fail_packet_manager_init: dqm_unlock(dqm); return retval; @@ -1708,22 +1909,17 @@ fail_packet_manager_init: static int stop_cpsch(struct device_queue_manager *dqm) { - bool hanging; - dqm_lock(dqm); if (!dqm->sched_running) { dqm_unlock(dqm); return 0; } - if (!dqm->is_hws_hang) { - if (!dqm->dev->kfd->shared_resources.enable_mes) - unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD, false); - else - remove_all_queues_mes(dqm); - } + if (!dqm->dev->kfd->shared_resources.enable_mes) + unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD, false); + else + remove_all_kfd_queues_mes(dqm); - hanging = dqm->is_hws_hang || dqm->is_resetting; dqm->sched_running = false; if (!dqm->dev->kfd->shared_resources.enable_mes) @@ -1731,7 +1927,9 @@ static int stop_cpsch(struct device_queue_manager *dqm) kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); if (!dqm->dev->kfd->shared_resources.enable_mes) - pm_uninit(&dqm->packet_mgr, hanging); + pm_uninit(&dqm->packet_mgr); + kfree(dqm->detect_hang_info); + dqm->detect_hang_info = NULL; dqm_unlock(dqm); return 0; @@ -1803,7 +2001,8 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, } if (q->properties.type == KFD_QUEUE_TYPE_SDMA || - q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) { + q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI || + q->properties.type == KFD_QUEUE_TYPE_SDMA_BY_ENG_ID) { dqm_lock(dqm); retval = allocate_sdma_queue(dqm, q, qd ? &qd->sdma_id : NULL); dqm_unlock(dqm); @@ -1900,9 +2099,13 @@ int amdkfd_fence_wait_timeout(struct device_queue_manager *dqm, { unsigned long end_jiffies = msecs_to_jiffies(timeout_ms) + jiffies; struct device *dev = dqm->dev->adev->dev; - uint64_t *fence_addr = dqm->fence_addr; + uint64_t *fence_addr = dqm->fence_addr; while (*fence_addr != fence_value) { + /* Fatal err detected, this response won't come */ + if (amdgpu_amdkfd_is_fed(dqm->dev->adev)) + return -EIO; + if (time_after(jiffies, end_jiffies)) { dev_err(dev, "qcm fence wait loop timeout expired\n"); /* In HWS case, this is used to halt the driver thread @@ -1926,7 +2129,7 @@ static int map_queues_cpsch(struct device_queue_manager *dqm) struct device *dev = dqm->dev->adev->dev; int retval; - if (!dqm->sched_running) + if (!dqm->sched_running || dqm->sched_halt) return 0; if (dqm->active_queue_count <= 0 || dqm->processes_count <= 0) return 0; @@ -1944,6 +2147,135 @@ static int map_queues_cpsch(struct device_queue_manager *dqm) return retval; } +static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd) +{ + struct kfd_process_device *pdd = qpd_to_pdd(qpd); + + dev_err(dqm->dev->adev->dev, "queue id 0x%0x at pasid 0x%0x is reset\n", + q->properties.queue_id, q->process->pasid); + + pdd->has_reset_queue = true; + if (q->properties.is_active) { + q->properties.is_active = false; + decrement_queue_count(dqm, qpd, q); + } +} + +static int detect_queue_hang(struct device_queue_manager *dqm) +{ + int i; + + /* detect should be used only in dqm locked queue reset */ + if (WARN_ON(dqm->detect_hang_count > 0)) + return 0; + + memset(dqm->detect_hang_info, 0, dqm->detect_hang_info_size); + + for (i = 0; i < AMDGPU_MAX_QUEUES; ++i) { + uint32_t mec, pipe, queue; + int xcc_id; + + mec = (i / dqm->dev->kfd->shared_resources.num_queue_per_pipe) + / dqm->dev->kfd->shared_resources.num_pipe_per_mec; + + if (mec || !test_bit(i, dqm->dev->kfd->shared_resources.cp_queue_bitmap)) + continue; + + amdgpu_queue_mask_bit_to_mec_queue(dqm->dev->adev, i, &mec, &pipe, &queue); + + for_each_inst(xcc_id, dqm->dev->xcc_mask) { + uint64_t queue_addr = dqm->dev->kfd2kgd->hqd_get_pq_addr( + dqm->dev->adev, pipe, queue, xcc_id); + struct dqm_detect_hang_info hang_info; + + if (!queue_addr) + continue; + + hang_info.pipe_id = pipe; + hang_info.queue_id = queue; + hang_info.xcc_id = xcc_id; + hang_info.queue_address = queue_addr; + + dqm->detect_hang_info[dqm->detect_hang_count] = hang_info; + dqm->detect_hang_count++; + } + } + + return dqm->detect_hang_count; +} + +static struct queue *find_queue_by_address(struct device_queue_manager *dqm, uint64_t queue_address) +{ + struct device_process_node *cur; + struct qcm_process_device *qpd; + struct queue *q; + + list_for_each_entry(cur, &dqm->queues, list) { + qpd = cur->qpd; + list_for_each_entry(q, &qpd->queues_list, list) { + if (queue_address == q->properties.queue_address) + return q; + } + } + + return NULL; +} + +/* only for compute queue */ +static int reset_queues_on_hws_hang(struct device_queue_manager *dqm) +{ + int r = 0, reset_count = 0, i; + + if (!dqm->detect_hang_info || dqm->is_hws_hang) + return -EIO; + + /* assume dqm locked. */ + if (!detect_queue_hang(dqm)) + return -ENOTRECOVERABLE; + + for (i = 0; i < dqm->detect_hang_count; i++) { + struct dqm_detect_hang_info hang_info = dqm->detect_hang_info[i]; + struct queue *q = find_queue_by_address(dqm, hang_info.queue_address); + struct kfd_process_device *pdd; + uint64_t queue_addr = 0; + + if (!q) { + r = -ENOTRECOVERABLE; + goto reset_fail; + } + + pdd = kfd_get_process_device_data(dqm->dev, q->process); + if (!pdd) { + r = -ENOTRECOVERABLE; + goto reset_fail; + } + + queue_addr = dqm->dev->kfd2kgd->hqd_reset(dqm->dev->adev, + hang_info.pipe_id, hang_info.queue_id, hang_info.xcc_id, + KFD_UNMAP_LATENCY_MS); + + /* either reset failed or we reset an unexpected queue. */ + if (queue_addr != q->properties.queue_address) { + r = -ENOTRECOVERABLE; + goto reset_fail; + } + + set_queue_as_reset(dqm, q, &pdd->qpd); + reset_count++; + } + + if (reset_count == dqm->detect_hang_count) + kfd_signal_reset_event(dqm->dev); + else + r = -ENOTRECOVERABLE; + +reset_fail: + dqm->detect_hang_count = 0; + + return r; +} + /* dqm->lock mutex has to be locked before calling this function */ static int unmap_queues_cpsch(struct device_queue_manager *dqm, enum kfd_unmap_queues_filter filter, @@ -1953,26 +2285,27 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, { struct device *dev = dqm->dev->adev->dev; struct mqd_manager *mqd_mgr; - int retval = 0; + int retval; if (!dqm->sched_running) return 0; - if (dqm->is_hws_hang || dqm->is_resetting) - return -EIO; if (!dqm->active_runlist) - return retval; + return 0; + if (!down_read_trylock(&dqm->dev->adev->reset_domain->sem)) + return -EIO; if (grace_period != USE_DEFAULT_GRACE_PERIOD) { retval = pm_update_grace_period(&dqm->packet_mgr, grace_period); if (retval) - return retval; + goto out; } retval = pm_send_unmap_queue(&dqm->packet_mgr, filter, filter_param, reset); if (retval) - return retval; + goto out; *dqm->fence_addr = KFD_FENCE_INIT; + mb(); pm_send_query_status(&dqm->packet_mgr, dqm->fence_gpu_addr, KFD_FENCE_COMPLETED); /* should be timed out */ @@ -1981,7 +2314,7 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, if (retval) { dev_err(dev, "The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n"); kfd_hws_hang(dqm); - return retval; + goto out; } /* In the current MEC firmware implementation, if compute queue @@ -1993,11 +2326,15 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, * check those fields */ mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]; - if (mqd_mgr->read_doorbell_id(dqm->packet_mgr.priv_queue->queue->mqd)) { - dev_err(dev, "HIQ MQD's queue_doorbell_id0 is not 0, Queue preemption time out\n"); + if (mqd_mgr->check_preemption_failed(mqd_mgr, dqm->packet_mgr.priv_queue->queue->mqd)) { while (halt_if_hws_hang) schedule(); - return -ETIME; + if (reset_queues_on_hws_hang(dqm)) { + dqm->is_hws_hang = true; + kfd_hws_hang(dqm); + retval = -ETIME; + goto out; + } } /* We need to reset the grace period value for this device */ @@ -2010,12 +2347,13 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, pm_release_ib(&dqm->packet_mgr); dqm->active_runlist = false; +out: + up_read(&dqm->dev->adev->reset_domain->sem); return retval; } /* only for compute queue */ -static int reset_queues_cpsch(struct device_queue_manager *dqm, - uint16_t pasid) +static int reset_queues_cpsch(struct device_queue_manager *dqm, uint16_t pasid) { int retval; @@ -2036,13 +2374,13 @@ static int execute_queues_cpsch(struct device_queue_manager *dqm, { int retval; - if (dqm->is_hws_hang) + if (!down_read_trylock(&dqm->dev->adev->reset_domain->sem)) return -EIO; retval = unmap_queues_cpsch(dqm, filter, filter_param, grace_period, false); - if (retval) - return retval; - - return map_queues_cpsch(dqm); + if (!retval) + retval = map_queues_cpsch(dqm); + up_read(&dqm->dev->adev->reset_domain->sem); + return retval; } static int wait_on_destroy_queue(struct device_queue_manager *dqm, @@ -2052,6 +2390,9 @@ static int wait_on_destroy_queue(struct device_queue_manager *dqm, q->process); int ret = 0; + if (WARN_ON(!pdd)) + return ret; + if (pdd->qpd.is_debug) return ret; @@ -2121,10 +2462,9 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, pdd->sdma_past_activity_counter += sdma_val; } - list_del(&q->list); - qpd->queue_count--; if (q->properties.is_active) { decrement_queue_count(dqm, qpd, q); + q->properties.is_active = false; if (!dqm->dev->kfd->shared_resources.enable_mes) { retval = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, @@ -2135,6 +2475,8 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, retval = remove_queue_mes(dqm, q, qpd); } } + list_del(&q->list); + qpd->queue_count--; /* * Unconditionally decrement this counter, regardless of the queue's @@ -2423,10 +2765,12 @@ static int process_termination_cpsch(struct device_queue_manager *dqm, if (!dqm->dev->kfd->shared_resources.enable_mes) retval = execute_queues_cpsch(dqm, filter, 0, USE_DEFAULT_GRACE_PERIOD); - if ((!dqm->is_hws_hang) && (retval || qpd->reset_wavefronts)) { + if ((retval || qpd->reset_wavefronts) && + down_read_trylock(&dqm->dev->adev->reset_domain->sem)) { pr_warn("Resetting wave fronts (cpsch) on dev %p\n", dqm->dev); dbgdev_wave_reset_wavefronts(dqm->dev, qpd->pqm->process); qpd->reset_wavefronts = false; + up_read(&dqm->dev->adev->reset_domain->sem); } /* Lastly, free mqd resources. @@ -2533,7 +2877,8 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_node *dev) dqm->ops.initialize = initialize_cpsch; dqm->ops.start = start_cpsch; dqm->ops.stop = stop_cpsch; - dqm->ops.pre_reset = pre_reset; + dqm->ops.halt = halt_cpsch; + dqm->ops.unhalt = unhalt_cpsch; dqm->ops.destroy_queue = destroy_queue_cpsch; dqm->ops.update_queue = update_queue; dqm->ops.register_process = register_process; @@ -2554,7 +2899,6 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_node *dev) /* initialize dqm for no cp scheduling */ dqm->ops.start = start_nocpsch; dqm->ops.stop = stop_nocpsch; - dqm->ops.pre_reset = pre_reset; dqm->ops.create_queue = create_queue_nocpsch; dqm->ops.destroy_queue = destroy_queue_nocpsch; dqm->ops.update_queue = update_queue; @@ -2593,7 +2937,9 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_node *dev) break; default: - if (KFD_GC_VERSION(dev) >= IP_VERSION(11, 0, 0)) + if (KFD_GC_VERSION(dev) >= IP_VERSION(12, 0, 0)) + device_queue_manager_init_v12(&dqm->asic_ops); + else if (KFD_GC_VERSION(dev) >= IP_VERSION(11, 0, 0)) device_queue_manager_init_v11(&dqm->asic_ops); else if (KFD_GC_VERSION(dev) >= IP_VERSION(10, 1, 1)) device_queue_manager_init_v10(&dqm->asic_ops); @@ -2629,7 +2975,7 @@ static void deallocate_hiq_sdma_mqd(struct kfd_node *dev, { WARN(!mqd, "No hiq sdma mqd trunk to free"); - amdgpu_amdkfd_free_gtt_mem(dev->adev, mqd->gtt_mem); + amdgpu_amdkfd_free_gtt_mem(dev->adev, &mqd->gtt_mem); } void device_queue_manager_uninit(struct device_queue_manager *dqm) @@ -2641,6 +2987,95 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm) kfree(dqm); } +int kfd_dqm_suspend_bad_queue_mes(struct kfd_node *knode, u32 pasid, u32 doorbell_id) +{ + struct kfd_process_device *pdd; + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + struct device_queue_manager *dqm = knode->dqm; + struct device *dev = dqm->dev->adev->dev; + struct qcm_process_device *qpd; + struct queue *q = NULL; + int ret = 0; + + if (!p) + return -EINVAL; + + dqm_lock(dqm); + + pdd = kfd_get_process_device_data(dqm->dev, p); + if (pdd) { + qpd = &pdd->qpd; + + list_for_each_entry(q, &qpd->queues_list, list) { + if (q->doorbell_id == doorbell_id && q->properties.is_active) { + ret = suspend_all_queues_mes(dqm); + if (ret) { + dev_err(dev, "Suspending all queues failed"); + goto out; + } + + q->properties.is_evicted = true; + q->properties.is_active = false; + decrement_queue_count(dqm, qpd, q); + + ret = remove_queue_mes(dqm, q, qpd); + if (ret) { + dev_err(dev, "Removing bad queue failed"); + goto out; + } + + ret = resume_all_queues_mes(dqm); + if (ret) + dev_err(dev, "Resuming all queues failed"); + + break; + } + } + } + +out: + dqm_unlock(dqm); + return ret; +} + +static int kfd_dqm_evict_pasid_mes(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + struct device *dev = dqm->dev->adev->dev; + int ret = 0; + + /* Check if process is already evicted */ + dqm_lock(dqm); + if (qpd->evicted) { + /* Increment the evicted count to make sure the + * process stays evicted before its terminated. + */ + qpd->evicted++; + dqm_unlock(dqm); + goto out; + } + dqm_unlock(dqm); + + ret = suspend_all_queues_mes(dqm); + if (ret) { + dev_err(dev, "Suspending all queues failed"); + goto out; + } + + ret = dqm->ops.evict_process_queues(dqm, qpd); + if (ret) { + dev_err(dev, "Evicting process queues failed"); + goto out; + } + + ret = resume_all_queues_mes(dqm); + if (ret) + dev_err(dev, "Resuming all queues failed"); + +out: + return ret; +} + int kfd_dqm_evict_pasid(struct device_queue_manager *dqm, u32 pasid) { struct kfd_process_device *pdd; @@ -2651,8 +3086,13 @@ int kfd_dqm_evict_pasid(struct device_queue_manager *dqm, u32 pasid) return -EINVAL; WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid); pdd = kfd_get_process_device_data(dqm->dev, p); - if (pdd) - ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd); + if (pdd) { + if (dqm->dev->kfd->shared_resources.enable_mes) + ret = kfd_dqm_evict_pasid_mes(dqm, &pdd->qpd); + else + ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd); + } + kfd_unref_process(p); return ret; @@ -2788,7 +3228,7 @@ struct copy_context_work_handler_workarea { struct kfd_process *p; }; -static void copy_context_work_handler (struct work_struct *work) +static void copy_context_work_handler(struct work_struct *work) { struct copy_context_work_handler_workarea *workarea; struct mqd_manager *mqd_mgr; @@ -2815,6 +3255,9 @@ static void copy_context_work_handler (struct work_struct *work) struct qcm_process_device *qpd = &pdd->qpd; list_for_each_entry(q, &qpd->queues_list, list) { + if (q->properties.type != KFD_QUEUE_TYPE_COMPUTE) + continue; + mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_CP]; /* We ignore the return value from get_wave_state @@ -3155,6 +3598,30 @@ int debug_refresh_runlist(struct device_queue_manager *dqm) return debug_map_and_unlock(dqm); } +bool kfd_dqm_is_queue_in_process(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + int doorbell_off, u32 *queue_format) +{ + struct queue *q; + bool r = false; + + if (!queue_format) + return r; + + dqm_lock(dqm); + + list_for_each_entry(q, &qpd->queues_list, list) { + if (q->properties.doorbell_off == doorbell_off) { + *queue_format = q->properties.format; + r = true; + goto out; + } + } + +out: + dqm_unlock(dqm); + return r; +} #if defined(CONFIG_DEBUG_FS) static void seq_reg_dump(struct seq_file *m, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h index cf7e182588f8..09ab36f8e8c6 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h @@ -106,6 +106,12 @@ union GRBM_GFX_INDEX_BITS { * @uninitialize: Destroys all the device queue manager resources allocated in * initialize routine. * + * @halt: This routine unmaps queues from runlist and set halt status to true + * so no more queues will be mapped to runlist until unhalt. + * + * @unhalt: This routine unset halt status to flase and maps queues back to + * runlist. + * * @create_kernel_queue: Creates kernel queue. Used for debug queue. * * @destroy_kernel_queue: Destroys kernel queue. Used for debug queue. @@ -152,8 +158,9 @@ struct device_queue_manager_ops { int (*initialize)(struct device_queue_manager *dqm); int (*start)(struct device_queue_manager *dqm); int (*stop)(struct device_queue_manager *dqm); - void (*pre_reset)(struct device_queue_manager *dqm); void (*uninitialize)(struct device_queue_manager *dqm); + int (*halt)(struct device_queue_manager *dqm); + int (*unhalt)(struct device_queue_manager *dqm); int (*create_kernel_queue)(struct device_queue_manager *dqm, struct kernel_queue *kq, struct qcm_process_device *qpd); @@ -211,6 +218,13 @@ struct device_queue_manager_asic_ops { struct kfd_node *dev); }; +struct dqm_detect_hang_info { + int pipe_id; + int queue_id; + int xcc_id; + uint64_t queue_address; +}; + /** * struct device_queue_manager * @@ -258,6 +272,7 @@ struct device_queue_manager { struct work_struct hw_exception_work; struct kfd_mem_obj hiq_sdma_mqd; bool sched_running; + bool sched_halt; /* used for GFX 9.4.3 only */ uint32_t current_logical_xcc_start; @@ -265,6 +280,11 @@ struct device_queue_manager { uint32_t wait_times; wait_queue_head_t destroy_wait; + + /* for per-queue reset support */ + struct dqm_detect_hang_info *detect_hang_info; + size_t detect_hang_info_size; + int detect_hang_count; }; void device_queue_manager_init_cik( @@ -277,6 +297,8 @@ void device_queue_manager_init_v10( struct device_queue_manager_asic_ops *asic_ops); void device_queue_manager_init_v11( struct device_queue_manager_asic_ops *asic_ops); +void device_queue_manager_init_v12( + struct device_queue_manager_asic_ops *asic_ops); void program_sh_mem_settings(struct device_queue_manager *dqm, struct qcm_process_device *qpd); unsigned int get_cp_queues_num(struct device_queue_manager *dqm); @@ -302,6 +324,9 @@ void set_queue_snapshot_entry(struct queue *q, int debug_lock_and_unmap(struct device_queue_manager *dqm); int debug_map_and_unlock(struct device_queue_manager *dqm); int debug_refresh_runlist(struct device_queue_manager *dqm); +bool kfd_dqm_is_queue_in_process(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + int doorbell_off, u32 *queue_format); static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd) { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v12.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v12.c new file mode 100644 index 000000000000..4f3295b29dfb --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v12.c @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT +/* + * Copyright 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include "kfd_device_queue_manager.h" +#include "gc/gc_12_0_0_sh_mask.h" +#include "soc24_enum.h" + +static int update_qpd_v12(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); +static void init_sdma_vm_v12(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd); + +void device_queue_manager_init_v12( + struct device_queue_manager_asic_ops *asic_ops) +{ + asic_ops->update_qpd = update_qpd_v12; + asic_ops->init_sdma_vm = init_sdma_vm_v12; + asic_ops->mqd_manager_init = mqd_manager_init_v12; +} + +static uint32_t compute_sh_mem_bases_64bit(struct kfd_process_device *pdd) +{ + uint32_t shared_base = pdd->lds_base >> 48; + uint32_t private_base = pdd->scratch_base >> 48; + + return (shared_base << SH_MEM_BASES__SHARED_BASE__SHIFT) | + private_base; +} + +static int update_qpd_v12(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + struct kfd_process_device *pdd; + + pdd = qpd_to_pdd(qpd); + + /* check if sh_mem_config register already configured */ + if (qpd->sh_mem_config == 0) { + qpd->sh_mem_config = + (SH_MEM_ALIGNMENT_MODE_UNALIGNED << + SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT) | + (3 << SH_MEM_CONFIG__INITIAL_INST_PREFETCH__SHIFT); + + qpd->sh_mem_ape1_limit = 0; + qpd->sh_mem_ape1_base = 0; + } + + qpd->sh_mem_bases = compute_sh_mem_bases_64bit(pdd); + + pr_debug("sh_mem_bases 0x%X\n", qpd->sh_mem_bases); + + return 0; +} + +static void init_sdma_vm_v12(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd) +{ + /* Not needed on SDMAv4 onwards any more */ + q->properties.sdma_vm_addr = 0; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c index 54eb1bff903c..67137e674f1d 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c @@ -63,7 +63,9 @@ static int update_qpd_v9(struct device_queue_manager *dqm, if (dqm->dev->kfd->noretry) qpd->sh_mem_config |= 1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT; - if (KFD_GC_VERSION(dqm->dev->kfd) == IP_VERSION(9, 4, 3)) + if (KFD_GC_VERSION(dqm->dev->kfd) == IP_VERSION(9, 4, 3) || + KFD_GC_VERSION(dqm->dev->kfd) == IP_VERSION(9, 4, 4) || + KFD_GC_VERSION(dqm->dev->kfd) == IP_VERSION(9, 5, 0)) qpd->sh_mem_config |= (1 << SH_MEM_CONFIG__F8_MODE__SHIFT); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index 739721254a5d..d075f24e5f9f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -31,6 +31,7 @@ #include <linux/memory.h> #include "kfd_priv.h" #include "kfd_events.h" +#include "kfd_device_queue_manager.h" #include <linux/device.h> /* @@ -747,6 +748,16 @@ void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id, uint64_t *slots = page_slots(p->signal_page); uint32_t id; + /* + * If id is valid but slot is not signaled, GPU may signal the same event twice + * before driver have chance to process the first interrupt, then signal slot is + * auto-reset after set_event wakeup the user space, just drop the second event as + * the application only need wakeup once. + */ + if ((valid_id_bits > 31 || (1U << valid_id_bits) >= KFD_SIGNAL_EVENT_LIMIT) && + partial_id < KFD_SIGNAL_EVENT_LIMIT && slots[partial_id] == UNSIGNALED_EVENT_SLOT) + goto out_unlock; + if (valid_id_bits) pr_debug_ratelimited("Partial ID invalid: %u (%u valid bits)\n", partial_id, valid_id_bits); @@ -775,6 +786,7 @@ void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id, } } +out_unlock: rcu_read_unlock(); kfd_unref_process(p); } @@ -1244,12 +1256,33 @@ void kfd_signal_reset_event(struct kfd_node *dev) idx = srcu_read_lock(&kfd_processes_srcu); hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { int user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id); + struct kfd_process_device *pdd = kfd_get_process_device_data(dev, p); if (unlikely(user_gpu_id == -EINVAL)) { WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id); continue; } + if (unlikely(!pdd)) { + WARN_ONCE(1, "Could not get device data from pasid:0x%x\n", p->pasid); + continue; + } + + if (dev->dqm->detect_hang_count && !pdd->has_reset_queue) + continue; + + if (dev->dqm->detect_hang_count) { + struct amdgpu_task_info *ti; + + ti = amdgpu_vm_get_task_info_pasid(dev->adev, p->pasid); + if (ti) { + dev_err(dev->adev->dev, + "Queues reset on process %s tid %d thread %s pid %d\n", + ti->process_name, ti->tgid, ti->task_name, ti->pid); + amdgpu_vm_put_task_info(ti); + } + } + rcu_read_lock(); id = KFD_FIRST_NONSIGNAL_EVENT_ID; @@ -1285,8 +1318,10 @@ void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid) uint32_t id = KFD_FIRST_NONSIGNAL_EVENT_ID; int user_gpu_id; - if (!p) + if (!p) { + dev_warn(dev->adev->dev, "Not find process with pasid:%d\n", pasid); return; /* Presumably process exited. */ + } user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id); if (unlikely(user_gpu_id == -EINVAL)) { @@ -1322,6 +1357,8 @@ void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid) } } + dev_warn(dev->adev->dev, "Send SIGBUS to process %s(pasid:%d)\n", + p->lead_thread->comm, pasid); rcu_read_unlock(); /* user application will handle SIGBUS signal */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c index 6604a3f99c5e..dbcb60eb54b2 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c @@ -36,6 +36,7 @@ #include <linux/mm.h> #include <linux/mman.h> #include <linux/processor.h> +#include "amdgpu_vm.h" /* * The primary memory I/O features being added for revisions of gfxip @@ -326,10 +327,16 @@ static void kfd_init_apertures_vi(struct kfd_process_device *pdd, uint8_t id) * with small reserved space for kernel. * Set them to CANONICAL addresses. */ - pdd->gpuvm_base = SVM_USER_BASE; + pdd->gpuvm_base = max(SVM_USER_BASE, AMDGPU_VA_RESERVED_BOTTOM); pdd->gpuvm_limit = pdd->dev->kfd->shared_resources.gpuvm_size - 1; + /* dGPUs: the reserved space for kernel + * before SVM + */ + pdd->qpd.cwsr_base = SVM_CWSR_BASE; + pdd->qpd.ib_base = SVM_IB_BASE; + pdd->scratch_base = MAKE_SCRATCH_APP_BASE_VI(); pdd->scratch_limit = MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base); } @@ -339,18 +346,18 @@ static void kfd_init_apertures_v9(struct kfd_process_device *pdd, uint8_t id) pdd->lds_base = MAKE_LDS_APP_BASE_V9(); pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base); - /* Raven needs SVM to support graphic handle, etc. Leave the small - * reserved space before SVM on Raven as well, even though we don't - * have to. - * Set gpuvm_base and gpuvm_limit to CANONICAL addresses so that they - * are used in Thunk to reserve SVM. - */ - pdd->gpuvm_base = SVM_USER_BASE; + pdd->gpuvm_base = AMDGPU_VA_RESERVED_BOTTOM; pdd->gpuvm_limit = pdd->dev->kfd->shared_resources.gpuvm_size - 1; pdd->scratch_base = MAKE_SCRATCH_APP_BASE_V9(); pdd->scratch_limit = MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base); + + /* + * Place TBA/TMA on opposite side of VM hole to prevent + * stray faults from triggering SVM on these pages. + */ + pdd->qpd.cwsr_base = AMDGPU_VA_RESERVED_TRAP_START(pdd->dev->adev); } int kfd_init_apertures(struct kfd_process *process) @@ -373,7 +380,8 @@ int kfd_init_apertures(struct kfd_process *process) pdd = kfd_create_process_device_data(dev, process); if (!pdd) { - pr_err("Failed to create process device data\n"); + dev_err(dev->adev->dev, + "Failed to create process device data\n"); return -ENOMEM; } /* @@ -407,12 +415,6 @@ int kfd_init_apertures(struct kfd_process *process) return -EINVAL; } } - - /* dGPUs: the reserved space for kernel - * before SVM - */ - pdd->qpd.cwsr_base = SVM_CWSR_BASE; - pdd->qpd.ib_base = SVM_IB_BASE; } dev_dbg(kfd_device, "node id %u\n", id); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c index a7697ec8188e..37b69fe0ede3 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c @@ -129,57 +129,6 @@ enum SQ_INTERRUPT_ERROR_TYPE { KFD_DEBUG_CP_BAD_OP_ECODE_MASK) \ >> KFD_DEBUG_CP_BAD_OP_ECODE_SHIFT) -static void event_interrupt_poison_consumption(struct kfd_node *dev, - uint16_t pasid, uint16_t client_id) -{ - int old_poison, ret = -EINVAL; - struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); - - if (!p) - return; - - /* all queues of a process will be unmapped in one time */ - old_poison = atomic_cmpxchg(&p->poison, 0, 1); - kfd_unref_process(p); - if (old_poison) - return; - - switch (client_id) { - case SOC15_IH_CLIENTID_SE0SH: - case SOC15_IH_CLIENTID_SE1SH: - case SOC15_IH_CLIENTID_SE2SH: - case SOC15_IH_CLIENTID_SE3SH: - case SOC15_IH_CLIENTID_UTCL2: - ret = kfd_dqm_evict_pasid(dev->dqm, pasid); - break; - case SOC15_IH_CLIENTID_SDMA0: - case SOC15_IH_CLIENTID_SDMA1: - case SOC15_IH_CLIENTID_SDMA2: - case SOC15_IH_CLIENTID_SDMA3: - case SOC15_IH_CLIENTID_SDMA4: - break; - default: - break; - } - - kfd_signal_poison_consumed_event(dev, pasid); - - /* resetting queue passes, do page retirement without gpu reset - * resetting queue fails, fallback to gpu reset solution - */ - if (!ret) { - dev_warn(dev->adev->dev, - "RAS poison consumption, unmap queue flow succeeded: client id %d\n", - client_id); - amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false); - } else { - dev_warn(dev->adev->dev, - "RAS poison consumption, fall back to gpu reset flow: client id %d\n", - client_id); - amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true); - } -} - static bool event_interrupt_isr_v10(struct kfd_node *dev, const uint32_t *ih_ring_entry, uint32_t *patched_ihre, @@ -326,17 +275,13 @@ static void event_interrupt_wq_v10(struct kfd_node *dev, REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1, WGP_ID), sq_intr_err_type); - if (sq_intr_err_type != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST && - sq_intr_err_type != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) { - event_interrupt_poison_consumption(dev, pasid, source_id); - return; - } break; default: break; } kfd_signal_event_interrupt(pasid, context_id0 & 0x7fffff, 23); - } else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) { + } else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE && + KFD_DBG_EC_TYPE_IS_PACKET(KFD_DEBUG_CP_BAD_OP_ECODE(context_id0))) { kfd_set_dbg_ev_from_interrupt(dev, pasid, KFD_DEBUG_DOORBELL_ID(context_id0), KFD_EC_MASK(KFD_DEBUG_CP_BAD_OP_ECODE(context_id0)), @@ -355,9 +300,6 @@ static void event_interrupt_wq_v10(struct kfd_node *dev, client_id == SOC15_IH_CLIENTID_SDMA7) { if (source_id == SOC15_INTSRC_SDMA_TRAP) { kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28); - } else if (source_id == SOC15_INTSRC_SDMA_ECC) { - event_interrupt_poison_consumption(dev, pasid, source_id); - return; } } else if (client_id == SOC15_IH_CLIENTID_VMC || client_id == SOC15_IH_CLIENTID_VMC1 || @@ -366,12 +308,6 @@ static void event_interrupt_wq_v10(struct kfd_node *dev, uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry); struct kfd_hsa_memory_exception_data exception_data; - if (client_id == SOC15_IH_CLIENTID_UTCL2 && - amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev)) { - event_interrupt_poison_consumption(dev, pasid, client_id); - return; - } - info.vmid = vmid; info.mc_id = client_id; info.page_addr = ih_ring_entry[4] | diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c index 2a65792fd116..b3f988b275a8 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c @@ -191,7 +191,9 @@ static void print_sq_intr_info_error(uint32_t context_id0, uint32_t context_id1) static void event_interrupt_poison_consumption_v11(struct kfd_node *dev, uint16_t pasid, uint16_t source_id) { + enum amdgpu_ras_block block = 0; int ret = -EINVAL; + uint32_t reset = 0; struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); if (!p) @@ -210,9 +212,14 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev, case SOC15_INTSRC_SQ_INTERRUPT_MSG: if (dev->dqm->ops.reset_queues) ret = dev->dqm->ops.reset_queues(dev->dqm, pasid); + block = AMDGPU_RAS_BLOCK__GFX; + if (ret) + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; break; case SOC21_INTSRC_SDMA_ECC: default: + block = AMDGPU_RAS_BLOCK__GFX; + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; break; } @@ -220,10 +227,7 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev, /* resetting queue passes, do page retirement without gpu reset resetting queue fails, fallback to gpu reset solution */ - if (!ret) - amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false); - else - amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true); + amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset); } static bool event_interrupt_isr_v11(struct kfd_node *dev, @@ -325,11 +329,15 @@ static void event_interrupt_wq_v11(struct kfd_node *dev, /* CP */ if (source_id == SOC15_INTSRC_CP_END_OF_PIPE) kfd_signal_event_interrupt(pasid, context_id0, 32); - else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) - kfd_set_dbg_ev_from_interrupt(dev, pasid, - KFD_CTXID0_DOORBELL_ID(context_id0), + else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE && + KFD_DBG_EC_TYPE_IS_PACKET(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0))) { + u32 doorbell_id = KFD_CTXID0_DOORBELL_ID(context_id0); + + kfd_set_dbg_ev_from_interrupt(dev, pasid, doorbell_id, KFD_EC_MASK(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0)), NULL, 0); + kfd_dqm_suspend_bad_queue_mes(dev, pasid, doorbell_id); + } /* SDMA */ else if (source_id == SOC21_INTSRC_SDMA_TRAP) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c index 27cdaea40501..0cb5c582ce7d 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c @@ -27,6 +27,7 @@ #include "soc15_int.h" #include "kfd_device_queue_manager.h" #include "kfd_smi_events.h" +#include "amdgpu_ras.h" /* * GFX9 SQ Interrupts @@ -143,8 +144,12 @@ enum SQ_INTERRUPT_ERROR_TYPE { static void event_interrupt_poison_consumption_v9(struct kfd_node *dev, uint16_t pasid, uint16_t client_id) { - int old_poison, ret = -EINVAL; + enum amdgpu_ras_block block = 0; + uint32_t reset = 0; struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + enum ras_event_type type = RAS_EVENT_TYPE_POISON_CONSUMPTION; + u64 event_id; + int old_poison, ret; if (!p) return; @@ -161,34 +166,75 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev, case SOC15_IH_CLIENTID_SE2SH: case SOC15_IH_CLIENTID_SE3SH: case SOC15_IH_CLIENTID_UTCL2: - ret = kfd_dqm_evict_pasid(dev->dqm, pasid); + block = AMDGPU_RAS_BLOCK__GFX; + if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3)) { + /* driver mode-2 for gfx poison is only supported by + * pmfw 0x00557300 and onwards */ + if (dev->adev->pm.fw_version < 0x00557300) + reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET; + else + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; + } else if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) { + /* driver mode-2 for gfx poison is only supported by + * pmfw 0x05550C00 and onwards */ + if (dev->adev->pm.fw_version < 0x05550C00) + reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET; + else + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; + } else { + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; + } + amdgpu_ras_set_err_poison(dev->adev, AMDGPU_RAS_BLOCK__GFX); + break; + case SOC15_IH_CLIENTID_VMC: + case SOC15_IH_CLIENTID_VMC1: + block = AMDGPU_RAS_BLOCK__MMHUB; + reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET; break; case SOC15_IH_CLIENTID_SDMA0: case SOC15_IH_CLIENTID_SDMA1: case SOC15_IH_CLIENTID_SDMA2: case SOC15_IH_CLIENTID_SDMA3: case SOC15_IH_CLIENTID_SDMA4: + block = AMDGPU_RAS_BLOCK__SDMA; + if (amdgpu_ip_version(dev->adev, SDMA0_HWIP, 0) == IP_VERSION(4, 4, 2)) { + /* driver mode-2 for gfx poison is only supported by + * pmfw 0x00557300 and onwards */ + if (dev->adev->pm.fw_version < 0x00557300) + reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET; + else + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; + } else if (amdgpu_ip_version(dev->adev, SDMA0_HWIP, 0) == IP_VERSION(4, 4, 5)) { + /* driver mode-2 for gfx poison is only supported by + * pmfw 0x05550C00 and onwards */ + if (dev->adev->pm.fw_version < 0x05550C00) + reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET; + else + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; + } else { + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; + } + amdgpu_ras_set_err_poison(dev->adev, AMDGPU_RAS_BLOCK__SDMA); break; default: - break; + dev_warn(dev->adev->dev, + "client %d does not support poison consumption\n", client_id); + return; } + ret = amdgpu_ras_mark_ras_event(dev->adev, type); + if (ret) + return; + kfd_signal_poison_consumed_event(dev, pasid); - /* resetting queue passes, do page retirement without gpu reset - * resetting queue fails, fallback to gpu reset solution - */ - if (!ret) { - dev_warn(dev->adev->dev, - "RAS poison consumption, unmap queue flow succeeded: client id %d\n", - client_id); - amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false); - } else { - dev_warn(dev->adev->dev, - "RAS poison consumption, fall back to gpu reset flow: client id %d\n", - client_id); - amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true); - } + event_id = amdgpu_ras_acquire_event_id(dev->adev, type); + + RAS_EVENT_LOG(dev->adev, event_id, + "poison is consumed by client %d, kick off gpu reset flow\n", client_id); + + amdgpu_amdkfd_ras_pasid_poison_consumption_handler(dev->adev, + block, pasid, NULL, NULL, reset); } static bool context_id_expected(struct kfd_dev *dev) @@ -385,7 +431,8 @@ static void event_interrupt_wq_v9(struct kfd_node *dev, break; } kfd_signal_event_interrupt(pasid, sq_int_data, 24); - } else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) { + } else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE && + KFD_DBG_EC_TYPE_IS_PACKET(KFD_DEBUG_CP_BAD_OP_ECODE(context_id0))) { kfd_set_dbg_ev_from_interrupt(dev, pasid, KFD_DEBUG_DOORBELL_ID(context_id0), KFD_EC_MASK(KFD_DEBUG_CP_BAD_OP_ECODE(context_id0)), @@ -412,8 +459,7 @@ static void event_interrupt_wq_v9(struct kfd_node *dev, uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry); struct kfd_hsa_memory_exception_data exception_data; - if (client_id == SOC15_IH_CLIENTID_UTCL2 && - amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev)) { + if (source_id == SOC15_INTSRC_VMC_UTCL2_POISON) { event_interrupt_poison_consumption_v9(dev, pasid, client_id); return; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c index dd3c43c1ad70..783c2f5a04e4 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c @@ -46,7 +46,7 @@ #include <linux/kfifo.h> #include "kfd_priv.h" -#define KFD_IH_NUM_ENTRIES 8192 +#define KFD_IH_NUM_ENTRIES 16384 static void interrupt_wq(struct work_struct *); @@ -62,11 +62,14 @@ int kfd_interrupt_init(struct kfd_node *node) return r; } - node->ih_wq = alloc_workqueue("KFD IH", WQ_HIGHPRI, 1); - if (unlikely(!node->ih_wq)) { - kfifo_free(&node->ih_fifo); - dev_err(node->adev->dev, "Failed to allocate KFD IH workqueue\n"); - return -ENOMEM; + if (!node->kfd->ih_wq) { + node->kfd->ih_wq = alloc_workqueue("KFD IH", WQ_HIGHPRI | WQ_UNBOUND, + node->kfd->num_nodes); + if (unlikely(!node->kfd->ih_wq)) { + kfifo_free(&node->ih_fifo); + dev_err(node->adev->dev, "Failed to allocate KFD IH workqueue\n"); + return -ENOMEM; + } } spin_lock_init(&node->interrupt_lock); @@ -96,14 +99,6 @@ void kfd_interrupt_exit(struct kfd_node *node) spin_lock_irqsave(&node->interrupt_lock, flags); node->interrupts_active = false; spin_unlock_irqrestore(&node->interrupt_lock, flags); - - /* - * flush_work ensures that there are no outstanding - * work-queue items that will access interrupt_ring. New work items - * can't be created because we stopped interrupt handling above. - */ - flush_workqueue(node->ih_wq); - kfifo_free(&node->ih_fifo); } @@ -112,55 +107,48 @@ void kfd_interrupt_exit(struct kfd_node *node) */ bool enqueue_ih_ring_entry(struct kfd_node *node, const void *ih_ring_entry) { - int count; - - count = kfifo_in(&node->ih_fifo, ih_ring_entry, - node->kfd->device_info.ih_ring_entry_size); - if (count != node->kfd->device_info.ih_ring_entry_size) { - dev_dbg_ratelimited(node->adev->dev, - "Interrupt ring overflow, dropping interrupt %d\n", - count); + if (kfifo_is_full(&node->ih_fifo)) { + dev_warn_ratelimited(node->adev->dev, "KFD node %d ih_fifo overflow\n", + node->node_id); return false; } + kfifo_in(&node->ih_fifo, ih_ring_entry, node->kfd->device_info.ih_ring_entry_size); return true; } /* * Assumption: single reader/writer. This function is not re-entrant */ -static bool dequeue_ih_ring_entry(struct kfd_node *node, void *ih_ring_entry) +static bool dequeue_ih_ring_entry(struct kfd_node *node, u32 **ih_ring_entry) { int count; - count = kfifo_out(&node->ih_fifo, ih_ring_entry, - node->kfd->device_info.ih_ring_entry_size); - - WARN_ON(count && count != node->kfd->device_info.ih_ring_entry_size); + if (kfifo_is_empty(&node->ih_fifo)) + return false; + count = kfifo_out_linear_ptr(&node->ih_fifo, ih_ring_entry, + node->kfd->device_info.ih_ring_entry_size); + WARN_ON(count != node->kfd->device_info.ih_ring_entry_size); return count == node->kfd->device_info.ih_ring_entry_size; } static void interrupt_wq(struct work_struct *work) { - struct kfd_node *dev = container_of(work, struct kfd_node, - interrupt_work); - uint32_t ih_ring_entry[KFD_MAX_RING_ENTRY_SIZE]; + struct kfd_node *dev = container_of(work, struct kfd_node, interrupt_work); + uint32_t *ih_ring_entry; unsigned long start_jiffies = jiffies; - if (dev->kfd->device_info.ih_ring_entry_size > sizeof(ih_ring_entry)) { - dev_err_once(dev->adev->dev, "Ring entry too small\n"); - return; - } - - while (dequeue_ih_ring_entry(dev, ih_ring_entry)) { + while (dequeue_ih_ring_entry(dev, &ih_ring_entry)) { dev->kfd->device_info.event_interrupt_class->interrupt_wq(dev, ih_ring_entry); + kfifo_skip_count(&dev->ih_fifo, dev->kfd->device_info.ih_ring_entry_size); + if (time_is_before_jiffies(start_jiffies + HZ)) { /* If we spent more than a second processing signals, * reschedule the worker to avoid soft-lockup warnings */ - queue_work(dev->ih_wq, &dev->interrupt_work); + queue_work(dev->kfd->ih_wq, &dev->interrupt_work); break; } } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c index 1bea629c49ca..2b0a830f5b29 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c @@ -32,6 +32,7 @@ #include "kfd_device_queue_manager.h" #include "kfd_pm4_headers.h" #include "kfd_pm4_opcodes.h" +#include "amdgpu_reset.h" #define PM4_COUNT_ZERO (((1 << 15) - 1) << 16) @@ -68,7 +69,7 @@ static bool kq_initialize(struct kernel_queue *kq, struct kfd_node *dev, kq->mqd_mgr = dev->dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]; break; default: - pr_err("Invalid queue type %d\n", type); + dev_err(dev->adev->dev, "Invalid queue type %d\n", type); return false; } @@ -78,13 +79,14 @@ static bool kq_initialize(struct kernel_queue *kq, struct kfd_node *dev, prop.doorbell_ptr = kfd_get_kernel_doorbell(dev->kfd, &prop.doorbell_off); if (!prop.doorbell_ptr) { - pr_err("Failed to initialize doorbell"); + dev_err(dev->adev->dev, "Failed to initialize doorbell"); goto err_get_kernel_doorbell; } retval = kfd_gtt_sa_allocate(dev, queue_size, &kq->pq); if (retval != 0) { - pr_err("Failed to init pq queues size %d\n", queue_size); + dev_err(dev->adev->dev, "Failed to init pq queues size %d\n", + queue_size); goto err_pq_allocate_vidmem; } @@ -123,7 +125,7 @@ static bool kq_initialize(struct kernel_queue *kq, struct kfd_node *dev, memset(kq->pq_kernel_addr, 0, queue_size); memset(kq->rptr_kernel, 0, sizeof(*kq->rptr_kernel)); - memset(kq->wptr_kernel, 0, sizeof(*kq->wptr_kernel)); + memset(kq->wptr_kernel, 0, dev->kfd->device_info.doorbell_size); prop.queue_size = queue_size; prop.is_interop = false; @@ -196,15 +198,17 @@ err_get_kernel_doorbell: } /* Uninitialize a kernel queue and free all its memory usages. */ -static void kq_uninitialize(struct kernel_queue *kq, bool hanging) +static void kq_uninitialize(struct kernel_queue *kq) { - if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ && !hanging) + if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ && down_read_trylock(&kq->dev->adev->reset_domain->sem)) { kq->mqd_mgr->destroy_mqd(kq->mqd_mgr, kq->queue->mqd, KFD_PREEMPT_TYPE_WAVEFRONT_RESET, KFD_UNMAP_LATENCY_MS, kq->queue->pipe, kq->queue->queue); + up_read(&kq->dev->adev->reset_domain->sem); + } else if (kq->queue->properties.type == KFD_QUEUE_TYPE_DIQ) kfd_gtt_sa_free(kq->dev, kq->fence_mem_obj); @@ -286,7 +290,7 @@ err_no_space: return -ENOMEM; } -void kq_submit_packet(struct kernel_queue *kq) +int kq_submit_packet(struct kernel_queue *kq) { #ifdef DEBUG int i; @@ -298,15 +302,26 @@ void kq_submit_packet(struct kernel_queue *kq) } pr_debug("\n"); #endif + /* Fatal err detected, packet submission won't go through */ + if (amdgpu_amdkfd_is_fed(kq->dev->adev)) + return -EIO; + + /* Make sure ring buffer is updated before wptr updated */ + mb(); + if (kq->dev->kfd->device_info.doorbell_size == 8) { *kq->wptr64_kernel = kq->pending_wptr64; + mb(); /* Make sure wptr updated before ring doorbell */ write_kernel_doorbell64(kq->queue->properties.doorbell_ptr, kq->pending_wptr64); } else { *kq->wptr_kernel = kq->pending_wptr; + mb(); /* Make sure wptr updated before ring doorbell */ write_kernel_doorbell(kq->queue->properties.doorbell_ptr, kq->pending_wptr); } + + return 0; } void kq_rollback_packet(struct kernel_queue *kq) @@ -332,15 +347,15 @@ struct kernel_queue *kernel_queue_init(struct kfd_node *dev, if (kq_initialize(kq, dev, type, KFD_KERNEL_QUEUE_SIZE)) return kq; - pr_err("Failed to init kernel queue\n"); + dev_err(dev->adev->dev, "Failed to init kernel queue\n"); kfree(kq); return NULL; } -void kernel_queue_uninit(struct kernel_queue *kq, bool hanging) +void kernel_queue_uninit(struct kernel_queue *kq) { - kq_uninitialize(kq, hanging); + kq_uninitialize(kq); kfree(kq); } @@ -351,26 +366,26 @@ static __attribute__((unused)) void test_kq(struct kfd_node *dev) uint32_t *buffer, i; int retval; - pr_err("Starting kernel queue test\n"); + dev_err(dev->adev->dev, "Starting kernel queue test\n"); kq = kernel_queue_init(dev, KFD_QUEUE_TYPE_HIQ); if (unlikely(!kq)) { - pr_err(" Failed to initialize HIQ\n"); - pr_err("Kernel queue test failed\n"); + dev_err(dev->adev->dev, " Failed to initialize HIQ\n"); + dev_err(dev->adev->dev, "Kernel queue test failed\n"); return; } retval = kq_acquire_packet_buffer(kq, 5, &buffer); if (unlikely(retval != 0)) { - pr_err(" Failed to acquire packet buffer\n"); - pr_err("Kernel queue test failed\n"); + dev_err(dev->adev->dev, " Failed to acquire packet buffer\n"); + dev_err(dev->adev->dev, "Kernel queue test failed\n"); return; } for (i = 0; i < 5; i++) buffer[i] = kq->nop_packet; kq_submit_packet(kq); - pr_err("Ending kernel queue test\n"); + dev_err(dev->adev->dev, "Ending kernel queue test\n"); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h index 9a6244430845..e24ee50acdf0 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h @@ -47,7 +47,7 @@ int kq_acquire_packet_buffer(struct kernel_queue *kq, size_t packet_size_in_dwords, unsigned int **buffer_ptr); -void kq_submit_packet(struct kernel_queue *kq); +int kq_submit_packet(struct kernel_queue *kq); void kq_rollback_packet(struct kernel_queue *kq); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index bdc01ca9609a..d05d199b5e44 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -77,7 +77,7 @@ svm_migrate_gart_map(struct amdgpu_ring *ring, uint64_t npages, dst_addr = amdgpu_bo_gpu_offset(adev->gart.bo); amdgpu_emit_copy_buffer(adev, &job->ibs[0], src_addr, - dst_addr, num_bytes, false); + dst_addr, num_bytes, 0); amdgpu_ring_pad_ib(ring, &job->ibs[0]); WARN_ON(job->ibs[0].length_dw > num_dw); @@ -153,7 +153,7 @@ svm_migrate_copy_memory_gart(struct amdgpu_device *adev, dma_addr_t *sys, } r = amdgpu_copy_buffer(ring, gart_s, gart_d, size * PAGE_SIZE, - NULL, &next, false, true, false); + NULL, &next, false, true, 0); if (r) { dev_err(adev->dev, "fail %d to copy memory\n", r); goto out_unlock; @@ -278,10 +278,11 @@ svm_migrate_copy_to_vram(struct kfd_node *node, struct svm_range *prange, struct migrate_vma *migrate, struct dma_fence **mfence, dma_addr_t *scratch, uint64_t ttm_res_offset) { - uint64_t npages = migrate->cpages; + uint64_t npages = migrate->npages; struct amdgpu_device *adev = node->adev; struct device *dev = adev->dev; struct amdgpu_res_cursor cursor; + uint64_t mpages = 0; dma_addr_t *src; uint64_t *dst; uint64_t i, j; @@ -295,18 +296,20 @@ svm_migrate_copy_to_vram(struct kfd_node *node, struct svm_range *prange, amdgpu_res_first(prange->ttm_res, ttm_res_offset, npages << PAGE_SHIFT, &cursor); - for (i = j = 0; i < npages; i++) { + for (i = j = 0; (i < npages) && (mpages < migrate->cpages); i++) { struct page *spage; - dst[i] = cursor.start + (j << PAGE_SHIFT); - migrate->dst[i] = svm_migrate_addr_to_pfn(adev, dst[i]); - svm_migrate_get_vram_page(prange, migrate->dst[i]); - migrate->dst[i] = migrate_pfn(migrate->dst[i]); - + if (migrate->src[i] & MIGRATE_PFN_MIGRATE) { + dst[i] = cursor.start + (j << PAGE_SHIFT); + migrate->dst[i] = svm_migrate_addr_to_pfn(adev, dst[i]); + svm_migrate_get_vram_page(prange, migrate->dst[i]); + migrate->dst[i] = migrate_pfn(migrate->dst[i]); + mpages++; + } spage = migrate_pfn_to_page(migrate->src[i]); if (spage && !is_zone_device_page(spage)) { src[i] = dma_map_page(dev, spage, 0, PAGE_SIZE, - DMA_TO_DEVICE); + DMA_BIDIRECTIONAL); r = dma_mapping_error(dev, src[i]); if (r) { dev_err(dev, "%s: fail %d dma_map_page\n", @@ -353,9 +356,12 @@ svm_migrate_copy_to_vram(struct kfd_node *node, struct svm_range *prange, out_free_vram_pages: if (r) { pr_debug("failed %d to copy memory to vram\n", r); - while (i--) { + for (i = 0; i < npages && mpages; i++) { + if (!dst[i]) + continue; svm_migrate_put_vram_page(adev, dst[i]); migrate->dst[i] = 0; + mpages--; } } @@ -445,14 +451,13 @@ svm_migrate_vma_to_vram(struct kfd_node *node, struct svm_range *prange, pr_debug("successful/cpages/npages 0x%lx/0x%lx/0x%lx\n", mpages, cpages, migrate.npages); - kfd_smi_event_migration_end(node, p->lead_thread->pid, - start >> PAGE_SHIFT, end >> PAGE_SHIFT, - 0, node->id, trigger); - svm_range_dma_unmap_dev(adev->dev, scratch, 0, npages); out_free: kvfree(buf); + kfd_smi_event_migration_end(node, p->lead_thread->pid, + start >> PAGE_SHIFT, end >> PAGE_SHIFT, + 0, node->id, trigger, r); out: if (!r && mpages) { pdd = svm_range_get_pdd_by_node(prange, node); @@ -509,10 +514,19 @@ svm_migrate_ram_to_vram(struct svm_range *prange, uint32_t best_loc, start = start_mgr << PAGE_SHIFT; end = (last_mgr + 1) << PAGE_SHIFT; + r = amdgpu_amdkfd_reserve_mem_limit(node->adev, + prange->npages * PAGE_SIZE, + KFD_IOC_ALLOC_MEM_FLAGS_VRAM, + node->xcp ? node->xcp->id : 0); + if (r) { + dev_dbg(node->adev->dev, "failed to reserve VRAM, r: %ld\n", r); + return -ENOSPC; + } + r = svm_range_vram_node_new(node, prange, true); if (r) { dev_dbg(node->adev->dev, "fail %ld to alloc vram\n", r); - return r; + goto out; } ttm_res_offset = (start_mgr - prange->start + prange->offset) << PAGE_SHIFT; @@ -545,6 +559,11 @@ svm_migrate_ram_to_vram(struct svm_range *prange, uint32_t best_loc, svm_range_vram_node_free(prange); } +out: + amdgpu_amdkfd_unreserve_mem_limit(node->adev, + prange->npages * PAGE_SIZE, + KFD_IOC_ALLOC_MEM_FLAGS_VRAM, + node->xcp ? node->xcp->id : 0); return r < 0 ? r : 0; } @@ -616,7 +635,7 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct svm_range *prange, goto out_oom; } - dst[i] = dma_map_page(dev, dpage, 0, PAGE_SIZE, DMA_FROM_DEVICE); + dst[i] = dma_map_page(dev, dpage, 0, PAGE_SIZE, DMA_BIDIRECTIONAL); r = dma_mapping_error(dev, dst[i]); if (r) { dev_err(adev->dev, "%s: fail %d dma_map_page\n", __func__, r); @@ -737,14 +756,13 @@ svm_migrate_vma_to_ram(struct kfd_node *node, struct svm_range *prange, svm_migrate_copy_done(adev, mfence); migrate_vma_finalize(&migrate); - kfd_smi_event_migration_end(node, p->lead_thread->pid, - start >> PAGE_SHIFT, end >> PAGE_SHIFT, - node->id, 0, trigger); - svm_range_dma_unmap_dev(adev->dev, scratch, 0, npages); out_free: kvfree(buf); + kfd_smi_event_migration_end(node, p->lead_thread->pid, + start >> PAGE_SHIFT, end >> PAGE_SHIFT, + node->id, 0, trigger, r); out: if (!r && cpages) { mpages = cpages - upages; @@ -1009,7 +1027,7 @@ int kgd2kfd_init_zone_device(struct amdgpu_device *adev) if (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(9, 0, 1)) return -EINVAL; - if (adev->gmc.is_app_apu) + if (adev->flags & AMD_IS_APU) return 0; pgmap = &kfddev->pgmap; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c index 050a6936ff84..d9ae854b6908 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c @@ -118,18 +118,20 @@ void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm, * attention grabbing. */ if (gfx_info->max_shader_engines > KFD_MAX_NUM_SE) { - pr_err("Exceeded KFD_MAX_NUM_SE, chip reports %d\n", - gfx_info->max_shader_engines); + dev_err(mm->dev->adev->dev, + "Exceeded KFD_MAX_NUM_SE, chip reports %d\n", + gfx_info->max_shader_engines); return; } if (gfx_info->max_sh_per_se > KFD_MAX_NUM_SH_PER_SE) { - pr_err("Exceeded KFD_MAX_NUM_SH, chip reports %d\n", + dev_err(mm->dev->adev->dev, + "Exceeded KFD_MAX_NUM_SH, chip reports %d\n", gfx_info->max_sh_per_se * gfx_info->max_shader_engines); return; } cu_bitmap_sh_mul = (KFD_GC_VERSION(mm->dev) >= IP_VERSION(11, 0, 0) && - KFD_GC_VERSION(mm->dev) < IP_VERSION(12, 0, 0)) ? 2 : 1; + KFD_GC_VERSION(mm->dev) < IP_VERSION(13, 0, 0)) ? 2 : 1; /* Count active CUs per SH. * @@ -223,7 +225,7 @@ void kfd_free_mqd_cp(struct mqd_manager *mm, void *mqd, struct kfd_mem_obj *mqd_mem_obj) { if (mqd_mem_obj->gtt_mem) { - amdgpu_amdkfd_free_gtt_mem(mm->dev->adev, mqd_mem_obj->gtt_mem); + amdgpu_amdkfd_free_gtt_mem(mm->dev->adev, &mqd_mem_obj->gtt_mem); kfree(mqd_mem_obj); } else { kfd_gtt_sa_free(mm->dev, mqd_mem_obj); @@ -290,3 +292,21 @@ uint64_t kfd_mqd_stride(struct mqd_manager *mm, { return mm->mqd_size; } + +bool kfd_check_hiq_mqd_doorbell_id(struct kfd_node *node, uint32_t doorbell_id, + uint32_t inst) +{ + if (doorbell_id) { + struct device *dev = node->adev->dev; + + if (node->adev->xcp_mgr && node->adev->xcp_mgr->num_xcps > 0) + dev_err(dev, "XCC %d: Queue preemption failed for queue with doorbell_id: %x\n", + inst, doorbell_id); + else + dev_err(dev, "Queue preemption failed for queue with doorbell_id: %x\n", + doorbell_id); + return true; + } + + return false; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h index 57bf5e513f4d..17cc1f25c8d0 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h @@ -119,7 +119,7 @@ struct mqd_manager { #if defined(CONFIG_DEBUG_FS) int (*debugfs_show_mqd)(struct seq_file *m, void *data); #endif - uint32_t (*read_doorbell_id)(void *mqd); + bool (*check_preemption_failed)(struct mqd_manager *mm, void *mqd); uint64_t (*mqd_stride)(struct mqd_manager *mm, struct queue_properties *p); @@ -128,6 +128,31 @@ struct mqd_manager { uint32_t mqd_size; }; +struct mqd_user_context_save_area_header { + /* Byte offset from start of user context + * save area to the last saved top (lowest + * address) of control stack data. Must be + * 4 byte aligned. + */ + uint32_t control_stack_offset; + + /* Byte size of the last saved control stack + * data. Must be 4 byte aligned. + */ + uint32_t control_stack_size; + + /* Byte offset from start of user context save + * area to the last saved base (lowest address) + * of wave state data. Must be 4 byte aligned. + */ + uint32_t wave_state_offset; + + /* Byte size of the last saved wave state data. + * Must be 4 byte aligned. + */ + uint32_t wave_state_size; +}; + struct kfd_mem_obj *allocate_hiq_mqd(struct kfd_node *dev, struct queue_properties *q); @@ -173,4 +198,6 @@ void kfd_get_hiq_xcc_mqd(struct kfd_node *dev, uint64_t kfd_hiq_mqd_stride(struct kfd_node *dev); uint64_t kfd_mqd_stride(struct mqd_manager *mm, struct queue_properties *q); +bool kfd_check_hiq_mqd_doorbell_id(struct kfd_node *node, uint32_t doorbell_id, + uint32_t inst); #endif /* KFD_MQD_MANAGER_H_ */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c index 1a4a69943c71..05f3ac2eaef9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c @@ -206,11 +206,11 @@ static void __update_mqd(struct mqd_manager *mm, void *mqd, q->is_active = QUEUE_IS_ACTIVE(*q); } -static uint32_t read_doorbell_id(void *mqd) +static bool check_preemption_failed(struct mqd_manager *mm, void *mqd) { struct cik_mqd *m = (struct cik_mqd *)mqd; - return m->queue_doorbell_id0; + return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0); } static void update_mqd(struct mqd_manager *mm, void *mqd, @@ -423,7 +423,7 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd; #endif - mqd->read_doorbell_id = read_doorbell_id; + mqd->check_preemption_failed = check_preemption_failed; break; case KFD_MQD_TYPE_DIQ: mqd->allocate_mqd = allocate_mqd; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c index 22cbfa1bdadd..1695dd78ede8 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c @@ -107,6 +107,8 @@ static void init_mqd(struct mqd_manager *mm, void **mqd, m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK | 0x53 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT; + m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; + m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__UNORD_DISPATCH_MASK; m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT; m->cp_mqd_base_addr_lo = lower_32_bits(addr); @@ -167,10 +169,10 @@ static void update_mqd(struct mqd_manager *mm, void *mqd, m = get_mqd(mqd); - m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; + m->cp_hqd_pq_control &= ~CP_HQD_PQ_CONTROL__QUEUE_SIZE_MASK; m->cp_hqd_pq_control |= ffs(q->queue_size / sizeof(unsigned int)) - 1 - 1; - m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__UNORD_DISPATCH_MASK; + pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); @@ -224,11 +226,11 @@ static void update_mqd(struct mqd_manager *mm, void *mqd, q->is_active = QUEUE_IS_ACTIVE(*q); } -static uint32_t read_doorbell_id(void *mqd) +static bool check_preemption_failed(struct mqd_manager *mm, void *mqd) { struct v10_compute_mqd *m = (struct v10_compute_mqd *)mqd; - return m->queue_doorbell_id0; + return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0); } static int get_wave_state(struct mqd_manager *mm, void *mqd, @@ -488,7 +490,7 @@ struct mqd_manager *mqd_manager_init_v10(enum KFD_MQD_TYPE type, #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd; #endif - mqd->read_doorbell_id = read_doorbell_id; + mqd->check_preemption_failed = check_preemption_failed; pr_debug("%s@%i\n", __func__, __LINE__); break; case KFD_MQD_TYPE_DIQ: diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c index 826bc4f6c8a7..3c0ae28c5923 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c @@ -154,6 +154,8 @@ static void init_mqd(struct mqd_manager *mm, void **mqd, m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK | 0x55 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT; + m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; + m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__UNORD_DISPATCH_MASK; m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT; m->cp_mqd_base_addr_lo = lower_32_bits(addr); @@ -221,10 +223,9 @@ static void update_mqd(struct mqd_manager *mm, void *mqd, m = get_mqd(mqd); - m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; + m->cp_hqd_pq_control &= ~CP_HQD_PQ_CONTROL__QUEUE_SIZE_MASK; m->cp_hqd_pq_control |= ffs(q->queue_size / sizeof(unsigned int)) - 1 - 1; - m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__UNORD_DISPATCH_MASK; pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); @@ -278,11 +279,11 @@ static void update_mqd(struct mqd_manager *mm, void *mqd, q->is_active = QUEUE_IS_ACTIVE(*q); } -static uint32_t read_doorbell_id(void *mqd) +static bool check_preemption_failed(struct mqd_manager *mm, void *mqd) { struct v11_compute_mqd *m = (struct v11_compute_mqd *)mqd; - return m->queue_doorbell_id0; + return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0); } static int get_wave_state(struct mqd_manager *mm, void *mqd, @@ -517,7 +518,7 @@ struct mqd_manager *mqd_manager_init_v11(enum KFD_MQD_TYPE type, #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd; #endif - mqd->read_doorbell_id = read_doorbell_id; + mqd->check_preemption_failed = check_preemption_failed; pr_debug("%s@%i\n", __func__, __LINE__); break; case KFD_MQD_TYPE_DIQ: diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12.c new file mode 100644 index 000000000000..565858b9044d --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12.c @@ -0,0 +1,459 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT +/* + * Copyright 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include <linux/printk.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include "kfd_priv.h" +#include "kfd_mqd_manager.h" +#include "v12_structs.h" +#include "gc/gc_12_0_0_sh_mask.h" +#include "amdgpu_amdkfd.h" + +static inline struct v12_compute_mqd *get_mqd(void *mqd) +{ + return (struct v12_compute_mqd *)mqd; +} + +static inline struct v12_sdma_mqd *get_sdma_mqd(void *mqd) +{ + return (struct v12_sdma_mqd *)mqd; +} + +static void update_cu_mask(struct mqd_manager *mm, void *mqd, + struct mqd_update_info *minfo) +{ + struct v12_compute_mqd *m; + uint32_t se_mask[KFD_MAX_NUM_SE] = {0}; + + if (!minfo || !minfo->cu_mask.ptr) + return; + + mqd_symmetrically_map_cu_mask(mm, + minfo->cu_mask.ptr, minfo->cu_mask.count, se_mask, 0); + + m = get_mqd(mqd); + m->compute_static_thread_mgmt_se0 = se_mask[0]; + m->compute_static_thread_mgmt_se1 = se_mask[1]; + m->compute_static_thread_mgmt_se2 = se_mask[2]; + m->compute_static_thread_mgmt_se3 = se_mask[3]; + m->compute_static_thread_mgmt_se4 = se_mask[4]; + m->compute_static_thread_mgmt_se5 = se_mask[5]; + m->compute_static_thread_mgmt_se6 = se_mask[6]; + m->compute_static_thread_mgmt_se7 = se_mask[7]; + + pr_debug("update cu mask to %#x %#x %#x %#x %#x %#x %#x %#x\n", + m->compute_static_thread_mgmt_se0, + m->compute_static_thread_mgmt_se1, + m->compute_static_thread_mgmt_se2, + m->compute_static_thread_mgmt_se3, + m->compute_static_thread_mgmt_se4, + m->compute_static_thread_mgmt_se5, + m->compute_static_thread_mgmt_se6, + m->compute_static_thread_mgmt_se7); +} + +static void set_priority(struct v12_compute_mqd *m, struct queue_properties *q) +{ + m->cp_hqd_pipe_priority = pipe_priority_map[q->priority]; + m->cp_hqd_queue_priority = q->priority; +} + +static struct kfd_mem_obj *allocate_mqd(struct kfd_node *node, + struct queue_properties *q) +{ + struct kfd_mem_obj *mqd_mem_obj; + + /* + * Allocate one PAGE_SIZE memory for MQD as MES writes to areas beyond + * struct MQD size. + */ + if (kfd_gtt_sa_allocate(node, PAGE_SIZE, &mqd_mem_obj)) + return NULL; + + return mqd_mem_obj; +} + +static void init_mqd(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + uint64_t addr; + struct v12_compute_mqd *m; + + m = (struct v12_compute_mqd *) mqd_mem_obj->cpu_ptr; + addr = mqd_mem_obj->gpu_addr; + + memset(m, 0, PAGE_SIZE); + + m->header = 0xC0310800; + m->compute_pipelinestat_enable = 1; + m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se4 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se5 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se6 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se7 = 0xFFFFFFFF; + + m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK | + 0x55 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT; + + m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; + m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__UNORD_DISPATCH_MASK; + m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT; + + m->cp_mqd_base_addr_lo = lower_32_bits(addr); + m->cp_mqd_base_addr_hi = upper_32_bits(addr); + + m->cp_hqd_quantum = 1 << CP_HQD_QUANTUM__QUANTUM_EN__SHIFT | + 1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT | + 1 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT; + + /* Set cp_hqd_hq_status0.c_queue_debug_en to 1 to have the CP set up the + * DISPATCH_PTR. This is required for the kfd debugger + */ + m->cp_hqd_hq_status0 = 1 << 14; + + if (amdgpu_amdkfd_have_atomics_support(mm->dev->adev)) + m->cp_hqd_hq_status0 |= 1 << 29; + + if (q->format == KFD_QUEUE_FORMAT_AQL) { + m->cp_hqd_aql_control = + 1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT; + } + + if (mm->dev->kfd->cwsr_enabled) { + m->cp_hqd_persistent_state |= + (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT); + m->cp_hqd_ctx_save_base_addr_lo = + lower_32_bits(q->ctx_save_restore_area_address); + m->cp_hqd_ctx_save_base_addr_hi = + upper_32_bits(q->ctx_save_restore_area_address); + m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size; + m->cp_hqd_cntl_stack_size = q->ctl_stack_size; + m->cp_hqd_cntl_stack_offset = q->ctl_stack_size; + m->cp_hqd_wg_state_offset = q->ctl_stack_size; + } + + *mqd = m; + if (gart_addr) + *gart_addr = addr; + mm->update_mqd(mm, m, q, NULL); +} + +static int load_mqd(struct mqd_manager *mm, void *mqd, + uint32_t pipe_id, uint32_t queue_id, + struct queue_properties *p, struct mm_struct *mms) +{ + int r = 0; + /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */ + uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 4 : 0); + + r = mm->dev->kfd2kgd->hqd_load(mm->dev->adev, mqd, pipe_id, queue_id, + (uint32_t __user *)p->write_ptr, + wptr_shift, 0, mms, 0); + return r; +} + +static void update_mqd(struct mqd_manager *mm, void *mqd, + struct queue_properties *q, + struct mqd_update_info *minfo) +{ + struct v12_compute_mqd *m; + + m = get_mqd(mqd); + + m->cp_hqd_pq_control &= ~CP_HQD_PQ_CONTROL__QUEUE_SIZE_MASK; + m->cp_hqd_pq_control |= + ffs(q->queue_size / sizeof(unsigned int)) - 1 - 1; + pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); + + m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); + m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); + + m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr); + m->cp_hqd_pq_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr); + + m->cp_hqd_pq_doorbell_control = + q->doorbell_off << + CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT; + pr_debug("cp_hqd_pq_doorbell_control 0x%x\n", + m->cp_hqd_pq_doorbell_control); + + m->cp_hqd_ib_control = 3 << CP_HQD_IB_CONTROL__MIN_IB_AVAIL_SIZE__SHIFT; + + /* + * HW does not clamp this field correctly. Maximum EOP queue size + * is constrained by per-SE EOP done signal count, which is 8-bit. + * Limit is 0xFF EOP entries (= 0x7F8 dwords). CP will not submit + * more than (EOP entry count - 1) so a queue size of 0x800 dwords + * is safe, giving a maximum field value of 0xA. + */ + m->cp_hqd_eop_control = min(0xA, + ffs(q->eop_ring_buffer_size / sizeof(unsigned int)) - 1 - 1); + m->cp_hqd_eop_base_addr_lo = + lower_32_bits(q->eop_ring_buffer_address >> 8); + m->cp_hqd_eop_base_addr_hi = + upper_32_bits(q->eop_ring_buffer_address >> 8); + + m->cp_hqd_iq_timer = 0; + + m->cp_hqd_vmid = q->vmid; + + if (q->format == KFD_QUEUE_FORMAT_AQL) { + /* GC 10 removed WPP_CLAMP from PQ Control */ + m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK | + 2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT | + 1 << CP_HQD_PQ_CONTROL__QUEUE_FULL_EN__SHIFT; + m->cp_hqd_pq_doorbell_control |= + 1 << CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_BIF_DROP__SHIFT; + } + if (mm->dev->kfd->cwsr_enabled) + m->cp_hqd_ctx_save_control = 0; + + update_cu_mask(mm, mqd, minfo); + set_priority(m, q); + + q->is_active = QUEUE_IS_ACTIVE(*q); +} + +static bool check_preemption_failed(struct mqd_manager *mm, void *mqd) +{ + struct v12_compute_mqd *m = (struct v12_compute_mqd *)mqd; + + return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0); +} + +static int get_wave_state(struct mqd_manager *mm, void *mqd, + struct queue_properties *q, + void __user *ctl_stack, + u32 *ctl_stack_used_size, + u32 *save_area_used_size) +{ + struct v12_compute_mqd *m; + struct mqd_user_context_save_area_header header; + + m = get_mqd(mqd); + + /* Control stack is written backwards, while workgroup context data + * is written forwards. Both starts from m->cp_hqd_cntl_stack_size. + * Current position is at m->cp_hqd_cntl_stack_offset and + * m->cp_hqd_wg_state_offset, respectively. + */ + *ctl_stack_used_size = m->cp_hqd_cntl_stack_size - + m->cp_hqd_cntl_stack_offset; + *save_area_used_size = m->cp_hqd_wg_state_offset - + m->cp_hqd_cntl_stack_size; + + /* Control stack is not copied to user mode for GFXv12 because + * it's part of the context save area that is already + * accessible to user mode + */ + header.control_stack_size = *ctl_stack_used_size; + header.wave_state_size = *save_area_used_size; + + header.wave_state_offset = m->cp_hqd_wg_state_offset; + header.control_stack_offset = m->cp_hqd_cntl_stack_offset; + + if (copy_to_user(ctl_stack, &header, sizeof(header))) + return -EFAULT; + + return 0; +} + +static void init_mqd_hiq(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + struct v12_compute_mqd *m; + + init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q); + + m = get_mqd(*mqd); + + m->cp_hqd_pq_control |= 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT | + 1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT; +} + +static void init_mqd_sdma(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + struct v12_sdma_mqd *m; + + m = (struct v12_sdma_mqd *) mqd_mem_obj->cpu_ptr; + + memset(m, 0, sizeof(struct v12_sdma_mqd)); + + *mqd = m; + if (gart_addr) + *gart_addr = mqd_mem_obj->gpu_addr; + + mm->update_mqd(mm, m, q, NULL); +} + +#define SDMA_RLC_DUMMY_DEFAULT 0xf + +static void update_mqd_sdma(struct mqd_manager *mm, void *mqd, + struct queue_properties *q, + struct mqd_update_info *minfo) +{ + struct v12_sdma_mqd *m; + + m = get_sdma_mqd(mqd); + m->sdmax_rlcx_rb_cntl = (ffs(q->queue_size / sizeof(unsigned int)) - 1) + << SDMA0_QUEUE0_RB_CNTL__RB_SIZE__SHIFT | + q->vmid << SDMA0_QUEUE0_RB_CNTL__RB_VMID__SHIFT | + 1 << SDMA0_QUEUE0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | + 6 << SDMA0_QUEUE0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT | + 1 << SDMA0_QUEUE0_RB_CNTL__MCU_WPTR_POLL_ENABLE__SHIFT; + + m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8); + m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8); + m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr); + m->sdmax_rlcx_rb_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr); + m->sdmax_rlcx_rb_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr); + m->sdmax_rlcx_doorbell_offset = + q->doorbell_off << SDMA0_QUEUE0_DOORBELL_OFFSET__OFFSET__SHIFT; + + m->sdmax_rlcx_sched_cntl = (amdgpu_sdma_phase_quantum + << SDMA0_QUEUE0_SCHEDULE_CNTL__CONTEXT_QUANTUM__SHIFT) + & SDMA0_QUEUE0_SCHEDULE_CNTL__CONTEXT_QUANTUM_MASK; + + m->sdma_engine_id = q->sdma_engine_id; + m->sdma_queue_id = q->sdma_queue_id; + + m->sdmax_rlcx_dummy_reg = SDMA_RLC_DUMMY_DEFAULT; + + q->is_active = QUEUE_IS_ACTIVE(*q); +} + +#if defined(CONFIG_DEBUG_FS) + +static int debugfs_show_mqd(struct seq_file *m, void *data) +{ + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + data, sizeof(struct v12_compute_mqd), false); + return 0; +} + +static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) +{ + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + data, sizeof(struct v12_sdma_mqd), false); + return 0; +} + +#endif + +struct mqd_manager *mqd_manager_init_v12(enum KFD_MQD_TYPE type, + struct kfd_node *dev) +{ + struct mqd_manager *mqd; + + if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) + return NULL; + + mqd = kzalloc(sizeof(*mqd), GFP_KERNEL); + if (!mqd) + return NULL; + + mqd->dev = dev; + + switch (type) { + case KFD_MQD_TYPE_CP: + pr_debug("%s@%i\n", __func__, __LINE__); + mqd->allocate_mqd = allocate_mqd; + mqd->init_mqd = init_mqd; + mqd->free_mqd = kfd_free_mqd_cp; + mqd->load_mqd = load_mqd; + mqd->update_mqd = update_mqd; + mqd->destroy_mqd = kfd_destroy_mqd_cp; + mqd->is_occupied = kfd_is_occupied_cp; + mqd->mqd_size = sizeof(struct v12_compute_mqd); + mqd->get_wave_state = get_wave_state; + mqd->mqd_stride = kfd_mqd_stride; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif + pr_debug("%s@%i\n", __func__, __LINE__); + break; + case KFD_MQD_TYPE_HIQ: + pr_debug("%s@%i\n", __func__, __LINE__); + mqd->allocate_mqd = allocate_hiq_mqd; + mqd->init_mqd = init_mqd_hiq; + mqd->free_mqd = free_mqd_hiq_sdma; + mqd->load_mqd = kfd_hiq_load_mqd_kiq; + mqd->update_mqd = update_mqd; + mqd->destroy_mqd = kfd_destroy_mqd_cp; + mqd->is_occupied = kfd_is_occupied_cp; + mqd->mqd_size = sizeof(struct v12_compute_mqd); + mqd->mqd_stride = kfd_mqd_stride; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif + mqd->check_preemption_failed = check_preemption_failed; + pr_debug("%s@%i\n", __func__, __LINE__); + break; + case KFD_MQD_TYPE_DIQ: + mqd->allocate_mqd = allocate_mqd; + mqd->init_mqd = init_mqd_hiq; + mqd->free_mqd = kfd_free_mqd_cp; + mqd->load_mqd = load_mqd; + mqd->update_mqd = update_mqd; + mqd->destroy_mqd = kfd_destroy_mqd_cp; + mqd->is_occupied = kfd_is_occupied_cp; + mqd->mqd_size = sizeof(struct v12_compute_mqd); +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif + break; + case KFD_MQD_TYPE_SDMA: + pr_debug("%s@%i\n", __func__, __LINE__); + mqd->allocate_mqd = allocate_mqd; + mqd->init_mqd = init_mqd_sdma; + mqd->free_mqd = kfd_free_mqd_cp; + mqd->load_mqd = kfd_load_mqd_sdma; + mqd->update_mqd = update_mqd_sdma; + mqd->destroy_mqd = kfd_destroy_mqd_sdma; + mqd->is_occupied = kfd_is_occupied_sdma; + mqd->mqd_size = sizeof(struct v12_sdma_mqd); + mqd->mqd_stride = kfd_mqd_stride; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; +#endif + pr_debug("%s@%i\n", __func__, __LINE__); + break; + default: + kfree(mqd); + return NULL; + } + + return mqd; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c index 697b6d530d12..3014925d95ff 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c @@ -77,7 +77,9 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd, m->compute_static_thread_mgmt_se1 = se_mask[1]; m->compute_static_thread_mgmt_se2 = se_mask[2]; m->compute_static_thread_mgmt_se3 = se_mask[3]; - if (KFD_GC_VERSION(mm->dev) != IP_VERSION(9, 4, 3)) { + if (KFD_GC_VERSION(mm->dev) != IP_VERSION(9, 4, 3) && + KFD_GC_VERSION(mm->dev) != IP_VERSION(9, 4, 4) && + KFD_GC_VERSION(mm->dev) != IP_VERSION(9, 5, 0)) { m->compute_static_thread_mgmt_se4 = se_mask[4]; m->compute_static_thread_mgmt_se5 = se_mask[5]; m->compute_static_thread_mgmt_se6 = se_mask[6]; @@ -181,6 +183,9 @@ static void init_mqd(struct mqd_manager *mm, void **mqd, m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK | 0x53 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT; + m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; + m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__UNORD_DISPATCH_MASK; + m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT; m->cp_mqd_base_addr_lo = lower_32_bits(addr); @@ -243,7 +248,7 @@ static void update_mqd(struct mqd_manager *mm, void *mqd, m = get_mqd(mqd); - m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; + m->cp_hqd_pq_control &= ~CP_HQD_PQ_CONTROL__QUEUE_SIZE_MASK; m->cp_hqd_pq_control |= order_base_2(q->queue_size / 4) - 1; pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); @@ -299,7 +304,9 @@ static void update_mqd(struct mqd_manager *mm, void *mqd, if (mm->dev->kfd->cwsr_enabled && q->ctx_save_restore_area_address) m->cp_hqd_ctx_save_control = 0; - if (KFD_GC_VERSION(mm->dev) != IP_VERSION(9, 4, 3)) + if (KFD_GC_VERSION(mm->dev) != IP_VERSION(9, 4, 3) && + KFD_GC_VERSION(mm->dev) != IP_VERSION(9, 4, 4) && + KFD_GC_VERSION(mm->dev) != IP_VERSION(9, 5, 0)) update_cu_mask(mm, mqd, minfo, 0); set_priority(m, q); @@ -316,11 +323,14 @@ static void update_mqd(struct mqd_manager *mm, void *mqd, } -static uint32_t read_doorbell_id(void *mqd) +static bool check_preemption_failed(struct mqd_manager *mm, void *mqd) { struct v9_mqd *m = (struct v9_mqd *)mqd; + uint32_t doorbell_id = m->queue_doorbell_id0; + + m->queue_doorbell_id0 = 0; - return m->queue_doorbell_id0; + return kfd_check_hiq_mqd_doorbell_id(mm->dev, doorbell_id, 0); } static int get_wave_state(struct mqd_manager *mm, void *mqd, @@ -544,6 +554,9 @@ static void init_mqd_hiq_v9_4_3(struct mqd_manager *mm, void **mqd, m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK | 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT | 1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT; + if (amdgpu_sriov_vf(mm->dev->adev)) + m->cp_hqd_pq_doorbell_control |= 1 << + CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_MODE__SHIFT; m->cp_mqd_stride_size = kfd_hiq_mqd_stride(mm->dev); if (xcc == 0) { /* Set no_update_rptr = 0 in Master XCC */ @@ -607,6 +620,25 @@ static int destroy_hiq_mqd_v9_4_3(struct mqd_manager *mm, void *mqd, return err; } +static bool check_preemption_failed_v9_4_3(struct mqd_manager *mm, void *mqd) +{ + uint64_t hiq_mqd_size = kfd_hiq_mqd_stride(mm->dev); + uint32_t xcc_mask = mm->dev->xcc_mask; + int inst = 0, xcc_id; + struct v9_mqd *m; + bool ret = false; + + for_each_inst(xcc_id, xcc_mask) { + m = get_mqd(mqd + hiq_mqd_size * inst); + ret |= kfd_check_hiq_mqd_doorbell_id(mm->dev, + m->queue_doorbell_id0, inst); + m->queue_doorbell_id0 = 0; + ++inst; + } + + return ret; +} + static void get_xcc_mqd(struct kfd_mem_obj *mqd_mem_obj, struct kfd_mem_obj *xcc_mqd_mem_obj, uint64_t offset) @@ -695,7 +727,7 @@ static void update_mqd_v9_4_3(struct mqd_manager *mm, void *mqd, m = get_mqd(mqd + size * xcc); update_mqd(mm, m, q, minfo); - update_cu_mask(mm, mqd, minfo, xcc); + update_cu_mask(mm, m, minfo, xcc); if (q->format == KFD_QUEUE_FORMAT_AQL) { switch (xcc) { @@ -857,7 +889,9 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type, #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd; #endif - if (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 3)) { + if (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 3) || + KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 4) || + KFD_GC_VERSION(dev) == IP_VERSION(9, 5, 0)) { mqd->init_mqd = init_mqd_v9_4_3; mqd->load_mqd = load_mqd_v9_4_3; mqd->update_mqd = update_mqd_v9_4_3; @@ -881,15 +915,19 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type, #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd; #endif - mqd->read_doorbell_id = read_doorbell_id; - if (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 3)) { + mqd->check_preemption_failed = check_preemption_failed; + if (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 3) || + KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 4) || + KFD_GC_VERSION(dev) == IP_VERSION(9, 5, 0)) { mqd->init_mqd = init_mqd_hiq_v9_4_3; mqd->load_mqd = hiq_load_mqd_kiq_v9_4_3; mqd->destroy_mqd = destroy_hiq_mqd_v9_4_3; + mqd->check_preemption_failed = check_preemption_failed_v9_4_3; } else { mqd->init_mqd = init_mqd_hiq; mqd->load_mqd = kfd_hiq_load_mqd_kiq; mqd->destroy_mqd = destroy_hiq_mqd; + mqd->check_preemption_failed = check_preemption_failed; } break; case KFD_MQD_TYPE_DIQ: diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c index 3e1a574d4ea6..c1fafc502515 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c @@ -237,11 +237,11 @@ static void __update_mqd(struct mqd_manager *mm, void *mqd, q->is_active = QUEUE_IS_ACTIVE(*q); } -static uint32_t read_doorbell_id(void *mqd) +static bool check_preemption_failed(struct mqd_manager *mm, void *mqd) { struct vi_mqd *m = (struct vi_mqd *)mqd; - return m->queue_doorbell_id0; + return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0); } static void update_mqd(struct mqd_manager *mm, void *mqd, @@ -482,7 +482,7 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd; #endif - mqd->read_doorbell_id = read_doorbell_id; + mqd->check_preemption_failed = check_preemption_failed; break; case KFD_MQD_TYPE_DIQ: mqd->allocate_mqd = allocate_mqd; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c index 401096c103b2..4984b41cd372 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c @@ -28,6 +28,10 @@ #include "kfd_kernel_queue.h" #include "kfd_priv.h" +#define OVER_SUBSCRIPTION_PROCESS_COUNT (1 << 0) +#define OVER_SUBSCRIPTION_COMPUTE_QUEUE_COUNT (1 << 1) +#define OVER_SUBSCRIPTION_GWS_QUEUE_COUNT (1 << 2) + static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes, unsigned int buffer_size_bytes) { @@ -40,12 +44,13 @@ static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes, static void pm_calc_rlib_size(struct packet_manager *pm, unsigned int *rlib_size, - bool *over_subscription) + int *over_subscription) { unsigned int process_count, queue_count, compute_queue_count, gws_queue_count; unsigned int map_queue_size; unsigned int max_proc_per_quantum = 1; - struct kfd_node *dev = pm->dqm->dev; + struct kfd_node *node = pm->dqm->dev; + struct device *dev = node->adev->dev; process_count = pm->dqm->processes_count; queue_count = pm->dqm->active_queue_count; @@ -57,17 +62,20 @@ static void pm_calc_rlib_size(struct packet_manager *pm, * hws_max_conc_proc has been done in * kgd2kfd_device_init(). */ - *over_subscription = false; + *over_subscription = 0; - if (dev->max_proc_per_quantum > 1) - max_proc_per_quantum = dev->max_proc_per_quantum; + if (node->max_proc_per_quantum > 1) + max_proc_per_quantum = node->max_proc_per_quantum; - if ((process_count > max_proc_per_quantum) || - compute_queue_count > get_cp_queues_num(pm->dqm) || - gws_queue_count > 1) { - *over_subscription = true; - pr_debug("Over subscribed runlist\n"); - } + if (process_count > max_proc_per_quantum) + *over_subscription |= OVER_SUBSCRIPTION_PROCESS_COUNT; + if (compute_queue_count > get_cp_queues_num(pm->dqm)) + *over_subscription |= OVER_SUBSCRIPTION_COMPUTE_QUEUE_COUNT; + if (gws_queue_count > 1) + *over_subscription |= OVER_SUBSCRIPTION_GWS_QUEUE_COUNT; + + if (*over_subscription) + dev_dbg(dev, "Over subscribed runlist\n"); map_queue_size = pm->pmf->map_queues_size; /* calculate run list ib allocation size */ @@ -81,15 +89,17 @@ static void pm_calc_rlib_size(struct packet_manager *pm, if (*over_subscription) *rlib_size += pm->pmf->runlist_size; - pr_debug("runlist ib size %d\n", *rlib_size); + dev_dbg(dev, "runlist ib size %d\n", *rlib_size); } static int pm_allocate_runlist_ib(struct packet_manager *pm, unsigned int **rl_buffer, uint64_t *rl_gpu_buffer, unsigned int *rl_buffer_size, - bool *is_over_subscription) + int *is_over_subscription) { + struct kfd_node *node = pm->dqm->dev; + struct device *dev = node->adev->dev; int retval; if (WARN_ON(pm->allocated)) @@ -99,11 +109,10 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm, mutex_lock(&pm->lock); - retval = kfd_gtt_sa_allocate(pm->dqm->dev, *rl_buffer_size, - &pm->ib_buffer_obj); + retval = kfd_gtt_sa_allocate(node, *rl_buffer_size, &pm->ib_buffer_obj); if (retval) { - pr_err("Failed to allocate runlist IB\n"); + dev_err(dev, "Failed to allocate runlist IB\n"); goto out; } @@ -125,12 +134,14 @@ static int pm_create_runlist_ib(struct packet_manager *pm, { unsigned int alloc_size_bytes; unsigned int *rl_buffer, rl_wptr, i; + struct kfd_node *node = pm->dqm->dev; + struct device *dev = node->adev->dev; int retval, processes_mapped; struct device_process_node *cur; struct qcm_process_device *qpd; struct queue *q; struct kernel_queue *kq; - bool is_over_subscription; + int is_over_subscription; rl_wptr = retval = processes_mapped = 0; @@ -142,7 +153,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm, *rl_size_bytes = alloc_size_bytes; pm->ib_size_bytes = alloc_size_bytes; - pr_debug("Building runlist ib process count: %d queues count %d\n", + dev_dbg(dev, "Building runlist ib process count: %d queues count %d\n", pm->dqm->processes_count, pm->dqm->active_queue_count); /* build the run list ib packet */ @@ -150,7 +161,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm, qpd = cur->qpd; /* build map process packet */ if (processes_mapped >= pm->dqm->processes_count) { - pr_debug("Not enough space left in runlist IB\n"); + dev_dbg(dev, "Not enough space left in runlist IB\n"); pm_release_ib(pm); return -ENOMEM; } @@ -167,7 +178,8 @@ static int pm_create_runlist_ib(struct packet_manager *pm, if (!kq->queue->properties.is_active) continue; - pr_debug("static_queue, mapping kernel q %d, is debug status %d\n", + dev_dbg(dev, + "static_queue, mapping kernel q %d, is debug status %d\n", kq->queue->queue, qpd->is_debug); retval = pm->pmf->map_queues(pm, @@ -186,7 +198,8 @@ static int pm_create_runlist_ib(struct packet_manager *pm, if (!q->properties.is_active) continue; - pr_debug("static_queue, mapping user queue %d, is debug status %d\n", + dev_dbg(dev, + "static_queue, mapping user queue %d, is debug status %d\n", q->queue, qpd->is_debug); retval = pm->pmf->map_queues(pm, @@ -203,17 +216,24 @@ static int pm_create_runlist_ib(struct packet_manager *pm, } } - pr_debug("Finished map process and queues to runlist\n"); + dev_dbg(dev, "Finished map process and queues to runlist\n"); if (is_over_subscription) { if (!pm->is_over_subscription) - pr_warn("Runlist is getting oversubscribed. Expect reduced ROCm performance.\n"); + dev_warn(dev, "Runlist is getting oversubscribed due to%s%s%s. Expect reduced ROCm performance.\n", + is_over_subscription & OVER_SUBSCRIPTION_PROCESS_COUNT ? + " too many processes." : "", + is_over_subscription & OVER_SUBSCRIPTION_COMPUTE_QUEUE_COUNT ? + " too many queues." : "", + is_over_subscription & OVER_SUBSCRIPTION_GWS_QUEUE_COUNT ? + " multiple processes using cooperative launch." : ""); + retval = pm->pmf->runlist(pm, &rl_buffer[rl_wptr], *rl_gpu_addr, alloc_size_bytes / sizeof(uint32_t), true); } - pm->is_over_subscription = is_over_subscription; + pm->is_over_subscription = !!is_over_subscription; for (i = 0; i < alloc_size_bytes / sizeof(uint32_t); i++) pr_debug("0x%2X ", rl_buffer[i]); @@ -239,7 +259,9 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm) break; default: if (KFD_GC_VERSION(dqm->dev) == IP_VERSION(9, 4, 2) || - KFD_GC_VERSION(dqm->dev) == IP_VERSION(9, 4, 3)) + KFD_GC_VERSION(dqm->dev) == IP_VERSION(9, 4, 3) || + KFD_GC_VERSION(dqm->dev) == IP_VERSION(9, 4, 4) || + KFD_GC_VERSION(dqm->dev) == IP_VERSION(9, 5, 0)) pm->pmf = &kfd_aldebaran_pm_funcs; else if (KFD_GC_VERSION(dqm->dev) >= IP_VERSION(9, 0, 1)) pm->pmf = &kfd_v9_pm_funcs; @@ -262,16 +284,18 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm) return 0; } -void pm_uninit(struct packet_manager *pm, bool hanging) +void pm_uninit(struct packet_manager *pm) { mutex_destroy(&pm->lock); - kernel_queue_uninit(pm->priv_queue, hanging); + kernel_queue_uninit(pm->priv_queue); pm->priv_queue = NULL; } int pm_send_set_resources(struct packet_manager *pm, struct scheduling_resources *res) { + struct kfd_node *node = pm->dqm->dev; + struct device *dev = node->adev->dev; uint32_t *buffer, size; int retval = 0; @@ -281,14 +305,14 @@ int pm_send_set_resources(struct packet_manager *pm, size / sizeof(uint32_t), (unsigned int **)&buffer); if (!buffer) { - pr_err("Failed to allocate buffer on kernel queue\n"); + dev_err(dev, "Failed to allocate buffer on kernel queue\n"); retval = -ENOMEM; goto out; } retval = pm->pmf->set_resources(pm, buffer, res); if (!retval) - kq_submit_packet(pm->priv_queue); + retval = kq_submit_packet(pm->priv_queue); else kq_rollback_packet(pm->priv_queue); @@ -325,7 +349,7 @@ int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues) if (retval) goto fail_create_runlist; - kq_submit_packet(pm->priv_queue); + retval = kq_submit_packet(pm->priv_queue); mutex_unlock(&pm->lock); @@ -343,6 +367,8 @@ fail_create_runlist_ib: int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address, uint64_t fence_value) { + struct kfd_node *node = pm->dqm->dev; + struct device *dev = node->adev->dev; uint32_t *buffer, size; int retval = 0; @@ -354,14 +380,14 @@ int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address, kq_acquire_packet_buffer(pm->priv_queue, size / sizeof(uint32_t), (unsigned int **)&buffer); if (!buffer) { - pr_err("Failed to allocate buffer on kernel queue\n"); + dev_err(dev, "Failed to allocate buffer on kernel queue\n"); retval = -ENOMEM; goto out; } retval = pm->pmf->query_status(pm, buffer, fence_address, fence_value); if (!retval) - kq_submit_packet(pm->priv_queue); + retval = kq_submit_packet(pm->priv_queue); else kq_rollback_packet(pm->priv_queue); @@ -372,6 +398,8 @@ out: int pm_update_grace_period(struct packet_manager *pm, uint32_t grace_period) { + struct kfd_node *node = pm->dqm->dev; + struct device *dev = node->adev->dev; int retval = 0; uint32_t *buffer, size; @@ -385,14 +413,15 @@ int pm_update_grace_period(struct packet_manager *pm, uint32_t grace_period) (unsigned int **)&buffer); if (!buffer) { - pr_err("Failed to allocate buffer on kernel queue\n"); + dev_err(dev, + "Failed to allocate buffer on kernel queue\n"); retval = -ENOMEM; goto out; } retval = pm->pmf->set_grace_period(pm, buffer, grace_period); if (!retval) - kq_submit_packet(pm->priv_queue); + retval = kq_submit_packet(pm->priv_queue); else kq_rollback_packet(pm->priv_queue); } @@ -406,6 +435,8 @@ int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_unmap_queues_filter filter, uint32_t filter_param, bool reset) { + struct kfd_node *node = pm->dqm->dev; + struct device *dev = node->adev->dev; uint32_t *buffer, size; int retval = 0; @@ -414,14 +445,14 @@ int pm_send_unmap_queue(struct packet_manager *pm, kq_acquire_packet_buffer(pm->priv_queue, size / sizeof(uint32_t), (unsigned int **)&buffer); if (!buffer) { - pr_err("Failed to allocate buffer on kernel queue\n"); + dev_err(dev, "Failed to allocate buffer on kernel queue\n"); retval = -ENOMEM; goto out; } retval = pm->pmf->unmap_queues(pm, buffer, filter, filter_param, reset); if (!retval) - kq_submit_packet(pm->priv_queue); + retval = kq_submit_packet(pm->priv_queue); else kq_rollback_packet(pm->priv_queue); @@ -463,6 +494,8 @@ out: int pm_debugfs_hang_hws(struct packet_manager *pm) { + struct kfd_node *node = pm->dqm->dev; + struct device *dev = node->adev->dev; uint32_t *buffer, size; int r = 0; @@ -474,16 +507,16 @@ int pm_debugfs_hang_hws(struct packet_manager *pm) kq_acquire_packet_buffer(pm->priv_queue, size / sizeof(uint32_t), (unsigned int **)&buffer); if (!buffer) { - pr_err("Failed to allocate buffer on kernel queue\n"); + dev_err(dev, "Failed to allocate buffer on kernel queue\n"); r = -ENOMEM; goto out; } memset(buffer, 0x55, size); kq_submit_packet(pm->priv_queue); - pr_info("Submitting %x %x %x %x %x %x %x to HIQ to hang the HWS.", - buffer[0], buffer[1], buffer[2], buffer[3], - buffer[4], buffer[5], buffer[6]); + dev_info(dev, "Submitting %x %x %x %x %x %x %x to HIQ to hang the HWS.", + buffer[0], buffer[1], buffer[2], buffer[3], buffer[4], + buffer[5], buffer[6]); out: mutex_unlock(&pm->lock); return r; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c index 8ee2bedd301a..1f9f5bfeaf86 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c @@ -37,11 +37,14 @@ static int pm_map_process_v9(struct packet_manager *pm, struct kfd_node *kfd = pm->dqm->dev; struct kfd_process_device *pdd = container_of(qpd, struct kfd_process_device, qpd); + struct amdgpu_device *adev = kfd->adev; packet = (struct pm4_mes_map_process *)buffer; memset(buffer, 0, sizeof(struct pm4_mes_map_process)); packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS, sizeof(struct pm4_mes_map_process)); + if (adev->enforce_isolation[kfd->node_id]) + packet->bitfields2.exec_cleaner_shader = 1; packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; packet->bitfields2.process_quantum = 10; packet->bitfields2.pasid = qpd->pqm->process->pasid; @@ -89,14 +92,18 @@ static int pm_map_process_aldebaran(struct packet_manager *pm, struct pm4_mes_map_process_aldebaran *packet; uint64_t vm_page_table_base_addr = qpd->page_table_base; struct kfd_dev *kfd = pm->dqm->dev->kfd; + struct kfd_node *knode = pm->dqm->dev; struct kfd_process_device *pdd = container_of(qpd, struct kfd_process_device, qpd); int i; + struct amdgpu_device *adev = kfd->adev; packet = (struct pm4_mes_map_process_aldebaran *)buffer; memset(buffer, 0, sizeof(struct pm4_mes_map_process_aldebaran)); packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS, sizeof(struct pm4_mes_map_process_aldebaran)); + if (adev->enforce_isolation[knode->node_id]) + packet->bitfields2.exec_cleaner_shader = 1; packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; packet->bitfields2.process_quantum = 10; packet->bitfields2.pasid = qpd->pqm->process->pasid; @@ -144,17 +151,22 @@ static int pm_runlist_v9(struct packet_manager *pm, uint32_t *buffer, int concurrent_proc_cnt = 0; struct kfd_node *kfd = pm->dqm->dev; + struct amdgpu_device *adev = kfd->adev; /* Determine the number of processes to map together to HW: * it can not exceed the number of VMIDs available to the * scheduler, and it is determined by the smaller of the number * of processes in the runlist and kfd module parameter * hws_max_conc_proc. + * However, if enforce_isolation is set (toggle LDS/VGPRs/SGPRs + * cleaner between process switch), enable single-process mode + * in HWS. * Note: the arbitration between the number of VMIDs and * hws_max_conc_proc has been done in * kgd2kfd_device_init(). */ - concurrent_proc_cnt = min(pm->dqm->processes_count, + concurrent_proc_cnt = adev->enforce_isolation[kfd->node_id] ? + 1 : min(pm->dqm->processes_count, kfd->max_proc_per_quantum); packet = (struct pm4_mes_runlist *)buffer; @@ -213,7 +225,6 @@ static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer, struct queue *q, bool is_static) { struct pm4_mes_map_queues *packet; - bool use_static = is_static; packet = (struct pm4_mes_map_queues *)buffer; memset(buffer, 0, sizeof(struct pm4_mes_map_queues)); @@ -234,7 +245,7 @@ static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer, switch (q->properties.type) { case KFD_QUEUE_TYPE_COMPUTE: - if (use_static) + if (is_static) packet->bitfields2.queue_type = queue_type__mes_map_queues__normal_latency_static_queue_vi; break; @@ -244,7 +255,6 @@ static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer, break; case KFD_QUEUE_TYPE_SDMA: case KFD_QUEUE_TYPE_SDMA_XGMI: - use_static = false; /* no static queues under SDMA */ if (q->properties.sdma_engine_id < 2 && !pm_use_ext_eng(q->device->kfd)) packet->bitfields2.engine_sel = q->properties.sdma_engine_id + diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h index 8b6b2bd5c148..cd8611401a66 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h @@ -145,8 +145,9 @@ struct pm4_mes_map_process { union { struct { - uint32_t pasid:16; - uint32_t reserved1:2; + uint32_t pasid:16; /* 0 - 15 */ + uint32_t reserved1:1; /* 16 */ + uint32_t exec_cleaner_shader:1; /* 17 */ uint32_t debug_vmid:4; uint32_t new_debug:1; uint32_t reserved2:1; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_aldebaran.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_aldebaran.h index 38f5cb6a222a..e0ed62c4ade0 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_aldebaran.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_aldebaran.h @@ -37,7 +37,7 @@ struct pm4_mes_map_process_aldebaran { struct { uint32_t pasid:16; /* 0 - 15 */ uint32_t single_memops:1; /* 16 */ - uint32_t reserved1:1; /* 17 */ + uint32_t exec_cleaner_shader:1; /* 17 */ uint32_t debug_vmid:4; /* 18 - 21 */ uint32_t new_debug:1; /* 22 */ uint32_t tmz:1; /* 23 */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 80320b8603fc..d8cd913aa772 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -32,7 +32,7 @@ #include <linux/atomic.h> #include <linux/workqueue.h> #include <linux/spinlock.h> -#include <linux/kfd_ioctl.h> +#include <uapi/linux/kfd_ioctl.h> #include <linux/idr.h> #include <linux/kfifo.h> #include <linux/seq_file.h> @@ -99,11 +99,11 @@ /* * Size of the per-process TBA+TMA buffer: 2 pages * - * The first page is the TBA used for the CWSR ISA code. The second - * page is used as TMA for user-mode trap handler setup in daisy-chain mode. + * The first chunk is the TBA used for the CWSR ISA code. The second + * chunk is used as TMA for user-mode trap handler setup in daisy-chain mode. */ #define KFD_CWSR_TBA_TMA_SIZE (PAGE_SIZE * 2) -#define KFD_CWSR_TMA_OFFSET PAGE_SIZE +#define KFD_CWSR_TMA_OFFSET (PAGE_SIZE + 2048) #define KFD_MAX_NUM_OF_QUEUES_PER_DEVICE \ (KFD_MAX_NUM_OF_PROCESSES * \ @@ -206,7 +206,9 @@ enum cache_policy { #define KFD_IS_SOC15(dev) ((KFD_GC_VERSION(dev)) >= (IP_VERSION(9, 0, 1))) #define KFD_SUPPORT_XNACK_PER_PROCESS(dev)\ ((KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 2)) || \ - (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 3))) + (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 3)) || \ + (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 4)) || \ + (KFD_GC_VERSION(dev) == IP_VERSION(9, 5, 0))) struct kfd_node; @@ -272,7 +274,6 @@ struct kfd_node { /* Interrupts */ struct kfifo ih_fifo; - struct workqueue_struct *ih_wq; struct work_struct interrupt_work; spinlock_t interrupt_lock; @@ -309,6 +310,10 @@ struct kfd_node { struct kfd_local_mem_info local_mem_info; struct kfd_dev *kfd; + + /* Track per device allocated watch points */ + uint32_t alloc_watch_ids; + spinlock_t watch_points_lock; }; struct kfd_dev { @@ -361,9 +366,7 @@ struct kfd_dev { struct kfd_node *nodes[MAX_KFD_NODES]; unsigned int num_nodes; - /* Track per device allocated watch points */ - uint32_t alloc_watch_ids; - spinlock_t watch_points_lock; + struct workqueue_struct *ih_wq; /* Kernel doorbells for KFD device */ struct amdgpu_bo *doorbells; @@ -413,13 +416,16 @@ enum kfd_unmap_queues_filter { * @KFD_QUEUE_TYPE_DIQ: DIQ queue type. * * @KFD_QUEUE_TYPE_SDMA_XGMI: Special SDMA queue for XGMI interface. + * + * @KFD_QUEUE_TYPE_SDMA_BY_ENG_ID: SDMA user mode queue with target SDMA engine ID. */ enum kfd_queue_type { KFD_QUEUE_TYPE_COMPUTE, KFD_QUEUE_TYPE_SDMA, KFD_QUEUE_TYPE_HIQ, KFD_QUEUE_TYPE_DIQ, - KFD_QUEUE_TYPE_SDMA_XGMI + KFD_QUEUE_TYPE_SDMA_XGMI, + KFD_QUEUE_TYPE_SDMA_BY_ENG_ID }; enum kfd_queue_format { @@ -493,8 +499,8 @@ struct queue_properties { uint64_t queue_size; uint32_t priority; uint32_t queue_percent; - uint32_t *read_ptr; - uint32_t *write_ptr; + void __user *read_ptr; + void __user *write_ptr; void __iomem *doorbell_ptr; uint32_t doorbell_off; bool is_interop; @@ -521,6 +527,12 @@ struct queue_properties { uint64_t tba_addr; uint64_t tma_addr; uint64_t exception_status; + + struct amdgpu_bo *wptr_bo; + struct amdgpu_bo *rptr_bo; + struct amdgpu_bo *ring_bo; + struct amdgpu_bo *eop_buf_bo; + struct amdgpu_bo *cwsr_bo; }; #define QUEUE_IS_ACTIVE(q) ((q).queue_size > 0 && \ @@ -603,7 +615,7 @@ struct queue { uint64_t gang_ctx_gpu_addr; void *gang_ctx_cpu_ptr; - struct amdgpu_bo *wptr_bo; + struct amdgpu_bo *wptr_bo_gart; }; enum KFD_MQD_TYPE { @@ -765,7 +777,7 @@ struct kfd_process_device { enum kfd_pdd_bound bound; /* VRAM usage */ - uint64_t vram_usage; + atomic64_t vram_usage; struct attribute attr_vram; char vram_filename[MAX_SYSFS_FILENAME_LEN]; @@ -836,6 +848,9 @@ struct kfd_process_device { void *proc_ctx_bo; uint64_t proc_ctx_gpu_addr; void *proc_ctx_cpu_ptr; + + /* Tracks queue reset status */ + bool has_reset_queue; }; #define qpd_to_pdd(x) container_of(x, struct kfd_process_device, qpd) @@ -853,6 +868,14 @@ struct svm_range_list { struct delayed_work restore_work; DECLARE_BITMAP(bitmap_supported, MAX_GPU_INSTANCE); struct task_struct *faulting_task; + /* check point ts decides if page fault recovery need be dropped */ + uint64_t checkpoint_ts[MAX_GPU_INSTANCE]; + + /* Default granularity to use in buffer migration + * and restoration of backing memory while handling + * recoverable page faults + */ + uint8_t default_granularity; }; /* Process data */ @@ -981,6 +1004,9 @@ struct kfd_process { struct semaphore runtime_enable_sema; bool is_runtime_retry; struct kfd_runtime_info runtime_info; + + /* if gpu page fault sent to KFD */ + bool gpu_page_fault; }; #define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */ @@ -1128,7 +1154,9 @@ static inline struct kfd_node *kfd_node_by_irq_ids(struct amdgpu_device *adev, struct kfd_dev *dev = adev->kfd.dev; uint32_t i; - if (KFD_GC_VERSION(dev) != IP_VERSION(9, 4, 3)) + if (KFD_GC_VERSION(dev) != IP_VERSION(9, 4, 3) && + KFD_GC_VERSION(dev) != IP_VERSION(9, 4, 4) && + KFD_GC_VERSION(dev) != IP_VERSION(9, 5, 0)) return dev->nodes[0]; for (i = 0; i < dev->num_nodes; i++) @@ -1282,6 +1310,15 @@ int init_queue(struct queue **q, const struct queue_properties *properties); void uninit_queue(struct queue *q); void print_queue_properties(struct queue_properties *q); void print_queue(struct queue *q); +int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct amdgpu_bo **pbo, + u64 expected_size); +void kfd_queue_buffer_put(struct amdgpu_bo **bo); +int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_properties *properties); +int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_properties *properties); +void kfd_queue_unref_bo_va(struct amdgpu_vm *vm, struct amdgpu_bo **bo); +int kfd_queue_unref_bo_vas(struct kfd_process_device *pdd, + struct queue_properties *properties); +void kfd_queue_ctx_save_restore_size(struct kfd_topology_device *dev); struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, struct kfd_node *dev); @@ -1293,12 +1330,15 @@ struct mqd_manager *mqd_manager_init_v10(enum KFD_MQD_TYPE type, struct kfd_node *dev); struct mqd_manager *mqd_manager_init_v11(enum KFD_MQD_TYPE type, struct kfd_node *dev); +struct mqd_manager *mqd_manager_init_v12(enum KFD_MQD_TYPE type, + struct kfd_node *dev); struct device_queue_manager *device_queue_manager_init(struct kfd_node *dev); void device_queue_manager_uninit(struct device_queue_manager *dqm); struct kernel_queue *kernel_queue_init(struct kfd_node *dev, enum kfd_queue_type type); -void kernel_queue_uninit(struct kernel_queue *kq, bool hanging); +void kernel_queue_uninit(struct kernel_queue *kq); int kfd_dqm_evict_pasid(struct device_queue_manager *dqm, u32 pasid); +int kfd_dqm_suspend_bad_queue_mes(struct kfd_node *knode, u32 pasid, u32 doorbell_id); /* Process Queue Manager */ struct process_queue_node { @@ -1313,10 +1353,8 @@ int pqm_init(struct process_queue_manager *pqm, struct kfd_process *p); void pqm_uninit(struct process_queue_manager *pqm); int pqm_create_queue(struct process_queue_manager *pqm, struct kfd_node *dev, - struct file *f, struct queue_properties *properties, unsigned int *qid, - struct amdgpu_bo *wptr_bo, const struct kfd_criu_queue_priv_data *q_data, const void *restore_mqd, const void *restore_ctl_stack, @@ -1403,7 +1441,7 @@ extern const struct packet_manager_funcs kfd_v9_pm_funcs; extern const struct packet_manager_funcs kfd_aldebaran_pm_funcs; int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm); -void pm_uninit(struct packet_manager *pm, bool hanging); +void pm_uninit(struct packet_manager *pm); int pm_send_set_resources(struct packet_manager *pm, struct scheduling_resources *res); int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues); @@ -1473,7 +1511,7 @@ static inline void kfd_flush_tlb(struct kfd_process_device *pdd, static inline bool kfd_flush_tlb_after_unmap(struct kfd_dev *dev) { - return KFD_GC_VERSION(dev) > IP_VERSION(9, 4, 2) || + return KFD_GC_VERSION(dev) >= IP_VERSION(9, 4, 2) || (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 1) && dev->sdma_fw_version >= 18) || KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 0); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 717a60d7a4ea..083f83c94531 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -270,6 +270,9 @@ static int kfd_get_cu_occupancy(struct attribute *attr, char *buffer) struct kfd_node *dev = NULL; struct kfd_process *proc = NULL; struct kfd_process_device *pdd = NULL; + int i; + struct kfd_cu_occupancy *cu_occupancy; + u32 queue_format; pdd = container_of(attr, struct kfd_process_device, attr_cu_occupancy); dev = pdd->dev; @@ -287,11 +290,37 @@ static int kfd_get_cu_occupancy(struct attribute *attr, char *buffer) /* Collect wave count from device if it supports */ wave_cnt = 0; max_waves_per_cu = 0; - dev->kfd2kgd->get_cu_occupancy(dev->adev, proc->pasid, &wave_cnt, - &max_waves_per_cu, 0); + + cu_occupancy = kcalloc(AMDGPU_MAX_QUEUES, sizeof(*cu_occupancy), GFP_KERNEL); + if (!cu_occupancy) + return -ENOMEM; + + /* + * For GFX 9.4.3, fetch the CU occupancy from the first XCC in the partition. + * For AQL queues, because of cooperative dispatch we multiply the wave count + * by number of XCCs in the partition to get the total wave counts across all + * XCCs in the partition. + * For PM4 queues, there is no cooperative dispatch so wave_cnt stay as it is. + */ + dev->kfd2kgd->get_cu_occupancy(dev->adev, cu_occupancy, + &max_waves_per_cu, ffs(dev->xcc_mask) - 1); + + for (i = 0; i < AMDGPU_MAX_QUEUES; i++) { + if (cu_occupancy[i].wave_cnt != 0 && + kfd_dqm_is_queue_in_process(dev->dqm, &pdd->qpd, + cu_occupancy[i].doorbell_off, + &queue_format)) { + if (unlikely(queue_format == KFD_QUEUE_FORMAT_PM4)) + wave_cnt += cu_occupancy[i].wave_cnt; + else + wave_cnt += (NUM_XCC(dev->xcc_mask) * + cu_occupancy[i].wave_cnt); + } + } /* Translate wave count to number of compute units */ cu_cnt = (wave_cnt + (max_waves_per_cu - 1)) / max_waves_per_cu; + kfree(cu_occupancy); return snprintf(buffer, PAGE_SIZE, "%d\n", cu_cnt); } @@ -306,14 +335,14 @@ static ssize_t kfd_procfs_show(struct kobject *kobj, struct attribute *attr, } else if (strncmp(attr->name, "vram_", 5) == 0) { struct kfd_process_device *pdd = container_of(attr, struct kfd_process_device, attr_vram); - return snprintf(buffer, PAGE_SIZE, "%llu\n", READ_ONCE(pdd->vram_usage)); + return snprintf(buffer, PAGE_SIZE, "%llu\n", atomic64_read(&pdd->vram_usage)); } else if (strncmp(attr->name, "sdma_", 5) == 0) { struct kfd_process_device *pdd = container_of(attr, struct kfd_process_device, attr_sdma); struct kfd_sdma_activity_handler_workarea sdma_activity_work_handler; - INIT_WORK(&sdma_activity_work_handler.sdma_activity_work, - kfd_sdma_activity_worker); + INIT_WORK_ONSTACK(&sdma_activity_work_handler.sdma_activity_work, + kfd_sdma_activity_worker); sdma_activity_work_handler.pdd = pdd; sdma_activity_work_handler.sdma_activity_counter = 0; @@ -321,6 +350,7 @@ static ssize_t kfd_procfs_show(struct kobject *kobj, struct attribute *attr, schedule_work(&sdma_activity_work_handler.sdma_activity_work); flush_work(&sdma_activity_work_handler.sdma_activity_work); + destroy_work_on_stack(&sdma_activity_work_handler.sdma_activity_work); return snprintf(buffer, PAGE_SIZE, "%llu\n", (sdma_activity_work_handler.sdma_activity_counter)/ @@ -819,16 +849,26 @@ struct kfd_process *kfd_create_process(struct task_struct *thread) mutex_lock(&kfd_processes_mutex); if (kfd_is_locked()) { - mutex_unlock(&kfd_processes_mutex); pr_debug("KFD is locked! Cannot create process"); - return ERR_PTR(-EINVAL); + process = ERR_PTR(-EINVAL); + goto out; } - /* A prior open of /dev/kfd could have already created the process. */ - process = find_process(thread, false); + /* A prior open of /dev/kfd could have already created the process. + * find_process will increase process kref in this case + */ + process = find_process(thread, true); if (process) { pr_debug("Process already found\n"); } else { + /* If the process just called exec(3), it is possible that the + * cleanup of the kfd_process (following the release of the mm + * of the old process image) is still in the cleanup work queue. + * Make sure to drain any job before trying to recreate any + * resource for this process. + */ + flush_workqueue(kfd_process_wq); + process = create_process(thread); if (IS_ERR(process)) goto out; @@ -865,8 +905,6 @@ struct kfd_process *kfd_create_process(struct task_struct *thread) init_waitqueue_head(&process->wait_irq_drain); } out: - if (!IS_ERR(process)) - kref_get(&process->ref); mutex_unlock(&kfd_processes_mutex); mmput(thread->mm); @@ -1038,9 +1076,10 @@ static void kfd_process_destroy_pdds(struct kfd_process *p) kfd_free_process_doorbells(pdd->dev->kfd, pdd); - if (pdd->dev->kfd->shared_resources.enable_mes) + if (pdd->dev->kfd->shared_resources.enable_mes && + pdd->proc_ctx_cpu_ptr) amdgpu_amdkfd_free_gtt_mem(pdd->dev->adev, - pdd->proc_ctx_bo); + &pdd->proc_ctx_bo); /* * before destroying pdd, make sure to report availability * for auto suspend @@ -1121,7 +1160,8 @@ static void kfd_process_wq_release(struct work_struct *work) */ synchronize_rcu(); ef = rcu_access_pointer(p->ef); - dma_fence_signal(ef); + if (ef) + dma_fence_signal(ef); kfd_process_remove_sysfs(p); @@ -1152,10 +1192,8 @@ static void kfd_process_ref_release(struct kref *ref) static struct mmu_notifier *kfd_process_alloc_notifier(struct mm_struct *mm) { - int idx = srcu_read_lock(&kfd_processes_srcu); - struct kfd_process *p = find_process_by_mm(mm); - - srcu_read_unlock(&kfd_processes_srcu, idx); + /* This increments p->ref counter if kfd process p exists */ + struct kfd_process *p = kfd_lookup_process_by_mm(mm); return p ? &p->mmu_notifier : ERR_PTR(-ESRCH); } @@ -1303,7 +1341,8 @@ int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep) if (IS_ERR_VALUE(qpd->tba_addr)) { int err = qpd->tba_addr; - pr_err("Failure to set tba address. error %d.\n", err); + dev_err(dev->adev->dev, + "Failure to set tba address. error %d.\n", err); qpd->tba_addr = 0; qpd->cwsr_kaddr = NULL; return err; @@ -1571,7 +1610,6 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_node *dev, struct kfd_process *p) { struct kfd_process_device *pdd = NULL; - int retval = 0; if (WARN_ON_ONCE(p->n_pdds >= MAX_GPU_INSTANCE)) return NULL; @@ -1590,25 +1628,11 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_node *dev, pdd->bound = PDD_UNBOUND; pdd->already_dequeued = false; pdd->runtime_inuse = false; - pdd->vram_usage = 0; + atomic64_set(&pdd->vram_usage, 0); pdd->sdma_past_activity_counter = 0; pdd->user_gpu_id = dev->id; atomic64_set(&pdd->evict_duration_counter, 0); - if (dev->kfd->shared_resources.enable_mes) { - retval = amdgpu_amdkfd_alloc_gtt_mem(dev->adev, - AMDGPU_MES_PROC_CTX_SIZE, - &pdd->proc_ctx_bo, - &pdd->proc_ctx_gpu_addr, - &pdd->proc_ctx_cpu_ptr, - false); - if (retval) { - pr_err("failed to allocate process context bo\n"); - goto err_free_pdd; - } - memset(pdd->proc_ctx_cpu_ptr, 0, AMDGPU_MES_PROC_CTX_SIZE); - } - p->pdds[p->n_pdds++] = pdd; if (kfd_dbg_is_per_vmid_supported(pdd->dev)) pdd->spi_dbg_override = pdd->dev->kfd2kgd->disable_debug_trap( @@ -1620,10 +1644,6 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_node *dev, idr_init(&pdd->alloc_idr); return pdd; - -err_free_pdd: - kfree(pdd); - return NULL; } /** @@ -1666,12 +1686,15 @@ int kfd_process_device_init_vm(struct kfd_process_device *pdd, ret = amdgpu_amdkfd_gpuvm_acquire_process_vm(dev->adev, avm, &p->kgd_process_info, - &ef); + p->ef ? NULL : &ef); if (ret) { - pr_err("Failed to create process VM object\n"); + dev_err(dev->adev->dev, "Failed to create process VM object\n"); return ret; } - RCU_INIT_POINTER(p->ef, ef); + + if (!p->ef) + RCU_INIT_POINTER(p->ef, ef); + pdd->drm_priv = drm_file->private_data; ret = kfd_process_device_reserve_ib_mem(pdd); @@ -1715,7 +1738,7 @@ struct kfd_process_device *kfd_bind_process_to_device(struct kfd_node *dev, pdd = kfd_get_process_device_data(dev, p); if (!pdd) { - pr_err("Process device data doesn't exist\n"); + dev_err(dev->adev->dev, "Process device data doesn't exist\n"); return ERR_PTR(-ENOMEM); } @@ -1825,6 +1848,7 @@ int kfd_process_evict_queues(struct kfd_process *p, uint32_t trigger) for (i = 0; i < p->n_pdds; i++) { struct kfd_process_device *pdd = p->pdds[i]; + struct device *dev = pdd->dev->adev->dev; kfd_smi_event_queue_eviction(pdd->dev, p->lead_thread->pid, trigger); @@ -1836,10 +1860,12 @@ int kfd_process_evict_queues(struct kfd_process *p, uint32_t trigger) * them been add back since they actually not be saved right now. */ if (r && r != -EIO) { - pr_err("Failed to evict process queues\n"); + dev_err(dev, "Failed to evict process queues\n"); goto fail; } n_evicted++; + + pdd->dev->dqm->is_hws_hang = false; } return r; @@ -1858,7 +1884,8 @@ fail: if (pdd->dev->dqm->ops.restore_process_queues(pdd->dev->dqm, &pdd->qpd)) - pr_err("Failed to restore queues\n"); + dev_err(pdd->dev->adev->dev, + "Failed to restore queues\n"); n_evicted--; } @@ -1874,13 +1901,14 @@ int kfd_process_restore_queues(struct kfd_process *p) for (i = 0; i < p->n_pdds; i++) { struct kfd_process_device *pdd = p->pdds[i]; + struct device *dev = pdd->dev->adev->dev; kfd_smi_event_queue_restore(pdd->dev, p->lead_thread->pid); r = pdd->dev->dqm->ops.restore_process_queues(pdd->dev->dqm, &pdd->qpd); if (r) { - pr_err("Failed to restore process queues\n"); + dev_err(dev, "Failed to restore process queues\n"); if (!ret) ret = r; } @@ -1922,6 +1950,8 @@ static int signal_eviction_fence(struct kfd_process *p) rcu_read_lock(); ef = dma_fence_get_rcu_safe(&p->ef); rcu_read_unlock(); + if (!ef) + return -EINVAL; ret = dma_fence_signal(ef); dma_fence_put(ef); @@ -1949,10 +1979,9 @@ static void evict_process_worker(struct work_struct *work) * they are responsible stopping the queues and scheduling * the restore work. */ - if (!signal_eviction_fence(p)) - queue_delayed_work(kfd_restore_wq, &p->restore_work, - msecs_to_jiffies(PROCESS_RESTORE_TIME_MS)); - else + if (signal_eviction_fence(p) || + mod_delayed_work(kfd_restore_wq, &p->restore_work, + msecs_to_jiffies(PROCESS_RESTORE_TIME_MS))) kfd_process_restore_queues(p); pr_debug("Finished evicting pasid 0x%x\n", p->pasid); @@ -2011,9 +2040,9 @@ static void restore_process_worker(struct work_struct *work) if (ret) { pr_debug("Failed to restore BOs of pasid 0x%x, retry after %d ms\n", p->pasid, PROCESS_BACK_OFF_TIME_MS); - ret = queue_delayed_work(kfd_restore_wq, &p->restore_work, - msecs_to_jiffies(PROCESS_BACK_OFF_TIME_MS)); - WARN(!ret, "reschedule restore work failed\n"); + if (mod_delayed_work(kfd_restore_wq, &p->restore_work, + msecs_to_jiffies(PROCESS_RESTORE_TIME_MS))) + kfd_process_restore_queues(p); } } @@ -2056,7 +2085,7 @@ int kfd_reserved_mem_mmap(struct kfd_node *dev, struct kfd_process *process, struct qcm_process_device *qpd; if ((vma->vm_end - vma->vm_start) != KFD_CWSR_TBA_TMA_SIZE) { - pr_err("Incorrect CWSR mapping size.\n"); + dev_err(dev->adev->dev, "Incorrect CWSR mapping size.\n"); return -EINVAL; } @@ -2068,7 +2097,8 @@ int kfd_reserved_mem_mmap(struct kfd_node *dev, struct kfd_process *process, qpd->cwsr_kaddr = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(KFD_CWSR_TBA_TMA_SIZE)); if (!qpd->cwsr_kaddr) { - pr_err("Error allocating per process CWSR buffer.\n"); + dev_err(dev->adev->dev, + "Error allocating per process CWSR buffer.\n"); return -ENOMEM; } @@ -2098,9 +2128,11 @@ int kfd_process_drain_interrupts(struct kfd_process_device *pdd) irq_drain_fence[3] = pdd->process->pasid; /* - * For GFX 9.4.3, send the NodeId also in IH cookie DW[3] + * For GFX 9.4.3/9.5.0, send the NodeId also in IH cookie DW[3] */ - if (KFD_GC_VERSION(pdd->dev->kfd) == IP_VERSION(9, 4, 3)) { + if (KFD_GC_VERSION(pdd->dev->kfd) == IP_VERSION(9, 4, 3) || + KFD_GC_VERSION(pdd->dev->kfd) == IP_VERSION(9, 4, 4) || + KFD_GC_VERSION(pdd->dev->kfd) == IP_VERSION(9, 5, 0)) { node_id = ffs(pdd->dev->interrupt_bitmap) - 1; irq_drain_fence[3] |= node_id << 16; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index 4858112f9a53..bd36a75309e1 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -28,6 +28,7 @@ #include "kfd_priv.h" #include "kfd_kernel_queue.h" #include "amdgpu_amdkfd.h" +#include "amdgpu_reset.h" static inline struct process_queue_node *get_queue_by_qid( struct process_queue_manager *pqm, unsigned int qid) @@ -85,10 +86,17 @@ void kfd_process_dequeue_from_device(struct kfd_process_device *pdd) if (pdd->already_dequeued) return; - + /* The MES context flush needs to filter out the case which the + * KFD process is created without setting up the MES context and + * queue for creating a compute queue. + */ dev->dqm->ops.process_termination(dev->dqm, &pdd->qpd); - if (dev->kfd->shared_resources.enable_mes) - amdgpu_mes_flush_shader_debugger(dev->adev, pdd->proc_ctx_gpu_addr); + if (dev->kfd->shared_resources.enable_mes && !!pdd->proc_ctx_gpu_addr && + down_read_trylock(&dev->adev->reset_domain->sem)) { + amdgpu_mes_flush_shader_debugger(dev->adev, + pdd->proc_ctx_gpu_addr); + up_read(&dev->adev->reset_domain->sem); + } pdd->already_dequeued = true; } @@ -126,7 +134,10 @@ int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid, if (!gws && pdd->qpd.num_gws == 0) return -EINVAL; - if (KFD_GC_VERSION(dev) != IP_VERSION(9, 4, 3) && !dev->kfd->shared_resources.enable_mes) { + if ((KFD_GC_VERSION(dev) != IP_VERSION(9, 4, 3) && + KFD_GC_VERSION(dev) != IP_VERSION(9, 4, 4) && + KFD_GC_VERSION(dev) != IP_VERSION(9, 5, 0)) && + !dev->kfd->shared_resources.enable_mes) { if (gws) ret = amdgpu_amdkfd_add_gws_to_process(pdd->process->kgd_process_info, gws, &mem); @@ -189,6 +200,8 @@ static void pqm_clean_queue_resource(struct process_queue_manager *pqm, if (pqn->q->gws) { if (KFD_GC_VERSION(pqn->q->device) != IP_VERSION(9, 4, 3) && + KFD_GC_VERSION(pqn->q->device) != IP_VERSION(9, 4, 4) && + KFD_GC_VERSION(pqn->q->device) != IP_VERSION(9, 5, 0) && !dev->kfd->shared_resources.enable_mes) amdgpu_amdkfd_remove_gws_from_process( pqm->process->kgd_process_info, pqn->q->gws); @@ -196,9 +209,8 @@ static void pqm_clean_queue_resource(struct process_queue_manager *pqm, } if (dev->kfd->shared_resources.enable_mes) { - amdgpu_amdkfd_free_gtt_mem(dev->adev, pqn->q->gang_ctx_bo); - if (pqn->q->wptr_bo) - amdgpu_amdkfd_free_gtt_mem(dev->adev, pqn->q->wptr_bo); + amdgpu_amdkfd_free_gtt_mem(dev->adev, &pqn->q->gang_ctx_bo); + amdgpu_amdkfd_free_gtt_mem(dev->adev, (void **)&pqn->q->wptr_bo_gart); } } @@ -207,8 +219,17 @@ void pqm_uninit(struct process_queue_manager *pqm) struct process_queue_node *pqn, *next; list_for_each_entry_safe(pqn, next, &pqm->queues, process_queue_list) { - if (pqn->q) + if (pqn->q) { + struct kfd_process_device *pdd = kfd_get_process_device_data(pqn->q->device, + pqm->process); + if (pdd) { + kfd_queue_unref_bo_vas(pdd, &pqn->q->properties); + kfd_queue_release_buffers(pdd, &pqn->q->properties); + } else { + WARN_ON(!pdd); + } pqm_clean_queue_resource(pqm, pqn); + } kfd_procfs_del_queue(pqn->q); uninit_queue(pqn->q); @@ -223,7 +244,6 @@ void pqm_uninit(struct process_queue_manager *pqm) static int init_user_queue(struct process_queue_manager *pqm, struct kfd_node *dev, struct queue **q, struct queue_properties *q_properties, - struct file *f, struct amdgpu_bo *wptr_bo, unsigned int qid) { int retval; @@ -255,12 +275,32 @@ static int init_user_queue(struct process_queue_manager *pqm, goto cleanup; } memset((*q)->gang_ctx_cpu_ptr, 0, AMDGPU_MES_GANG_CTX_SIZE); - (*q)->wptr_bo = wptr_bo; + + /* Starting with GFX11, wptr BOs must be mapped to GART for MES to determine work + * on unmapped queues for usermode queue oversubscription (no aggregated doorbell) + */ + if (((dev->adev->mes.sched_version & AMDGPU_MES_API_VERSION_MASK) + >> AMDGPU_MES_API_VERSION_SHIFT) >= 2) { + if (dev->adev != amdgpu_ttm_adev(q_properties->wptr_bo->tbo.bdev)) { + pr_err("Queue memory allocated to wrong device\n"); + retval = -EINVAL; + goto free_gang_ctx_bo; + } + + retval = amdgpu_amdkfd_map_gtt_bo_to_gart(q_properties->wptr_bo, + &(*q)->wptr_bo_gart); + if (retval) { + pr_err("Failed to map wptr bo to GART\n"); + goto free_gang_ctx_bo; + } + } } pr_debug("PQM After init queue"); return 0; +free_gang_ctx_bo: + amdgpu_amdkfd_free_gtt_mem(dev->adev, &(*q)->gang_ctx_bo); cleanup: uninit_queue(*q); *q = NULL; @@ -269,10 +309,8 @@ cleanup: int pqm_create_queue(struct process_queue_manager *pqm, struct kfd_node *dev, - struct file *f, struct queue_properties *properties, unsigned int *qid, - struct amdgpu_bo *wptr_bo, const struct kfd_criu_queue_priv_data *q_data, const void *restore_mqd, const void *restore_ctl_stack, @@ -287,10 +325,12 @@ int pqm_create_queue(struct process_queue_manager *pqm, unsigned int max_queues = 127; /* HWS limit */ /* - * On GFX 9.4.3, increase the number of queues that - * can be created to 255. No HWS limit on GFX 9.4.3. + * On GFX 9.4.3/9.5.0, increase the number of queues that + * can be created to 255. No HWS limit on GFX 9.4.3/9.5.0. */ - if (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 3)) + if (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 3) || + KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 4) || + KFD_GC_VERSION(dev) == IP_VERSION(9, 5, 0)) max_queues = 255; q = NULL; @@ -336,13 +376,14 @@ int pqm_create_queue(struct process_queue_manager *pqm, switch (type) { case KFD_QUEUE_TYPE_SDMA: case KFD_QUEUE_TYPE_SDMA_XGMI: + case KFD_QUEUE_TYPE_SDMA_BY_ENG_ID: /* SDMA queues are always allocated statically no matter * which scheduler mode is used. We also do not need to * check whether a SDMA queue can be allocated here, because * allocate_sdma_queue() in create_queue() has the * corresponding check logic. */ - retval = init_user_queue(pqm, dev, &q, properties, f, wptr_bo, *qid); + retval = init_user_queue(pqm, dev, &q, properties, *qid); if (retval != 0) goto err_create_queue; pqn->q = q; @@ -363,7 +404,7 @@ int pqm_create_queue(struct process_queue_manager *pqm, goto err_create_queue; } - retval = init_user_queue(pqm, dev, &q, properties, f, wptr_bo, *qid); + retval = init_user_queue(pqm, dev, &q, properties, *qid); if (retval != 0) goto err_create_queue; pqn->q = q; @@ -430,7 +471,7 @@ int pqm_create_queue(struct process_queue_manager *pqm, err_create_queue: uninit_queue(q); if (kq) - kernel_queue_uninit(kq, false); + kernel_queue_uninit(kq); kfree(pqn); err_allocate_pqn: /* check if queues list is empty unregister process from device */ @@ -477,11 +518,14 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) /* destroy kernel queue (DIQ) */ dqm = pqn->kq->dev->dqm; dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd); - kernel_queue_uninit(pqn->kq, false); + kernel_queue_uninit(pqn->kq); } if (pqn->q) { - kfd_procfs_del_queue(pqn->q); + retval = kfd_queue_unref_bo_vas(pdd, &pqn->q->properties); + if (retval) + goto err_destroy_queue; + dqm = pqn->q->device->dqm; retval = dqm->ops.destroy_queue(dqm, &pdd->qpd, pqn->q); if (retval) { @@ -491,7 +535,8 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) if (retval != -ETIME) goto err_destroy_queue; } - + kfd_procfs_del_queue(pqn->q); + kfd_queue_release_buffers(pdd, &pqn->q->properties); pqm_clean_queue_resource(pqm, pqn); uninit_queue(pqn->q); } @@ -515,11 +560,42 @@ int pqm_update_queue_properties(struct process_queue_manager *pqm, struct process_queue_node *pqn; pqn = get_queue_by_qid(pqm, qid); - if (!pqn) { + if (!pqn || !pqn->q) { pr_debug("No queue %d exists for update operation\n", qid); return -EFAULT; } + /* + * Update with NULL ring address is used to disable the queue + */ + if (p->queue_address && p->queue_size) { + struct kfd_process_device *pdd; + struct amdgpu_vm *vm; + struct queue *q = pqn->q; + int err; + + pdd = kfd_get_process_device_data(q->device, q->process); + if (!pdd) + return -ENODEV; + vm = drm_priv_to_vm(pdd->drm_priv); + err = amdgpu_bo_reserve(vm->root.bo, false); + if (err) + return err; + + if (kfd_queue_buffer_get(vm, (void *)p->queue_address, &p->ring_bo, + p->queue_size)) { + pr_debug("ring buf 0x%llx size 0x%llx not mapped on GPU\n", + p->queue_address, p->queue_size); + return -EFAULT; + } + + kfd_queue_unref_bo_va(vm, &pqn->q->properties.ring_bo); + kfd_queue_buffer_put(&pqn->q->properties.ring_bo); + amdgpu_bo_unreserve(vm->root.bo); + + pqn->q->properties.ring_bo = p->ring_bo; + } + pqn->q->properties.queue_address = p->queue_address; pqn->q->properties.queue_size = p->queue_size; pqn->q->properties.queue_percent = p->queue_percent; @@ -962,8 +1038,7 @@ int kfd_criu_restore_queue(struct kfd_process *p, print_queue_properties(&qp); - ret = pqm_create_queue(&p->pqm, pdd->dev, NULL, &qp, &queue_id, NULL, q_data, mqd, ctl_stack, - NULL); + ret = pqm_create_queue(&p->pqm, pdd->dev, &qp, &queue_id, q_data, mqd, ctl_stack, NULL); if (ret) { pr_err("Failed to create new queue err:%d\n", ret); goto exit; @@ -979,6 +1054,7 @@ exit: pr_debug("Queue id %d was restored successfully\n", queue_id); kfree(q_data); + kfree(q_extra_data); return ret; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c index 0f6992b1895c..4afff7094caf 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c @@ -24,6 +24,8 @@ #include <linux/slab.h> #include "kfd_priv.h" +#include "kfd_topology.h" +#include "kfd_svm.h" void print_queue_properties(struct queue_properties *q) { @@ -82,3 +84,386 @@ void uninit_queue(struct queue *q) { kfree(q); } + +#if IS_ENABLED(CONFIG_HSA_AMD_SVM) + +static int kfd_queue_buffer_svm_get(struct kfd_process_device *pdd, u64 addr, u64 size) +{ + struct kfd_process *p = pdd->process; + struct list_head update_list; + struct svm_range *prange; + int ret = -EINVAL; + + INIT_LIST_HEAD(&update_list); + addr >>= PAGE_SHIFT; + size >>= PAGE_SHIFT; + + mutex_lock(&p->svms.lock); + + /* + * range may split to multiple svm pranges aligned to granularity boundaery. + */ + while (size) { + uint32_t gpuid, gpuidx; + int r; + + prange = svm_range_from_addr(&p->svms, addr, NULL); + if (!prange) + break; + + if (!prange->mapped_to_gpu) + break; + + r = kfd_process_gpuid_from_node(p, pdd->dev, &gpuid, &gpuidx); + if (r < 0) + break; + if (!test_bit(gpuidx, prange->bitmap_access) && + !test_bit(gpuidx, prange->bitmap_aip)) + break; + + if (!(prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED)) + break; + + list_add(&prange->update_list, &update_list); + + if (prange->last - prange->start + 1 >= size) { + size = 0; + break; + } + + size -= prange->last - prange->start + 1; + addr += prange->last - prange->start + 1; + } + if (size) { + pr_debug("[0x%llx 0x%llx] not registered\n", addr, addr + size - 1); + goto out_unlock; + } + + list_for_each_entry(prange, &update_list, update_list) + atomic_inc(&prange->queue_refcount); + ret = 0; + +out_unlock: + mutex_unlock(&p->svms.lock); + return ret; +} + +static void kfd_queue_buffer_svm_put(struct kfd_process_device *pdd, u64 addr, u64 size) +{ + struct kfd_process *p = pdd->process; + struct svm_range *prange, *pchild; + struct interval_tree_node *node; + unsigned long last; + + addr >>= PAGE_SHIFT; + last = addr + (size >> PAGE_SHIFT) - 1; + + mutex_lock(&p->svms.lock); + + node = interval_tree_iter_first(&p->svms.objects, addr, last); + while (node) { + struct interval_tree_node *next_node; + unsigned long next_start; + + prange = container_of(node, struct svm_range, it_node); + next_node = interval_tree_iter_next(node, addr, last); + next_start = min(node->last, last) + 1; + + if (atomic_add_unless(&prange->queue_refcount, -1, 0)) { + list_for_each_entry(pchild, &prange->child_list, child_list) + atomic_add_unless(&pchild->queue_refcount, -1, 0); + } + + node = next_node; + addr = next_start; + } + + mutex_unlock(&p->svms.lock); +} +#else + +static int kfd_queue_buffer_svm_get(struct kfd_process_device *pdd, u64 addr, u64 size) +{ + return -EINVAL; +} + +static void kfd_queue_buffer_svm_put(struct kfd_process_device *pdd, u64 addr, u64 size) +{ +} + +#endif + +int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct amdgpu_bo **pbo, + u64 expected_size) +{ + struct amdgpu_bo_va_mapping *mapping; + u64 user_addr; + u64 size; + + user_addr = (u64)addr >> AMDGPU_GPU_PAGE_SHIFT; + size = expected_size >> AMDGPU_GPU_PAGE_SHIFT; + + mapping = amdgpu_vm_bo_lookup_mapping(vm, user_addr); + if (!mapping) + goto out_err; + + if (user_addr != mapping->start || + (size != 0 && user_addr + size - 1 != mapping->last)) { + pr_debug("expected size 0x%llx not equal to mapping addr 0x%llx size 0x%llx\n", + expected_size, mapping->start << AMDGPU_GPU_PAGE_SHIFT, + (mapping->last - mapping->start + 1) << AMDGPU_GPU_PAGE_SHIFT); + goto out_err; + } + + *pbo = amdgpu_bo_ref(mapping->bo_va->base.bo); + mapping->bo_va->queue_refcount++; + return 0; + +out_err: + *pbo = NULL; + return -EINVAL; +} + +/* FIXME: remove this function, just call amdgpu_bo_unref directly */ +void kfd_queue_buffer_put(struct amdgpu_bo **bo) +{ + amdgpu_bo_unref(bo); +} + +int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_properties *properties) +{ + struct kfd_topology_device *topo_dev; + u64 expected_queue_size; + struct amdgpu_vm *vm; + u32 total_cwsr_size; + int err; + + topo_dev = kfd_topology_device_by_id(pdd->dev->id); + if (!topo_dev) + return -EINVAL; + + /* AQL queues on GFX7 and GFX8 appear twice their actual size */ + if (properties->type == KFD_QUEUE_TYPE_COMPUTE && + properties->format == KFD_QUEUE_FORMAT_AQL && + topo_dev->node_props.gfx_target_version >= 70000 && + topo_dev->node_props.gfx_target_version < 90000) + expected_queue_size = properties->queue_size / 2; + else + expected_queue_size = properties->queue_size; + + vm = drm_priv_to_vm(pdd->drm_priv); + err = amdgpu_bo_reserve(vm->root.bo, false); + if (err) + return err; + + err = kfd_queue_buffer_get(vm, properties->write_ptr, &properties->wptr_bo, PAGE_SIZE); + if (err) + goto out_err_unreserve; + + err = kfd_queue_buffer_get(vm, properties->read_ptr, &properties->rptr_bo, PAGE_SIZE); + if (err) + goto out_err_unreserve; + + err = kfd_queue_buffer_get(vm, (void *)properties->queue_address, + &properties->ring_bo, expected_queue_size); + if (err) + goto out_err_unreserve; + + /* only compute queue requires EOP buffer and CWSR area */ + if (properties->type != KFD_QUEUE_TYPE_COMPUTE) + goto out_unreserve; + + /* EOP buffer is not required for all ASICs */ + if (properties->eop_ring_buffer_address) { + if (properties->eop_ring_buffer_size != topo_dev->node_props.eop_buffer_size) { + pr_debug("queue eop bo size 0x%x not equal to node eop buf size 0x%x\n", + properties->eop_ring_buffer_size, + topo_dev->node_props.eop_buffer_size); + err = -EINVAL; + goto out_err_unreserve; + } + err = kfd_queue_buffer_get(vm, (void *)properties->eop_ring_buffer_address, + &properties->eop_buf_bo, + properties->eop_ring_buffer_size); + if (err) + goto out_err_unreserve; + } + + if (properties->ctl_stack_size != topo_dev->node_props.ctl_stack_size) { + pr_debug("queue ctl stack size 0x%x not equal to node ctl stack size 0x%x\n", + properties->ctl_stack_size, + topo_dev->node_props.ctl_stack_size); + err = -EINVAL; + goto out_err_unreserve; + } + + if (properties->ctx_save_restore_area_size != topo_dev->node_props.cwsr_size) { + pr_debug("queue cwsr size 0x%x not equal to node cwsr size 0x%x\n", + properties->ctx_save_restore_area_size, + topo_dev->node_props.cwsr_size); + err = -EINVAL; + goto out_err_unreserve; + } + + total_cwsr_size = (topo_dev->node_props.cwsr_size + topo_dev->node_props.debug_memory_size) + * NUM_XCC(pdd->dev->xcc_mask); + total_cwsr_size = ALIGN(total_cwsr_size, PAGE_SIZE); + + err = kfd_queue_buffer_get(vm, (void *)properties->ctx_save_restore_area_address, + &properties->cwsr_bo, total_cwsr_size); + if (!err) + goto out_unreserve; + + amdgpu_bo_unreserve(vm->root.bo); + + err = kfd_queue_buffer_svm_get(pdd, properties->ctx_save_restore_area_address, + total_cwsr_size); + if (err) + goto out_err_release; + + return 0; + +out_unreserve: + amdgpu_bo_unreserve(vm->root.bo); + return 0; + +out_err_unreserve: + amdgpu_bo_unreserve(vm->root.bo); +out_err_release: + /* FIXME: make a _locked version of this that can be called before + * dropping the VM reservation. + */ + kfd_queue_unref_bo_vas(pdd, properties); + kfd_queue_release_buffers(pdd, properties); + return err; +} + +int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_properties *properties) +{ + struct kfd_topology_device *topo_dev; + u32 total_cwsr_size; + + kfd_queue_buffer_put(&properties->wptr_bo); + kfd_queue_buffer_put(&properties->rptr_bo); + kfd_queue_buffer_put(&properties->ring_bo); + kfd_queue_buffer_put(&properties->eop_buf_bo); + kfd_queue_buffer_put(&properties->cwsr_bo); + + topo_dev = kfd_topology_device_by_id(pdd->dev->id); + if (!topo_dev) + return -EINVAL; + total_cwsr_size = (topo_dev->node_props.cwsr_size + topo_dev->node_props.debug_memory_size) + * NUM_XCC(pdd->dev->xcc_mask); + total_cwsr_size = ALIGN(total_cwsr_size, PAGE_SIZE); + + kfd_queue_buffer_svm_put(pdd, properties->ctx_save_restore_area_address, total_cwsr_size); + return 0; +} + +void kfd_queue_unref_bo_va(struct amdgpu_vm *vm, struct amdgpu_bo **bo) +{ + if (*bo) { + struct amdgpu_bo_va *bo_va; + + bo_va = amdgpu_vm_bo_find(vm, *bo); + if (bo_va && bo_va->queue_refcount) + bo_va->queue_refcount--; + } +} + +int kfd_queue_unref_bo_vas(struct kfd_process_device *pdd, + struct queue_properties *properties) +{ + struct amdgpu_vm *vm; + int err; + + vm = drm_priv_to_vm(pdd->drm_priv); + err = amdgpu_bo_reserve(vm->root.bo, false); + if (err) + return err; + + kfd_queue_unref_bo_va(vm, &properties->wptr_bo); + kfd_queue_unref_bo_va(vm, &properties->rptr_bo); + kfd_queue_unref_bo_va(vm, &properties->ring_bo); + kfd_queue_unref_bo_va(vm, &properties->eop_buf_bo); + kfd_queue_unref_bo_va(vm, &properties->cwsr_bo); + + amdgpu_bo_unreserve(vm->root.bo); + return 0; +} + +#define SGPR_SIZE_PER_CU 0x4000 +#define LDS_SIZE_PER_CU 0x10000 +#define HWREG_SIZE_PER_CU 0x1000 +#define DEBUGGER_BYTES_ALIGN 64 +#define DEBUGGER_BYTES_PER_WAVE 32 + +static u32 kfd_get_vgpr_size_per_cu(u32 gfxv) +{ + u32 vgpr_size = 0x40000; + + if ((gfxv / 100 * 100) == 90400 || /* GFX_VERSION_AQUA_VANJARAM */ + gfxv == 90010 || /* GFX_VERSION_ALDEBARAN */ + gfxv == 90008 || /* GFX_VERSION_ARCTURUS */ + gfxv == 90500) + vgpr_size = 0x80000; + else if (gfxv == 110000 || /* GFX_VERSION_PLUM_BONITO */ + gfxv == 110001 || /* GFX_VERSION_WHEAT_NAS */ + gfxv == 120000 || /* GFX_VERSION_GFX1200 */ + gfxv == 120001) /* GFX_VERSION_GFX1201 */ + vgpr_size = 0x60000; + + return vgpr_size; +} + +#define WG_CONTEXT_DATA_SIZE_PER_CU(gfxv, props) \ + (kfd_get_vgpr_size_per_cu(gfxv) + SGPR_SIZE_PER_CU +\ + (((gfxv) == 90500) ? (props->lds_size_in_kb << 10) : LDS_SIZE_PER_CU) +\ + HWREG_SIZE_PER_CU) + +#define CNTL_STACK_BYTES_PER_WAVE(gfxv) \ + ((gfxv) >= 100100 ? 12 : 8) /* GFX_VERSION_NAVI10*/ + +#define SIZEOF_HSA_USER_CONTEXT_SAVE_AREA_HEADER 40 + +void kfd_queue_ctx_save_restore_size(struct kfd_topology_device *dev) +{ + struct kfd_node_properties *props = &dev->node_props; + u32 gfxv = props->gfx_target_version; + u32 ctl_stack_size; + u32 wg_data_size; + u32 wave_num; + u32 cu_num; + + if (gfxv < 80001) /* GFX_VERSION_CARRIZO */ + return; + + cu_num = props->simd_count / props->simd_per_cu / NUM_XCC(dev->gpu->xcc_mask); + wave_num = (gfxv < 100100) ? /* GFX_VERSION_NAVI10 */ + min(cu_num * 40, props->array_count / props->simd_arrays_per_engine * 512) + : cu_num * 32; + + wg_data_size = ALIGN(cu_num * WG_CONTEXT_DATA_SIZE_PER_CU(gfxv, props), PAGE_SIZE); + ctl_stack_size = wave_num * CNTL_STACK_BYTES_PER_WAVE(gfxv) + 8; + ctl_stack_size = ALIGN(SIZEOF_HSA_USER_CONTEXT_SAVE_AREA_HEADER + ctl_stack_size, + PAGE_SIZE); + + if ((gfxv / 10000 * 10000) == 100000) { + /* HW design limits control stack size to 0x7000. + * This is insufficient for theoretical PM4 cases + * but sufficient for AQL, limited by SPI events. + */ + ctl_stack_size = min(ctl_stack_size, 0x7000); + } + + props->ctl_stack_size = ctl_stack_size; + props->debug_memory_size = ALIGN(wave_num * DEBUGGER_BYTES_PER_WAVE, DEBUGGER_BYTES_ALIGN); + props->cwsr_size = ctl_stack_size + wg_data_size; + + if (gfxv == 80002) /* GFX_VERSION_TONGA */ + props->eop_buffer_size = 0x8000; + else if ((gfxv / 100 * 100) == 90400) /* GFX_VERSION_AQUA_VANJARAM */ + props->eop_buffer_size = 4096; + else if (gfxv >= 80000) + props->eop_buffer_size = 4096; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c index d9953c2b2661..9b8169761ec5 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c @@ -29,6 +29,7 @@ #include "amdgpu_vm.h" #include "kfd_priv.h" #include "kfd_smi_events.h" +#include "amdgpu_reset.h" struct kfd_smi_client { struct list_head list; @@ -43,7 +44,7 @@ struct kfd_smi_client { bool suser; }; -#define MAX_KFIFO_SIZE 1024 +#define KFD_MAX_KFIFO_SIZE 8192 static __poll_t kfd_smi_ev_poll(struct file *, struct poll_table_struct *); static ssize_t kfd_smi_ev_read(struct file *, char __user *, size_t, loff_t *); @@ -85,7 +86,7 @@ static ssize_t kfd_smi_ev_read(struct file *filep, char __user *user, struct kfd_smi_client *client = filep->private_data; unsigned char *buf; - size = min_t(size_t, size, MAX_KFIFO_SIZE); + size = min_t(size_t, size, KFD_MAX_KFIFO_SIZE); buf = kmalloc(size, GFP_KERNEL); if (!buf) return -ENOMEM; @@ -215,9 +216,11 @@ static void kfd_smi_event_add(pid_t pid, struct kfd_node *dev, add_event_to_kfifo(pid, dev, event, fifo_in, len); } -void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset) +void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset, + struct amdgpu_reset_context *reset_context) { unsigned int event; + char reset_cause[64]; if (post_reset) { event = KFD_SMI_EVENT_GPU_POST_RESET; @@ -225,29 +228,37 @@ void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset) event = KFD_SMI_EVENT_GPU_PRE_RESET; ++(dev->reset_seq_num); } - kfd_smi_event_add(0, dev, event, "%x\n", dev->reset_seq_num); + + memset(reset_cause, 0, sizeof(reset_cause)); + + if (reset_context) + amdgpu_reset_get_desc(reset_context, reset_cause, + sizeof(reset_cause)); + + kfd_smi_event_add(0, dev, event, KFD_EVENT_FMT_UPDATE_GPU_RESET( + dev->reset_seq_num, reset_cause)); } void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev, uint64_t throttle_bitmask) { - kfd_smi_event_add(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE, "%llx:%llx\n", + kfd_smi_event_add(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE, KFD_EVENT_FMT_THERMAL_THROTTLING( throttle_bitmask, - amdgpu_dpm_get_thermal_throttling_counter(dev->adev)); + amdgpu_dpm_get_thermal_throttling_counter(dev->adev))); } void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid) { - struct amdgpu_task_info task_info; - - memset(&task_info, 0, sizeof(struct amdgpu_task_info)); - amdgpu_vm_get_task_info(dev->adev, pasid, &task_info); - /* Report VM faults from user applications, not retry from kernel */ - if (!task_info.pid) - return; - - kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, "%x:%s\n", - task_info.pid, task_info.task_name); + struct amdgpu_task_info *task_info; + + task_info = amdgpu_vm_get_task_info_pasid(dev->adev, pasid); + if (task_info) { + /* Report VM faults from user applications, not retry from kernel */ + if (task_info->pid) + kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, KFD_EVENT_FMT_VMFAULT( + task_info->pid, task_info->task_name)); + amdgpu_vm_put_task_info(task_info); + } } void kfd_smi_event_page_fault_start(struct kfd_node *node, pid_t pid, @@ -255,16 +266,16 @@ void kfd_smi_event_page_fault_start(struct kfd_node *node, pid_t pid, ktime_t ts) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_PAGE_FAULT_START, - "%lld -%d @%lx(%x) %c\n", ktime_to_ns(ts), pid, - address, node->id, write_fault ? 'W' : 'R'); + KFD_EVENT_FMT_PAGEFAULT_START(ktime_to_ns(ts), pid, + address, node->id, write_fault ? 'W' : 'R')); } void kfd_smi_event_page_fault_end(struct kfd_node *node, pid_t pid, unsigned long address, bool migration) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_PAGE_FAULT_END, - "%lld -%d @%lx(%x) %c\n", ktime_get_boottime_ns(), - pid, address, node->id, migration ? 'M' : 'U'); + KFD_EVENT_FMT_PAGEFAULT_END(ktime_get_boottime_ns(), + pid, address, node->id, migration ? 'M' : 'U')); } void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid, @@ -274,34 +285,35 @@ void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid, uint32_t trigger) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_START, - "%lld -%d @%lx(%lx) %x->%x %x:%x %d\n", + KFD_EVENT_FMT_MIGRATE_START( ktime_get_boottime_ns(), pid, start, end - start, - from, to, prefetch_loc, preferred_loc, trigger); + from, to, prefetch_loc, preferred_loc, trigger)); } void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid, unsigned long start, unsigned long end, - uint32_t from, uint32_t to, uint32_t trigger) + uint32_t from, uint32_t to, uint32_t trigger, + int error_code) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_END, - "%lld -%d @%lx(%lx) %x->%x %d\n", + KFD_EVENT_FMT_MIGRATE_END( ktime_get_boottime_ns(), pid, start, end - start, - from, to, trigger); + from, to, trigger, error_code)); } void kfd_smi_event_queue_eviction(struct kfd_node *node, pid_t pid, uint32_t trigger) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_QUEUE_EVICTION, - "%lld -%d %x %d\n", ktime_get_boottime_ns(), pid, - node->id, trigger); + KFD_EVENT_FMT_QUEUE_EVICTION(ktime_get_boottime_ns(), pid, + node->id, trigger)); } void kfd_smi_event_queue_restore(struct kfd_node *node, pid_t pid) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_QUEUE_RESTORE, - "%lld -%d %x\n", ktime_get_boottime_ns(), pid, - node->id); + KFD_EVENT_FMT_QUEUE_RESTORE(ktime_get_boottime_ns(), pid, + node->id, 0)); } void kfd_smi_event_queue_restore_rescheduled(struct mm_struct *mm) @@ -318,8 +330,8 @@ void kfd_smi_event_queue_restore_rescheduled(struct mm_struct *mm) kfd_smi_event_add(p->lead_thread->pid, pdd->dev, KFD_SMI_EVENT_QUEUE_RESTORE, - "%lld -%d %x %c\n", ktime_get_boottime_ns(), - p->lead_thread->pid, pdd->dev->id, 'R'); + KFD_EVENT_FMT_QUEUE_RESTORE(ktime_get_boottime_ns(), + p->lead_thread->pid, pdd->dev->id, 'R')); } kfd_unref_process(p); } @@ -329,8 +341,8 @@ void kfd_smi_event_unmap_from_gpu(struct kfd_node *node, pid_t pid, uint32_t trigger) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_UNMAP_FROM_GPU, - "%lld -%d @%lx(%lx) %x %d\n", ktime_get_boottime_ns(), - pid, address, last - address + 1, node->id, trigger); + KFD_EVENT_FMT_UNMAP_FROM_GPU(ktime_get_boottime_ns(), + pid, address, last - address + 1, node->id, trigger)); } int kfd_smi_event_open(struct kfd_node *dev, uint32_t *fd) @@ -343,7 +355,7 @@ int kfd_smi_event_open(struct kfd_node *dev, uint32_t *fd) return -ENOMEM; INIT_LIST_HEAD(&client->list); - ret = kfifo_alloc(&client->fifo, MAX_KFIFO_SIZE, GFP_KERNEL); + ret = kfifo_alloc(&client->fifo, KFD_MAX_KFIFO_SIZE, GFP_KERNEL); if (ret) { kfree(client); return ret; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h index fa95c2dfd587..503bff13d815 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h @@ -24,11 +24,14 @@ #ifndef KFD_SMI_EVENTS_H_INCLUDED #define KFD_SMI_EVENTS_H_INCLUDED +struct amdgpu_reset_context; + int kfd_smi_event_open(struct kfd_node *dev, uint32_t *fd); void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid); void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev, uint64_t throttle_bitmask); -void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset); +void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset, + struct amdgpu_reset_context *reset_context); void kfd_smi_event_page_fault_start(struct kfd_node *node, pid_t pid, unsigned long address, bool write_fault, ktime_t ts); @@ -41,7 +44,8 @@ void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid, uint32_t trigger); void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid, unsigned long start, unsigned long end, - uint32_t from, uint32_t to, uint32_t trigger); + uint32_t from, uint32_t to, uint32_t trigger, + int error_code); void kfd_smi_event_queue_eviction(struct kfd_node *node, pid_t pid, uint32_t trigger); void kfd_smi_event_queue_restore(struct kfd_node *node, pid_t pid); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index c50a0dc9c9c0..9477a4adcd36 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -309,12 +309,13 @@ static void svm_range_free(struct svm_range *prange, bool do_unmap) } static void -svm_range_set_default_attributes(int32_t *location, int32_t *prefetch_loc, - uint8_t *granularity, uint32_t *flags) +svm_range_set_default_attributes(struct svm_range_list *svms, int32_t *location, + int32_t *prefetch_loc, uint8_t *granularity, + uint32_t *flags) { *location = KFD_IOCTL_SVM_LOCATION_UNDEFINED; *prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED; - *granularity = 9; + *granularity = svms->default_granularity; *flags = KFD_IOCTL_SVM_FLAG_HOST_ACCESS | KFD_IOCTL_SVM_FLAG_COHERENT; } @@ -358,7 +359,7 @@ svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start, bitmap_copy(prange->bitmap_access, svms->bitmap_supported, MAX_GPU_INSTANCE); - svm_range_set_default_attributes(&prange->preferred_loc, + svm_range_set_default_attributes(svms, &prange->preferred_loc, &prange->prefetch_loc, &prange->granularity, &prange->flags); @@ -404,6 +405,27 @@ static void svm_range_bo_release(struct kref *kref) spin_lock(&svm_bo->list_lock); } spin_unlock(&svm_bo->list_lock); + + if (mmget_not_zero(svm_bo->eviction_fence->mm)) { + struct kfd_process_device *pdd; + struct kfd_process *p; + struct mm_struct *mm; + + mm = svm_bo->eviction_fence->mm; + /* + * The forked child process takes svm_bo device pages ref, svm_bo could be + * released after parent process is gone. + */ + p = kfd_lookup_process_by_mm(mm); + if (p) { + pdd = kfd_get_process_device_data(svm_bo->node, p); + if (pdd) + atomic64_sub(amdgpu_bo_size(svm_bo->bo), &pdd->vram_usage); + kfd_unref_process(p); + } + mmput(mm); + } + if (!dma_fence_is_signaled(&svm_bo->eviction_fence->base)) /* We're not in the eviction worker. Signal the fence. */ dma_fence_signal(&svm_bo->eviction_fence->base); @@ -531,6 +553,7 @@ int svm_range_vram_node_new(struct kfd_node *node, struct svm_range *prange, bool clear) { + struct kfd_process_device *pdd; struct amdgpu_bo_param bp; struct svm_range_bo *svm_bo; struct amdgpu_bo_user *ubo; @@ -622,6 +645,10 @@ svm_range_vram_node_new(struct kfd_node *node, struct svm_range *prange, list_add(&prange->svm_bo_list, &svm_bo->range_list); spin_unlock(&svm_bo->list_lock); + pdd = svm_range_get_pdd_by_node(prange, node); + if (pdd) + atomic64_add(amdgpu_bo_size(bo), &pdd->vram_usage); + return 0; reserve_bo_failed: @@ -1051,6 +1078,7 @@ svm_range_split_adjust(struct svm_range *new, struct svm_range *old, new->mapped_to_gpu = old->mapped_to_gpu; bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE); bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE); + atomic_set(&new->queue_refcount, atomic_read(&old->queue_refcount)); return 0; } @@ -1167,17 +1195,17 @@ svm_range_get_pte_flags(struct kfd_node *node, struct kfd_node *bo_node; uint32_t flags = prange->flags; uint32_t mapping_flags = 0; + uint32_t gc_ip_version = KFD_GC_VERSION(node); uint64_t pte_flags; bool snoop = (domain != SVM_RANGE_VRAM_DOMAIN); bool coherent = flags & (KFD_IOCTL_SVM_FLAG_COHERENT | KFD_IOCTL_SVM_FLAG_EXT_COHERENT); bool ext_coherent = flags & KFD_IOCTL_SVM_FLAG_EXT_COHERENT; - bool uncached = false; /*flags & KFD_IOCTL_SVM_FLAG_UNCACHED;*/ unsigned int mtype_local; if (domain == SVM_RANGE_VRAM_DOMAIN) bo_node = prange->svm_bo->node; - switch (amdgpu_ip_version(node->adev, GC_HWIP, 0)) { + switch (gc_ip_version) { case IP_VERSION(9, 4, 1): if (domain == SVM_RANGE_VRAM_DOMAIN) { if (bo_node == node) { @@ -1213,15 +1241,16 @@ svm_range_get_pte_flags(struct kfd_node *node, } break; case IP_VERSION(9, 4, 3): + case IP_VERSION(9, 4, 4): + case IP_VERSION(9, 5, 0): if (ext_coherent) - mtype_local = node->adev->rev_id ? AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_UC; + mtype_local = (gc_ip_version < IP_VERSION(9, 5, 0) && !node->adev->rev_id) ? + AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_CC; else mtype_local = amdgpu_mtype_local == 1 ? AMDGPU_VM_MTYPE_NC : amdgpu_mtype_local == 2 ? AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW; snoop = true; - if (uncached) { - mapping_flags |= AMDGPU_VM_MTYPE_UC; - } else if (domain == SVM_RANGE_VRAM_DOMAIN) { + if (domain == SVM_RANGE_VRAM_DOMAIN) { /* local HBM region close to partition */ if (bo_node->adev == node->adev && (!bo_node->xcp || !node->xcp || bo_node->xcp->mem_id == node->xcp->mem_id)) @@ -1231,9 +1260,13 @@ svm_range_get_pte_flags(struct kfd_node *node, */ else if (svm_nodes_in_same_hive(bo_node, node) && !ext_coherent) mapping_flags |= AMDGPU_VM_MTYPE_NC; - /* PCIe P2P or extended system scope coherence */ - else + /* PCIe P2P on GPUs pre-9.5.0 */ + else if (gc_ip_version < IP_VERSION(9, 5, 0) && + !svm_nodes_in_same_hive(bo_node, node)) mapping_flags |= AMDGPU_VM_MTYPE_UC; + /* Other remote memory */ + else + mapping_flags |= ext_coherent ? AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; /* system memory accessed by the APU */ } else if (node->adev->flags & AMD_IS_APU) { /* On NUMA systems, locality is determined per-page @@ -1245,9 +1278,16 @@ svm_range_get_pte_flags(struct kfd_node *node, mapping_flags |= ext_coherent ? AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; /* system memory accessed by the dGPU */ } else { - mapping_flags |= AMDGPU_VM_MTYPE_UC; + if (gc_ip_version < IP_VERSION(9, 5, 0)) + mapping_flags |= AMDGPU_VM_MTYPE_UC; + else + mapping_flags |= AMDGPU_VM_MTYPE_NC; } break; + case IP_VERSION(12, 0, 0): + case IP_VERSION(12, 0, 1): + mapping_flags |= AMDGPU_VM_MTYPE_NC; + break; default: mapping_flags |= coherent ? AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; @@ -1263,6 +1303,8 @@ svm_range_get_pte_flags(struct kfd_node *node, pte_flags = AMDGPU_PTE_VALID; pte_flags |= (domain == SVM_RANGE_VRAM_DOMAIN) ? 0 : AMDGPU_PTE_SYSTEM; pte_flags |= snoop ? AMDGPU_PTE_SNOOPED : 0; + if (gc_ip_version >= IP_VERSION(12, 0, 0)) + pte_flags |= AMDGPU_PTE_IS_PTE; pte_flags |= amdgpu_gem_va_map_flags(node->adev, mapping_flags); return pte_flags; @@ -1515,9 +1557,9 @@ static int svm_range_reserve_bos(struct svm_validate_context *ctx, bool intr) goto unreserve_out; } - r = amdgpu_vm_validate_pt_bos(pdd->dev->adev, - drm_priv_to_vm(pdd->drm_priv), - svm_range_bo_validate, NULL); + r = amdgpu_vm_validate(pdd->dev->adev, + drm_priv_to_vm(pdd->drm_priv), NULL, + svm_range_bo_validate, NULL); if (r) { pr_debug("failed %d validate pt bos\n", r); goto unreserve_out; @@ -1641,7 +1683,9 @@ static int svm_range_validate_and_map(struct mm_struct *mm, goto free_ctx; } - svm_range_reserve_bos(ctx, intr); + r = svm_range_reserve_bos(ctx, intr); + if (r) + goto free_ctx; p = container_of(prange->svms, struct kfd_process, svms); owner = kfd_svm_page_owner(p, find_first_bit(ctx->bitmap, @@ -1656,7 +1700,7 @@ static int svm_range_validate_and_map(struct mm_struct *mm, start = map_start << PAGE_SHIFT; end = (map_last + 1) << PAGE_SHIFT; for (addr = start; !r && addr < end; ) { - struct hmm_range *hmm_range; + struct hmm_range *hmm_range = NULL; unsigned long map_start_vma; unsigned long map_last_vma; struct vm_area_struct *vma; @@ -1676,11 +1720,8 @@ static int svm_range_validate_and_map(struct mm_struct *mm, readonly, owner, NULL, &hmm_range); WRITE_ONCE(p->svms.faulting_task, NULL); - if (r) { + if (r) pr_debug("failed %d to get svm range pages\n", r); - if (r == -EBUSY) - r = -EAGAIN; - } } else { r = -EFAULT; } @@ -1694,7 +1735,12 @@ static int svm_range_validate_and_map(struct mm_struct *mm, } svm_range_lock(prange); - if (!r && amdgpu_hmm_range_get_pages_done(hmm_range)) { + + /* Free backing memory of hmm_range if it was initialized + * Overrride return value to TRY AGAIN only if prior returns + * were successful + */ + if (hmm_range && amdgpu_hmm_range_get_pages_done(hmm_range) && !r) { pr_debug("hmm update the range, need validate again\n"); r = -EAGAIN; } @@ -1978,6 +2024,7 @@ static struct svm_range *svm_range_clone(struct svm_range *old) new->vram_pages = old->vram_pages; bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE); bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE); + atomic_set(&new->queue_refcount, atomic_read(&old->queue_refcount)); return new; } @@ -2246,16 +2293,10 @@ static void svm_range_drain_retry_fault(struct svm_range_list *svms) { struct kfd_process_device *pdd; struct kfd_process *p; - int drain; uint32_t i; p = container_of(svms, struct kfd_process, svms); -restart: - drain = atomic_read(&svms->drain_pagefaults); - if (!drain) - return; - for_each_set_bit(i, svms->bitmap_supported, p->n_pdds) { pdd = p->pdds[i]; if (!pdd) @@ -2275,8 +2316,6 @@ restart: pr_debug("drain retry fault gpu %d svms 0x%p done\n", i, svms); } - if (atomic_cmpxchg(&svms->drain_pagefaults, drain, 0) != drain) - goto restart; } static void svm_range_deferred_list_work(struct work_struct *work) @@ -2298,17 +2337,8 @@ static void svm_range_deferred_list_work(struct work_struct *work) prange->start, prange->last, prange->work_item.op); mm = prange->work_item.mm; -retry: - mmap_write_lock(mm); - /* Checking for the need to drain retry faults must be inside - * mmap write lock to serialize with munmap notifiers. - */ - if (unlikely(atomic_read(&svms->drain_pagefaults))) { - mmap_write_unlock(mm); - svm_range_drain_retry_fault(svms); - goto retry; - } + mmap_write_lock(mm); /* Remove from deferred_list must be inside mmap write lock, for * two race cases: @@ -2429,6 +2459,17 @@ svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange, struct kfd_process *p; unsigned long s, l; bool unmap_parent; + uint32_t i; + + if (atomic_read(&prange->queue_refcount)) { + int r; + + pr_warn("Freeing queue vital buffer 0x%lx, queue evicted\n", + prange->start << PAGE_SHIFT); + r = kgd2kfd_quiesce_mm(mm, KFD_QUEUE_EVICTION_TRIGGER_SVM); + if (r) + pr_debug("failed %d to quiesce KFD queues\n", r); + } p = kfd_lookup_process_by_mm(mm); if (!p) @@ -2438,11 +2479,38 @@ svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange, pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n", svms, prange, prange->start, prange->last, start, last); - /* Make sure pending page faults are drained in the deferred worker - * before the range is freed to avoid straggler interrupts on - * unmapped memory causing "phantom faults". + /* calculate time stamps that are used to decide which page faults need be + * dropped or handled before unmap pages from gpu vm */ - atomic_inc(&svms->drain_pagefaults); + for_each_set_bit(i, svms->bitmap_supported, p->n_pdds) { + struct kfd_process_device *pdd; + struct amdgpu_device *adev; + struct amdgpu_ih_ring *ih; + uint32_t checkpoint_wptr; + + pdd = p->pdds[i]; + if (!pdd) + continue; + + adev = pdd->dev->adev; + + /* Check and drain ih1 ring if cam not available */ + if (adev->irq.ih1.ring_size) { + ih = &adev->irq.ih1; + checkpoint_wptr = amdgpu_ih_get_wptr(adev, ih); + if (ih->rptr != checkpoint_wptr) { + svms->checkpoint_ts[i] = + amdgpu_ih_decode_iv_ts(adev, ih, checkpoint_wptr, -1); + continue; + } + } + + /* check if dev->irq.ih_soft is not empty */ + ih = &adev->irq.ih_soft; + checkpoint_wptr = amdgpu_ih_get_wptr(adev, ih); + if (ih->rptr != checkpoint_wptr) + svms->checkpoint_ts[i] = amdgpu_ih_decode_iv_ts(adev, ih, checkpoint_wptr, -1); + } unmap_parent = start <= prange->start && last >= prange->last; @@ -2617,7 +2685,7 @@ svm_range_best_restore_location(struct svm_range *prange, return -1; } - if (node->adev->gmc.is_app_apu) + if (node->adev->flags & AMD_IS_APU) return 0; if (prange->preferred_loc == gpuid || @@ -2666,9 +2734,10 @@ svm_range_get_range_boundaries(struct kfd_process *p, int64_t addr, *is_heap_stack = vma_is_initial_heap(vma) || vma_is_initial_stack(vma); start_limit = max(vma->vm_start >> PAGE_SHIFT, - (unsigned long)ALIGN_DOWN(addr, 2UL << 8)); + (unsigned long)ALIGN_DOWN(addr, 1UL << p->svms.default_granularity)); end_limit = min(vma->vm_end >> PAGE_SHIFT, - (unsigned long)ALIGN(addr + 1, 2UL << 8)); + (unsigned long)ALIGN(addr + 1, 1UL << p->svms.default_granularity)); + /* First range that starts after the fault address */ node = interval_tree_iter_first(&p->svms.objects, addr + 1, ULONG_MAX); if (node) { @@ -2883,7 +2952,7 @@ svm_fault_allowed(struct vm_area_struct *vma, bool write_fault) int svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, uint32_t vmid, uint32_t node_id, - uint64_t addr, bool write_fault) + uint64_t addr, uint64_t ts, bool write_fault) { unsigned long start, last, size; struct mm_struct *mm = NULL; @@ -2893,7 +2962,7 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, ktime_t timestamp = ktime_get_boottime(); struct kfd_node *node; int32_t best_loc; - int32_t gpuidx = MAX_GPU_INSTANCE; + int32_t gpuid, gpuidx = MAX_GPU_INSTANCE; bool write_locked = false; struct vm_area_struct *vma; bool migration = false; @@ -2914,11 +2983,38 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, pr_debug("restoring svms 0x%p fault address 0x%llx\n", svms, addr); if (atomic_read(&svms->drain_pagefaults)) { - pr_debug("draining retry fault, drop fault 0x%llx\n", addr); + pr_debug("page fault handling disabled, drop fault 0x%llx\n", addr); r = 0; goto out; } + node = kfd_node_by_irq_ids(adev, node_id, vmid); + if (!node) { + pr_debug("kfd node does not exist node_id: %d, vmid: %d\n", node_id, + vmid); + r = -EFAULT; + goto out; + } + + if (kfd_process_gpuid_from_node(p, node, &gpuid, &gpuidx)) { + pr_debug("failed to get gpuid/gpuidex for node_id: %d\n", node_id); + r = -EFAULT; + goto out; + } + + /* check if this page fault time stamp is before svms->checkpoint_ts */ + if (svms->checkpoint_ts[gpuidx] != 0) { + if (amdgpu_ih_ts_after(ts, svms->checkpoint_ts[gpuidx])) { + pr_debug("draining retry fault, drop fault 0x%llx\n", addr); + r = 0; + goto out; + } else + /* ts is after svms->checkpoint_ts now, reset svms->checkpoint_ts + * to zero to avoid following ts wrap around give wrong comparing + */ + svms->checkpoint_ts[gpuidx] = 0; + } + if (!p->xnack_enabled) { pr_debug("XNACK not enabled for pasid 0x%x\n", pasid); r = -EFAULT; @@ -2935,13 +3031,6 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, goto out; } - node = kfd_node_by_irq_ids(adev, node_id, vmid); - if (!node) { - pr_debug("kfd node does not exist node_id: %d, vmid: %d\n", node_id, - vmid); - r = -EFAULT; - goto out; - } mmap_read_lock(mm); retry_write_locked: mutex_lock(&svms->lock); @@ -3026,8 +3115,6 @@ retry_write_locked: start = max_t(unsigned long, ALIGN_DOWN(addr, size), prange->start); last = min_t(unsigned long, ALIGN(addr + 1, size) - 1, prange->last); if (prange->actual_loc != 0 || best_loc != 0) { - migration = true; - if (best_loc) { r = svm_migrate_to_vram(prange, best_loc, start, last, mm, KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU); @@ -3050,7 +3137,9 @@ retry_write_locked: if (r) { pr_debug("failed %d to migrate svms %p [0x%lx 0x%lx]\n", r, svms, start, last); - goto out_unlock_range; + goto out_migrate_fail; + } else { + migration = true; } } @@ -3060,6 +3149,7 @@ retry_write_locked: pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to gpus\n", r, svms, start, last); +out_migrate_fail: kfd_smi_event_page_fault_end(node, p->lead_thread->pid, addr, migration); @@ -3156,8 +3246,9 @@ void svm_range_list_fini(struct kfd_process *p) /* * Ensure no retry fault comes in afterwards, as page fault handler will * not find kfd process and take mm lock to recover fault. + * stop kfd page fault handing, then wait pending page faults got drained */ - atomic_inc(&p->svms.drain_pagefaults); + atomic_set(&p->svms.drain_pagefaults, 1); svm_range_drain_retry_fault(&p->svms); list_for_each_entry_safe(prange, next, &p->svms.list, list) { @@ -3191,6 +3282,12 @@ int svm_range_list_init(struct kfd_process *p) if (KFD_IS_SVM_API_SUPPORTED(p->pdds[i]->dev->adev)) bitmap_set(svms->bitmap_supported, i, 1); + /* Value of default granularity cannot exceed 0x1B, the + * number of pages supported by a 4-level paging table + */ + svms->default_granularity = min_t(u8, amdgpu_svm_default_granularity, 0x1B); + pr_debug("Default SVM Granularity to use: %d\n", svms->default_granularity); + return 0; } @@ -3335,7 +3432,7 @@ svm_range_best_prefetch_location(struct svm_range *prange) goto out; } - if (bo_node->adev->gmc.is_app_apu) { + if (bo_node->adev->flags & AMD_IS_APU) { best_loc = 0; goto out; } @@ -3424,7 +3521,7 @@ svm_range_trigger_migration(struct mm_struct *mm, struct svm_range *prange, mm, KFD_MIGRATE_TRIGGER_PREFETCH); *migrated = !r; - return r; + return 0; } int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence) @@ -3718,7 +3815,7 @@ svm_range_get_attr(struct kfd_process *p, struct mm_struct *mm, node = interval_tree_iter_first(&svms->objects, start, last); if (!node) { pr_debug("range attrs not found return default values\n"); - svm_range_set_default_attributes(&location, &prefetch_loc, + svm_range_set_default_attributes(svms, &location, &prefetch_loc, &granularity, &flags_and); flags_or = flags_and; if (p->xnack_enabled) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h index 026863a0abcd..bddd24f04669 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h @@ -137,6 +137,7 @@ struct svm_range { DECLARE_BITMAP(bitmap_access, MAX_GPU_INSTANCE); DECLARE_BITMAP(bitmap_aip, MAX_GPU_INSTANCE); bool mapped_to_gpu; + atomic_t queue_refcount; }; static inline void svm_range_lock(struct svm_range *prange) @@ -173,7 +174,7 @@ int svm_range_vram_node_new(struct kfd_node *node, struct svm_range *prange, bool clear); void svm_range_vram_node_free(struct svm_range *prange); int svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, - uint32_t vmid, uint32_t node_id, uint64_t addr, + uint32_t vmid, uint32_t node_id, uint64_t addr, uint64_t ts, bool write_fault); int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence); void svm_range_add_list_work(struct svm_range_list *svms, @@ -201,7 +202,7 @@ void svm_range_list_lock_and_flush_work(struct svm_range_list *svms, struct mm_s * is initialized to not 0 when page migration register device memory. */ #define KFD_IS_SVM_API_SUPPORTED(adev) ((adev)->kfd.pgmap.type != 0 ||\ - (adev)->gmc.is_app_apu) + ((adev)->flags & AMD_IS_APU)) void svm_range_bo_unref_async(struct svm_range_bo *svm_bo); @@ -224,7 +225,7 @@ static inline void svm_range_list_fini(struct kfd_process *p) static inline int svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, uint32_t client_id, uint32_t node_id, - uint64_t addr, bool write_fault) + uint64_t addr, uint64_t ts, bool write_fault) { return -EFAULT; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index 6ed2ec381aaa..ceb9fb475ef1 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -31,6 +31,7 @@ #include <linux/log2.h> #include <linux/dmi.h> #include <linux/atomic.h> +#include <linux/crc16.h> #include "kfd_priv.h" #include "kfd_crat.h" @@ -291,6 +292,8 @@ static ssize_t iolink_show(struct kobject *kobj, struct attribute *attr, iolink->max_bandwidth); sysfs_show_32bit_prop(buffer, offs, "recommended_transfer_size", iolink->rec_transfer_size); + sysfs_show_32bit_prop(buffer, offs, "recommended_sdma_engine_id_mask", + iolink->rec_sdma_eng_id_mask); sysfs_show_32bit_prop(buffer, offs, "flags", iolink->flags); return offs; @@ -958,8 +961,7 @@ static void kfd_update_system_properties(void) dev = list_last_entry(&topology_device_list, struct kfd_topology_device, list); if (dev) { - sys_props.platform_id = - (*((uint64_t *)dev->oem_id)) & CRAT_OEMID_64BIT_MASK; + sys_props.platform_id = dev->oem_id64; sys_props.platform_oem = *((uint64_t *)dev->oem_table_id); sys_props.platform_rev = dev->oem_revision; } @@ -1092,14 +1094,17 @@ void kfd_topology_shutdown(void) static uint32_t kfd_generate_gpu_id(struct kfd_node *gpu) { - uint32_t hashout; + uint32_t gpu_id; uint32_t buf[8]; uint64_t local_mem_size; - int i; + struct kfd_topology_device *dev; + bool is_unique; + uint8_t *crc_buf; if (!gpu) return 0; + crc_buf = (uint8_t *)&buf; local_mem_size = gpu->local_mem_info.local_mem_size_private + gpu->local_mem_info.local_mem_size_public; buf[0] = gpu->adev->pdev->devfn; @@ -1112,10 +1117,34 @@ static uint32_t kfd_generate_gpu_id(struct kfd_node *gpu) buf[6] = upper_32_bits(local_mem_size); buf[7] = (ffs(gpu->xcc_mask) - 1) | (NUM_XCC(gpu->xcc_mask) << 16); - for (i = 0, hashout = 0; i < 8; i++) - hashout ^= hash_32(buf[i], KFD_GPU_ID_HASH_WIDTH); + gpu_id = crc16(0, crc_buf, sizeof(buf)) & + ((1 << KFD_GPU_ID_HASH_WIDTH) - 1); - return hashout; + /* There is a very small possibility when generating a + * 16 (KFD_GPU_ID_HASH_WIDTH) bit value from 8 word buffer + * that the value could be 0 or non-unique. So, check if + * it is unique and non-zero. If not unique increment till + * unique one is found. In case of overflow, restart from 1 + */ + + down_read(&topology_lock); + do { + is_unique = true; + if (!gpu_id) + gpu_id = 1; + list_for_each_entry(dev, &topology_device_list, list) { + if (dev->gpu && dev->gpu_id == gpu_id) { + is_unique = false; + break; + } + } + if (unlikely(!is_unique)) + gpu_id = (gpu_id + 1) & + ((1 << KFD_GPU_ID_HASH_WIDTH) - 1); + } while (!is_unique); + up_read(&topology_lock); + + return gpu_id; } /* kfd_assign_gpu - Attach @gpu to the correct kfd topology device. If * the GPU device is not already present in the topology device @@ -1238,6 +1267,54 @@ static void kfd_set_iolink_non_coherent(struct kfd_topology_device *to_dev, } } +#define REC_SDMA_NUM_GPU 8 +static const int rec_sdma_eng_map[REC_SDMA_NUM_GPU][REC_SDMA_NUM_GPU] = { + { -1, 14, 12, 2, 4, 8, 10, 6 }, + { 14, -1, 2, 10, 8, 4, 6, 12 }, + { 10, 2, -1, 12, 14, 6, 4, 8 }, + { 2, 12, 10, -1, 6, 14, 8, 4 }, + { 4, 8, 14, 6, -1, 10, 12, 2 }, + { 8, 4, 6, 14, 12, -1, 2, 10 }, + { 10, 6, 4, 8, 12, 2, -1, 14 }, + { 6, 12, 8, 4, 2, 10, 14, -1 }}; + +static void kfd_set_recommended_sdma_engines(struct kfd_topology_device *to_dev, + struct kfd_iolink_properties *outbound_link, + struct kfd_iolink_properties *inbound_link) +{ + struct kfd_node *gpu = outbound_link->gpu; + struct amdgpu_device *adev = gpu->adev; + int num_xgmi_nodes = adev->gmc.xgmi.num_physical_nodes; + bool support_rec_eng = !amdgpu_sriov_vf(adev) && to_dev->gpu && + adev->aid_mask && num_xgmi_nodes && gpu->kfd->num_nodes == 1 && + kfd_get_num_xgmi_sdma_engines(gpu) >= 14 && + (!(adev->flags & AMD_IS_APU) && num_xgmi_nodes == 8); + + if (support_rec_eng) { + int src_socket_id = adev->gmc.xgmi.physical_node_id; + int dst_socket_id = to_dev->gpu->adev->gmc.xgmi.physical_node_id; + + outbound_link->rec_sdma_eng_id_mask = + 1 << rec_sdma_eng_map[src_socket_id][dst_socket_id]; + inbound_link->rec_sdma_eng_id_mask = + 1 << rec_sdma_eng_map[dst_socket_id][src_socket_id]; + } else { + int num_sdma_eng = kfd_get_num_sdma_engines(gpu); + int i, eng_offset = 0; + + if (outbound_link->iolink_type == CRAT_IOLINK_TYPE_XGMI && + kfd_get_num_xgmi_sdma_engines(gpu) && to_dev->gpu) { + eng_offset = num_sdma_eng; + num_sdma_eng = kfd_get_num_xgmi_sdma_engines(gpu); + } + + for (i = 0; i < num_sdma_eng; i++) { + outbound_link->rec_sdma_eng_id_mask |= (1 << (i + eng_offset)); + inbound_link->rec_sdma_eng_id_mask |= (1 << (i + eng_offset)); + } + } +} + static void kfd_fill_iolink_non_crat_info(struct kfd_topology_device *dev) { struct kfd_iolink_properties *link, *inbound_link; @@ -1276,6 +1353,7 @@ static void kfd_fill_iolink_non_crat_info(struct kfd_topology_device *dev) inbound_link->flags = CRAT_IOLINK_FLAGS_ENABLED; kfd_set_iolink_no_atomics(peer_dev, dev, inbound_link); kfd_set_iolink_non_coherent(peer_dev, link, inbound_link); + kfd_set_recommended_sdma_engines(peer_dev, link, inbound_link); } } @@ -1564,6 +1642,7 @@ static int fill_in_l1_pcache(struct kfd_cache_properties **props_ext, pcache->processor_id_low = cu_processor_id + (first_active_cu - 1); pcache->cache_level = pcache_info[cache_type].cache_level; pcache->cache_size = pcache_info[cache_type].cache_size; + pcache->cacheline_size = pcache_info[cache_type].cache_line_size; if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_DATA_CACHE) pcache->cache_type |= HSA_CACHE_TYPE_DATA; @@ -1632,8 +1711,11 @@ static int fill_in_l2_l3_pcache(struct kfd_cache_properties **props_ext, pcache->processor_id_low = cu_processor_id + (first_active_cu - 1); pcache->cache_level = pcache_info[cache_type].cache_level; + pcache->cacheline_size = pcache_info[cache_type].cache_line_size; - if (KFD_GC_VERSION(knode) == IP_VERSION(9, 4, 3)) + if (KFD_GC_VERSION(knode) == IP_VERSION(9, 4, 3) || + KFD_GC_VERSION(knode) == IP_VERSION(9, 4, 4) || + KFD_GC_VERSION(knode) == IP_VERSION(9, 5, 0)) mode = adev->gmc.gmc_funcs->query_mem_partition_mode(adev); else mode = UNKNOWN_MEMORY_PARTITION_MODE; @@ -1695,7 +1777,7 @@ static void kfd_fill_cache_non_crat_info(struct kfd_topology_device *dev, struct struct amdgpu_cu_info *cu_info = &kdev->adev->gfx.cu_info; struct amdgpu_gfx_config *gfx_info = &kdev->adev->gfx.config; int gpu_processor_id; - struct kfd_cache_properties *props_ext; + struct kfd_cache_properties *props_ext = NULL; int num_of_entries = 0; int num_of_cache_types = 0; struct kfd_gpu_cache_info cache_info[KFD_MAX_CACHE_TYPES]; @@ -1703,6 +1785,7 @@ static void kfd_fill_cache_non_crat_info(struct kfd_topology_device *dev, struct gpu_processor_id = dev->node_props.simd_id_base; + memset(cache_info, 0, sizeof(cache_info)); pcache_info = cache_info; num_of_cache_types = kfd_get_gpu_cache_info(kdev, &pcache_info); if (!num_of_cache_types) { @@ -1769,7 +1852,7 @@ static void kfd_fill_cache_non_crat_info(struct kfd_topology_device *dev, struct pr_debug("Added [%d] GPU cache entries\n", num_of_entries); } -static int kfd_topology_add_device_locked(struct kfd_node *gpu, uint32_t gpu_id, +static int kfd_topology_add_device_locked(struct kfd_node *gpu, struct kfd_topology_device **dev) { int proximity_domain = ++topology_crat_proximity_domain; @@ -1782,8 +1865,7 @@ static int kfd_topology_add_device_locked(struct kfd_node *gpu, uint32_t gpu_id, COMPUTE_UNIT_GPU, gpu, proximity_domain); if (res) { - pr_err("Error creating VCRAT for GPU (ID: 0x%x)\n", - gpu_id); + dev_err(gpu->adev->dev, "Error creating VCRAT\n"); topology_crat_proximity_domain--; goto err; } @@ -1794,8 +1876,7 @@ static int kfd_topology_add_device_locked(struct kfd_node *gpu, uint32_t gpu_id, &temp_topology_device_list, proximity_domain); if (res) { - pr_err("Error parsing VCRAT for GPU (ID: 0x%x)\n", - gpu_id); + dev_err(gpu->adev->dev, "Error parsing VCRAT\n"); topology_crat_proximity_domain--; goto err; } @@ -1821,8 +1902,8 @@ static int kfd_topology_add_device_locked(struct kfd_node *gpu, uint32_t gpu_id, if (!res) sys_props.generation_count++; else - pr_err("Failed to update GPU (ID: 0x%x) to sysfs topology. res=%d\n", - gpu_id, res); + dev_err(gpu->adev->dev, "Failed to update GPU to sysfs topology. res=%d\n", + res); err: kfd_destroy_crat_image(crat_image); @@ -1905,7 +1986,8 @@ static void kfd_topology_set_capabilities(struct kfd_topology_device *dev) dev->node_props.debug_prop |= HSA_DBG_DISPATCH_INFO_ALWAYS_VALID; if (KFD_GC_VERSION(dev->gpu) < IP_VERSION(10, 0, 0)) { - if (KFD_GC_VERSION(dev->gpu) == IP_VERSION(9, 4, 3)) + if (KFD_GC_VERSION(dev->gpu) == IP_VERSION(9, 4, 3) || + KFD_GC_VERSION(dev->gpu) == IP_VERSION(9, 4, 4)) dev->node_props.debug_prop |= HSA_DBG_WATCH_ADDR_MASK_LO_BIT_GFX9_4_3 | HSA_DBG_WATCH_ADDR_MASK_HI_BIT_GFX9_4_3; @@ -1917,6 +1999,8 @@ static void kfd_topology_set_capabilities(struct kfd_topology_device *dev) if (KFD_GC_VERSION(dev->gpu) >= IP_VERSION(9, 4, 2)) dev->node_props.capability |= HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED; + + dev->node_props.capability |= HSA_CAP_PER_QUEUE_RESET_SUPPORTED; } else { dev->node_props.debug_prop |= HSA_DBG_WATCH_ADDR_MASK_LO_BIT_GFX10 | HSA_DBG_WATCH_ADDR_MASK_HI_BIT; @@ -1924,6 +2008,10 @@ static void kfd_topology_set_capabilities(struct kfd_topology_device *dev) if (KFD_GC_VERSION(dev->gpu) >= IP_VERSION(11, 0, 0)) dev->node_props.capability |= HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED; + + if (KFD_GC_VERSION(dev->gpu) >= IP_VERSION(12, 0, 0)) + dev->node_props.capability |= + HSA_CAP_TRAP_DEBUG_PRECISE_ALU_OPERATIONS_SUPPORTED; } kfd_topology_set_dbg_firmware_support(dev); @@ -1939,14 +2027,12 @@ int kfd_topology_add_device(struct kfd_node *gpu) struct amdgpu_gfx_config *gfx_info = &gpu->adev->gfx.config; struct amdgpu_cu_info *cu_info = &gpu->adev->gfx.cu_info; - gpu_id = kfd_generate_gpu_id(gpu); if (gpu->xcp && !gpu->xcp->ddev) { dev_warn(gpu->adev->dev, - "Won't add GPU (ID: 0x%x) to topology since it has no drm node assigned.", - gpu_id); + "Won't add GPU to topology since it has no drm node assigned."); return 0; } else { - pr_debug("Adding new GPU (ID: 0x%x) to topology\n", gpu_id); + dev_dbg(gpu->adev->dev, "Adding new GPU to topology\n"); } /* Check to see if this gpu device exists in the topology_device_list. @@ -1958,11 +2044,12 @@ int kfd_topology_add_device(struct kfd_node *gpu) down_write(&topology_lock); dev = kfd_assign_gpu(gpu); if (!dev) - res = kfd_topology_add_device_locked(gpu, gpu_id, &dev); + res = kfd_topology_add_device_locked(gpu, &dev); up_write(&topology_lock); if (res) return res; + gpu_id = kfd_generate_gpu_id(gpu); dev->gpu_id = gpu_id; gpu->id = gpu_id; @@ -1994,7 +2081,7 @@ int kfd_topology_add_device(struct kfd_node *gpu) HSA_CAP_ASIC_REVISION_MASK); dev->node_props.location_id = pci_dev_id(gpu->adev->pdev); - if (KFD_GC_VERSION(dev->gpu->kfd) == IP_VERSION(9, 4, 3)) + if (gpu->kfd->num_nodes > 1) dev->node_props.location_id |= dev->gpu->node_id; dev->node_props.domain = pci_domain_nr(gpu->adev->pdev->bus); @@ -2087,6 +2174,8 @@ int kfd_topology_add_device(struct kfd_node *gpu) dev->gpu->adev->gmc.xgmi.connected_to_cpu) dev->node_props.capability |= HSA_CAP_FLAGS_COHERENTHOSTACCESS; + kfd_queue_ctx_save_restore_size(dev); + kfd_debug_print_topology(); kfd_notify_gpu_change(gpu_id, 1); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h index 27386ce9a021..155b5c410af1 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h @@ -74,6 +74,10 @@ struct kfd_node_properties { uint32_t num_sdma_xgmi_engines; uint32_t num_sdma_queues_per_engine; uint32_t num_cp_queues; + uint32_t cwsr_size; + uint32_t ctl_stack_size; + uint32_t eop_buffer_size; + uint32_t debug_memory_size; char name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE]; }; @@ -121,6 +125,7 @@ struct kfd_iolink_properties { uint32_t min_bandwidth; uint32_t max_bandwidth; uint32_t rec_transfer_size; + uint32_t rec_sdma_eng_id_mask; uint32_t flags; struct kfd_node *gpu; struct kobject *kobj; @@ -154,7 +159,10 @@ struct kfd_topology_device { struct attribute attr_gpuid; struct attribute attr_name; struct attribute attr_props; - uint8_t oem_id[CRAT_OEMID_LENGTH]; + union { + uint8_t oem_id[CRAT_OEMID_LENGTH]; + uint64_t oem_id64; + }; uint8_t oem_table_id[CRAT_OEMTABLEID_LENGTH]; uint32_t oem_revision; }; diff --git a/drivers/gpu/drm/amd/amdkfd/soc15_int.h b/drivers/gpu/drm/amd/amdkfd/soc15_int.h index 10138676f27f..e5c0205f2618 100644 --- a/drivers/gpu/drm/amd/amdkfd/soc15_int.h +++ b/drivers/gpu/drm/amd/amdkfd/soc15_int.h @@ -29,6 +29,7 @@ #define SOC15_INTSRC_CP_BAD_OPCODE 183 #define SOC15_INTSRC_SQ_INTERRUPT_MSG 239 #define SOC15_INTSRC_VMC_FAULT 0 +#define SOC15_INTSRC_VMC_UTCL2_POISON 1 #define SOC15_INTSRC_SDMA_TRAP 224 #define SOC15_INTSRC_SDMA_ECC 220 #define SOC21_INTSRC_SDMA_TRAP 49 |