Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd')
55 files changed, 8020 insertions, 3038 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/Kconfig b/drivers/gpu/drm/amd/amdkfd/Kconfig
index d3c3d3ab7225..62e88e5362e9 100644
--- a/drivers/gpu/drm/amd/amdkfd/Kconfig
+++ b/drivers/gpu/drm/amd/amdkfd/Kconfig
@@ -5,7 +5,7 @@ config HSA_AMD
 	bool "HSA kernel driver for AMD GPU devices"
-	depends on DRM_AMDGPU && (X86_64 || ARM64 || PPC64)
+	depends on DRM_AMDGPU && (X86_64 || ARM64 || PPC64 || (RISCV && 64BIT))
 	select HMM_MIRROR
 	select MMU_NOTIFIER
 	select DRM_AMDGPU_USERPTR
diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile
index a5ae7bcf44eb..0ce08113c9f0 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -27,7 +27,6 @@ AMDKFD_FILES	:= $(AMDKFD_PATH)/kfd_module.o \
 		$(AMDKFD_PATH)/kfd_device.o \
 		$(AMDKFD_PATH)/kfd_chardev.o \
 		$(AMDKFD_PATH)/kfd_topology.o \
-		$(AMDKFD_PATH)/kfd_pasid.o \
 		$(AMDKFD_PATH)/kfd_doorbell.o \
 		$(AMDKFD_PATH)/kfd_flat_memory.o \
 		$(AMDKFD_PATH)/kfd_process.o \
@@ -38,6 +37,7 @@ AMDKFD_FILES	:= $(AMDKFD_PATH)/kfd_module.o \
 		$(AMDKFD_PATH)/kfd_mqd_manager_v9.o \
 		$(AMDKFD_PATH)/kfd_mqd_manager_v10.o \
 		$(AMDKFD_PATH)/kfd_mqd_manager_v11.o \
+		$(AMDKFD_PATH)/kfd_mqd_manager_v12.o \
 		$(AMDKFD_PATH)/kfd_kernel_queue.o \
 		$(AMDKFD_PATH)/kfd_packet_manager.o \
 		$(AMDKFD_PATH)/kfd_packet_manager_vi.o \
@@ -49,6 +49,7 @@ AMDKFD_FILES	:= $(AMDKFD_PATH)/kfd_module.o \
 		$(AMDKFD_PATH)/kfd_device_queue_manager_v9.o \
 		$(AMDKFD_PATH)/kfd_device_queue_manager_v10.o \
 		$(AMDKFD_PATH)/kfd_device_queue_manager_v11.o \
+		$(AMDKFD_PATH)/kfd_device_queue_manager_v12.o \
 		$(AMDKFD_PATH)/kfd_interrupt.o \
 		$(AMDKFD_PATH)/kfd_events.o \
 		$(AMDKFD_PATH)/cik_event_interrupt.o \
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 795382b55e0a..73acbe0b7c21 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -91,7 +91,6 @@ static void cik_event_interrupt_wq(struct kfd_node *dev,
 	const struct cik_ih_ring_entry *ihre =
 			(const struct cik_ih_ring_entry *)ih_ring_entry;
 	uint32_t context_id = ihre->data & 0xfffffff;
-	unsigned int vmid = (ihre->ring_id & 0x0000ff00) >> 8;
 	u32 pasid = (ihre->ring_id & 0xffff0000) >> 16;
 
 	if (pasid == 0)
@@ -107,20 +106,26 @@ static void cik_event_interrupt_wq(struct kfd_node *dev,
 		kfd_signal_hw_exception_event(pasid);
 	else if (ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT ||
 		ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
+		struct kfd_process_device *pdd = NULL;
 		struct kfd_vm_fault_info info;
+		struct kfd_process *p;
 
 		kfd_smi_event_update_vmfault(dev, pasid);
-		kfd_dqm_evict_pasid(dev->dqm, pasid);
+		p = kfd_lookup_process_by_pasid(pasid, &pdd);
+		if (!pdd)
+			return;
+
+		kfd_evict_process_device(pdd);
 
 		memset(&info, 0, sizeof(info));
 		amdgpu_amdkfd_gpuvm_get_vm_fault_info(dev->adev, &info);
-		if (!info.page_addr && !info.status)
+		if (!info.page_addr && !info.status) {
+			kfd_unref_process(p);
 			return;
+		}
 
-		if (info.vmid == vmid)
-			kfd_signal_vm_fault_event(dev, pasid, &info, NULL);
-		else
-			kfd_signal_vm_fault_event(dev, pasid, NULL, NULL);
+		kfd_signal_vm_fault_event(pdd, &info, NULL);
+		kfd_unref_process(p);
 	}
 }
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
index 5a0308d26b53..0320163b6e74 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
@@ -274,7 +274,7 @@ static const uint32_t cwsr_trap_gfx8_hex[] = {
 static const uint32_t
cwsr_trap_gfx9_hex[] = { - 0xbf820001, 0xbf820258, + 0xbf820001, 0xbf820259, 0xb8f8f802, 0x8978ff78, 0x00020006, 0xb8fbf803, 0x866eff78, 0x00002000, @@ -390,141 +390,98 @@ static const uint32_t cwsr_trap_gfx9_hex[] = { 0xbefe007c, 0xbefc0070, 0xc0611c7a, 0x0000007c, 0xbf8cc07f, 0x80708470, - 0xbefc007e, 0x867aff7f, - 0x04000000, 0xbeef0080, - 0x876f6f7a, 0xb8f02a05, - 0x80708170, 0x8e708a70, - 0xb8fb1605, 0x807b817b, - 0x8e7b847b, 0x8e76827b, - 0xbef600ff, 0x01000000, - 0xbef20174, 0x80747074, - 0x82758075, 0xbefc0080, - 0xbf800000, 0xbe802b00, - 0xbe822b02, 0xbe842b04, - 0xbe862b06, 0xbe882b08, - 0xbe8a2b0a, 0xbe8c2b0c, - 0xbe8e2b0e, 0xc06b003a, - 0x00000000, 0xbf8cc07f, - 0xc06b013a, 0x00000010, - 0xbf8cc07f, 0xc06b023a, - 0x00000020, 0xbf8cc07f, - 0xc06b033a, 0x00000030, - 0xbf8cc07f, 0x8074c074, - 0x82758075, 0x807c907c, - 0xbf0a7b7c, 0xbf85ffe7, - 0xbef40172, 0xbef00080, - 0xbefe00c1, 0xbeff00c1, - 0xbee80080, 0xbee90080, - 0xbef600ff, 0x01000000, - 0x867aff78, 0x00400000, - 0xbf850003, 0xb8faf803, - 0x897a7aff, 0x10000000, - 0xbf85004d, 0xbe840080, - 0xd2890000, 0x00000900, - 0x80048104, 0xd2890001, - 0x00000900, 0x80048104, - 0xd2890002, 0x00000900, - 0x80048104, 0xd2890003, - 0x00000900, 0x80048104, - 0xc069003a, 0x00000070, - 0xbf8cc07f, 0x80709070, - 0xbf06c004, 0xbf84ffee, + 0xbefc007e, 0xbf108080, + 0x867aff7f, 0x04000000, + 0xbeef0080, 0x876f6f7a, + 0xb8f02a05, 0x80708170, + 0x8e708a70, 0xb8fb1605, + 0x807b817b, 0x8e7b847b, + 0x8e76827b, 0xbef600ff, + 0x01000000, 0xbef20174, + 0x80747074, 0x82758075, + 0xbefc0080, 0xbf800000, + 0xbe802b00, 0xbe822b02, + 0xbe842b04, 0xbe862b06, + 0xbe882b08, 0xbe8a2b0a, + 0xbe8c2b0c, 0xbe8e2b0e, + 0xc06b003a, 0x00000000, + 0xbf8cc07f, 0xc06b013a, + 0x00000010, 0xbf8cc07f, + 0xc06b023a, 0x00000020, + 0xbf8cc07f, 0xc06b033a, + 0x00000030, 0xbf8cc07f, + 0x8074c074, 0x82758075, + 0x807c907c, 0xbf0a7b7c, + 0xbf85ffe7, 0xbef40172, + 0xbef00080, 0xbefe00c1, + 0xbeff00c1, 0xbee80080, + 0xbee90080, 0xbef600ff, + 0x01000000, 0x867aff78, + 0x00400000, 0xbf850003, + 0xb8faf803, 0x897a7aff, + 0x10000000, 0xbf85004d, 0xbe840080, 0xd2890000, - 0x00000901, 0x80048104, - 0xd2890001, 0x00000901, + 0x00000900, 0x80048104, + 0xd2890001, 0x00000900, 0x80048104, 0xd2890002, - 0x00000901, 0x80048104, - 0xd2890003, 0x00000901, + 0x00000900, 0x80048104, + 0xd2890003, 0x00000900, 0x80048104, 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, 0xbf84ffee, 0xbe840080, - 0xd2890000, 0x00000902, + 0xd2890000, 0x00000901, 0x80048104, 0xd2890001, - 0x00000902, 0x80048104, - 0xd2890002, 0x00000902, + 0x00000901, 0x80048104, + 0xd2890002, 0x00000901, 0x80048104, 0xd2890003, - 0x00000902, 0x80048104, + 0x00000901, 0x80048104, 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, 0xbf84ffee, 0xbe840080, 0xd2890000, - 0x00000903, 0x80048104, - 0xd2890001, 0x00000903, + 0x00000902, 0x80048104, + 0xd2890001, 0x00000902, 0x80048104, 0xd2890002, - 0x00000903, 0x80048104, - 0xd2890003, 0x00000903, + 0x00000902, 0x80048104, + 0xd2890003, 0x00000902, 0x80048104, 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, - 0xbf84ffee, 0xbf820008, - 0xe0724000, 0x701d0000, - 0xe0724100, 0x701d0100, - 0xe0724200, 0x701d0200, - 0xe0724300, 0x701d0300, - 0xbefe00c1, 0xbeff00c1, - 0xb8fb4306, 0x867bc17b, - 0xbf840063, 0xbf8a0000, - 0x867aff6f, 0x04000000, - 0xbf84005f, 0x8e7b867b, - 0x8e7b827b, 0xbef6007b, - 0xb8f02a05, 0x80708170, - 0x8e708a70, 0xb8fa1605, - 0x807a817a, 0x8e7a867a, - 0x80707a70, 0x8070ff70, - 0x00000080, 0xbef600ff, - 0x01000000, 0xbefc0080, - 0xd28c0002, 
0x000100c1, - 0xd28d0003, 0x000204c1, - 0x867aff78, 0x00400000, - 0xbf850003, 0xb8faf803, - 0x897a7aff, 0x10000000, - 0xbf850030, 0x24040682, - 0xd86e4000, 0x00000002, - 0xbf8cc07f, 0xbe840080, - 0xd2890000, 0x00000900, + 0xbf84ffee, 0xbe840080, + 0xd2890000, 0x00000903, 0x80048104, 0xd2890001, - 0x00000900, 0x80048104, - 0xd2890002, 0x00000900, + 0x00000903, 0x80048104, + 0xd2890002, 0x00000903, 0x80048104, 0xd2890003, - 0x00000900, 0x80048104, + 0x00000903, 0x80048104, 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, 0xbf84ffee, - 0xbe840080, 0xd2890000, - 0x00000901, 0x80048104, - 0xd2890001, 0x00000901, - 0x80048104, 0xd2890002, - 0x00000901, 0x80048104, - 0xd2890003, 0x00000901, - 0x80048104, 0xc069003a, - 0x00000070, 0xbf8cc07f, - 0x80709070, 0xbf06c004, - 0xbf84ffee, 0x680404ff, - 0x00000200, 0xd0c9006a, - 0x0000f702, 0xbf87ffd2, - 0xbf820015, 0xd1060002, - 0x00011103, 0x7e0602ff, - 0x00000200, 0xbefc00ff, - 0x00010000, 0xbe800077, - 0x8677ff77, 0xff7fffff, - 0x8777ff77, 0x00058000, - 0xd8ec0000, 0x00000002, - 0xbf8cc07f, 0xe0765000, - 0x701d0002, 0x68040702, - 0xd0c9006a, 0x0000f702, - 0xbf87fff7, 0xbef70000, - 0xbef000ff, 0x00000400, - 0xbefe00c1, 0xbeff00c1, - 0xb8fb2a05, 0x807b817b, - 0x8e7b827b, 0xbef600ff, - 0x01000000, 0xbefc0084, - 0xbf0a7b7c, 0xbf84006d, - 0xbf11017c, 0x807bff7b, - 0x00001000, 0x867aff78, + 0xbf820008, 0xe0724000, + 0x701d0000, 0xe0724100, + 0x701d0100, 0xe0724200, + 0x701d0200, 0xe0724300, + 0x701d0300, 0xbefe00c1, + 0xbeff00c1, 0xb8fb4306, + 0x867bc17b, 0xbf840063, + 0xbf8a0000, 0x867aff6f, + 0x04000000, 0xbf84005f, + 0x8e7b867b, 0x8e7b827b, + 0xbef6007b, 0xb8f02a05, + 0x80708170, 0x8e708a70, + 0xb8fa1605, 0x807a817a, + 0x8e7a867a, 0x80707a70, + 0x8070ff70, 0x00000080, + 0xbef600ff, 0x01000000, + 0xbefc0080, 0xd28c0002, + 0x000100c1, 0xd28d0003, + 0x000204c1, 0x867aff78, 0x00400000, 0xbf850003, 0xb8faf803, 0x897a7aff, - 0x10000000, 0xbf850051, + 0x10000000, 0xbf850030, + 0x24040682, 0xd86e4000, + 0x00000002, 0xbf8cc07f, 0xbe840080, 0xd2890000, 0x00000900, 0x80048104, 0xd2890001, 0x00000900, @@ -544,141 +501,185 @@ static const uint32_t cwsr_trap_gfx9_hex[] = { 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, 0xbf84ffee, + 0x680404ff, 0x00000200, + 0xd0c9006a, 0x0000f702, + 0xbf87ffd2, 0xbf820015, + 0xd1060002, 0x00011103, + 0x7e0602ff, 0x00000200, + 0xbefc00ff, 0x00010000, + 0xbe800077, 0x8677ff77, + 0xff7fffff, 0x8777ff77, + 0x00058000, 0xd8ec0000, + 0x00000002, 0xbf8cc07f, + 0xe0765000, 0x701d0002, + 0x68040702, 0xd0c9006a, + 0x0000f702, 0xbf87fff7, + 0xbef70000, 0xbef000ff, + 0x00000400, 0xbefe00c1, + 0xbeff00c1, 0xb8fb2a05, + 0x807b817b, 0x8e7b827b, + 0xbef600ff, 0x01000000, + 0xbefc0084, 0xbf0a7b7c, + 0xbf84006d, 0xbf11017c, + 0x807bff7b, 0x00001000, + 0x867aff78, 0x00400000, + 0xbf850003, 0xb8faf803, + 0x897a7aff, 0x10000000, + 0xbf850051, 0xbe840080, + 0xd2890000, 0x00000900, + 0x80048104, 0xd2890001, + 0x00000900, 0x80048104, + 0xd2890002, 0x00000900, + 0x80048104, 0xd2890003, + 0x00000900, 0x80048104, + 0xc069003a, 0x00000070, + 0xbf8cc07f, 0x80709070, + 0xbf06c004, 0xbf84ffee, 0xbe840080, 0xd2890000, - 0x00000902, 0x80048104, - 0xd2890001, 0x00000902, + 0x00000901, 0x80048104, + 0xd2890001, 0x00000901, 0x80048104, 0xd2890002, - 0x00000902, 0x80048104, - 0xd2890003, 0x00000902, + 0x00000901, 0x80048104, + 0xd2890003, 0x00000901, 0x80048104, 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, 0xbf84ffee, 0xbe840080, - 0xd2890000, 0x00000903, + 0xd2890000, 0x00000902, 0x80048104, 0xd2890001, - 0x00000903, 0x80048104, - 
0xd2890002, 0x00000903, + 0x00000902, 0x80048104, + 0xd2890002, 0x00000902, 0x80048104, 0xd2890003, - 0x00000903, 0x80048104, + 0x00000902, 0x80048104, 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, 0xbf84ffee, - 0x807c847c, 0xbf0a7b7c, - 0xbf85ffb1, 0xbf9c0000, - 0xbf820012, 0x7e000300, - 0x7e020301, 0x7e040302, - 0x7e060303, 0xe0724000, - 0x701d0000, 0xe0724100, - 0x701d0100, 0xe0724200, - 0x701d0200, 0xe0724300, - 0x701d0300, 0x807c847c, - 0x8070ff70, 0x00000400, - 0xbf0a7b7c, 0xbf85ffef, - 0xbf9c0000, 0xbf8200c7, - 0xbef4007e, 0x8675ff7f, - 0x0000ffff, 0x8775ff75, - 0x00040000, 0xbef60080, - 0xbef700ff, 0x00807fac, - 0x866eff7f, 0x04000000, - 0xbf84001e, 0xbefe00c1, - 0xbeff00c1, 0xb8ef4306, - 0x866fc16f, 0xbf840019, - 0x8e6f866f, 0x8e6f826f, - 0xbef6006f, 0xb8f82a05, - 0x80788178, 0x8e788a78, - 0xb8ee1605, 0x806e816e, - 0x8e6e866e, 0x80786e78, - 0x8078ff78, 0x00000080, - 0xbef600ff, 0x01000000, - 0xbefc0080, 0xe0510000, - 0x781d0000, 0xe0510100, - 0x781d0000, 0x807cff7c, - 0x00000200, 0x8078ff78, - 0x00000200, 0xbf0a6f7c, - 0xbf85fff6, 0xbefe00c1, - 0xbeff00c1, 0xbef600ff, - 0x01000000, 0xb8ef2a05, - 0x806f816f, 0x8e6f826f, - 0x806fff6f, 0x00008000, - 0xbef80080, 0xbeee0078, - 0x8078ff78, 0x00000400, - 0xbefc0084, 0xbf11087c, - 0xe0524000, 0x781d0000, - 0xe0524100, 0x781d0100, - 0xe0524200, 0x781d0200, - 0xe0524300, 0x781d0300, - 0xbf8c0f70, 0x7e000300, - 0x7e020301, 0x7e040302, - 0x7e060303, 0x807c847c, - 0x8078ff78, 0x00000400, - 0xbf0a6f7c, 0xbf85ffee, - 0xbf9c0000, 0xe0524000, - 0x6e1d0000, 0xe0524100, - 0x6e1d0100, 0xe0524200, - 0x6e1d0200, 0xe0524300, - 0x6e1d0300, 0xbf8c0f70, + 0xbe840080, 0xd2890000, + 0x00000903, 0x80048104, + 0xd2890001, 0x00000903, + 0x80048104, 0xd2890002, + 0x00000903, 0x80048104, + 0xd2890003, 0x00000903, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0x807c847c, + 0xbf0a7b7c, 0xbf85ffb1, + 0xbf9c0000, 0xbf820012, + 0x7e000300, 0x7e020301, + 0x7e040302, 0x7e060303, + 0xe0724000, 0x701d0000, + 0xe0724100, 0x701d0100, + 0xe0724200, 0x701d0200, + 0xe0724300, 0x701d0300, + 0x807c847c, 0x8070ff70, + 0x00000400, 0xbf0a7b7c, + 0xbf85ffef, 0xbf9c0000, + 0xbf8200c7, 0xbef4007e, + 0x8675ff7f, 0x0000ffff, + 0x8775ff75, 0x00040000, + 0xbef60080, 0xbef700ff, + 0x00807fac, 0x866eff7f, + 0x04000000, 0xbf84001e, + 0xbefe00c1, 0xbeff00c1, + 0xb8ef4306, 0x866fc16f, + 0xbf840019, 0x8e6f866f, + 0x8e6f826f, 0xbef6006f, 0xb8f82a05, 0x80788178, 0x8e788a78, 0xb8ee1605, 0x806e816e, 0x8e6e866e, - 0x80786e78, 0x80f8c078, - 0xb8ef1605, 0x806f816f, - 0x8e6f846f, 0x8e76826f, + 0x80786e78, 0x8078ff78, + 0x00000080, 0xbef600ff, + 0x01000000, 0xbefc0080, + 0xe0510000, 0x781d0000, + 0xe0510100, 0x781d0000, + 0x807cff7c, 0x00000200, + 0x8078ff78, 0x00000200, + 0xbf0a6f7c, 0xbf85fff6, + 0xbefe00c1, 0xbeff00c1, 0xbef600ff, 0x01000000, - 0xbefc006f, 0xc031003a, - 0x00000078, 0x80f8c078, - 0xbf8cc07f, 0x80fc907c, - 0xbf800000, 0xbe802d00, - 0xbe822d02, 0xbe842d04, - 0xbe862d06, 0xbe882d08, - 0xbe8a2d0a, 0xbe8c2d0c, - 0xbe8e2d0e, 0xbf06807c, - 0xbf84fff0, 0xb8f82a05, + 0xb8ef2a05, 0x806f816f, + 0x8e6f826f, 0x806fff6f, + 0x00008000, 0xbef80080, + 0xbeee0078, 0x8078ff78, + 0x00000400, 0xbefc0084, + 0xbf11087c, 0xe0524000, + 0x781d0000, 0xe0524100, + 0x781d0100, 0xe0524200, + 0x781d0200, 0xe0524300, + 0x781d0300, 0xbf8c0f70, + 0x7e000300, 0x7e020301, + 0x7e040302, 0x7e060303, + 0x807c847c, 0x8078ff78, + 0x00000400, 0xbf0a6f7c, + 0xbf85ffee, 0xbf9c0000, + 0xe0524000, 0x6e1d0000, + 0xe0524100, 0x6e1d0100, + 0xe0524200, 0x6e1d0200, + 
0xe0524300, 0x6e1d0300, + 0xbf8c0f70, 0xb8f82a05, 0x80788178, 0x8e788a78, 0xb8ee1605, 0x806e816e, 0x8e6e866e, 0x80786e78, - 0xbef60084, 0xbef600ff, - 0x01000000, 0xc0211bfa, + 0x80f8c078, 0xb8ef1605, + 0x806f816f, 0x8e6f846f, + 0x8e76826f, 0xbef600ff, + 0x01000000, 0xbefc006f, + 0xc031003a, 0x00000078, + 0x80f8c078, 0xbf8cc07f, + 0x80fc907c, 0xbf800000, + 0xbe802d00, 0xbe822d02, + 0xbe842d04, 0xbe862d06, + 0xbe882d08, 0xbe8a2d0a, + 0xbe8c2d0c, 0xbe8e2d0e, + 0xbf06807c, 0xbf84fff0, + 0xb8f82a05, 0x80788178, + 0x8e788a78, 0xb8ee1605, + 0x806e816e, 0x8e6e866e, + 0x80786e78, 0xbef60084, + 0xbef600ff, 0x01000000, + 0xc0211bfa, 0x00000078, + 0x80788478, 0xc0211b3a, 0x00000078, 0x80788478, - 0xc0211b3a, 0x00000078, - 0x80788478, 0xc0211b7a, + 0xc0211b7a, 0x00000078, + 0x80788478, 0xc0211c3a, 0x00000078, 0x80788478, - 0xc0211c3a, 0x00000078, - 0x80788478, 0xc0211c7a, + 0xc0211c7a, 0x00000078, + 0x80788478, 0xc0211eba, 0x00000078, 0x80788478, - 0xc0211eba, 0x00000078, - 0x80788478, 0xc0211efa, + 0xc0211efa, 0x00000078, + 0x80788478, 0xc0211a3a, 0x00000078, 0x80788478, - 0xc0211a3a, 0x00000078, - 0x80788478, 0xc0211a7a, + 0xc0211a7a, 0x00000078, + 0x80788478, 0xc0211cfa, 0x00000078, 0x80788478, - 0xc0211cfa, 0x00000078, - 0x80788478, 0xbf8cc07f, - 0xbefc006f, 0xbefe0070, - 0xbeff0071, 0x866f7bff, - 0x000003ff, 0xb96f4803, - 0x866f7bff, 0xfffff800, - 0x8f6f8b6f, 0xb96fa2c3, - 0xb973f801, 0xb8ee2a05, - 0x806e816e, 0x8e6e8a6e, - 0xb8ef1605, 0x806f816f, - 0x8e6f866f, 0x806e6f6e, - 0x806e746e, 0x826f8075, - 0x866fff6f, 0x0000ffff, - 0xc00b1c37, 0x00000050, - 0xc00b1d37, 0x00000060, - 0xc0031e77, 0x00000074, - 0xbf8cc07f, 0x8f6e8b77, - 0x866eff6e, 0x001f8000, - 0xb96ef807, 0x866dff6d, - 0x0000ffff, 0x86fe7e7e, - 0x86ea6a6a, 0x8f6e837a, - 0xb96ee0c2, 0xbf800002, - 0xb97a0002, 0xbf8a0000, - 0xbe801f6c, 0xbf9b0000, + 0xbf8cc07f, 0xbefc006f, + 0xbefe0070, 0xbeff0071, + 0x866f7bff, 0x000003ff, + 0xb96f4803, 0x866f7bff, + 0xfffff800, 0x8f6f8b6f, + 0xb96fa2c3, 0xb973f801, + 0xb8ee2a05, 0x806e816e, + 0x8e6e8a6e, 0xb8ef1605, + 0x806f816f, 0x8e6f866f, + 0x806e6f6e, 0x806e746e, + 0x826f8075, 0x866fff6f, + 0x0000ffff, 0xc00b1c37, + 0x00000050, 0xc00b1d37, + 0x00000060, 0xc0031e77, + 0x00000074, 0xbf8cc07f, + 0x8f6e8b77, 0x866eff6e, + 0x001f8000, 0xb96ef807, + 0x866dff6d, 0x0000ffff, + 0x86fe7e7e, 0x86ea6a6a, + 0x8f6e837a, 0xb96ee0c2, + 0xbf800002, 0xb97a0002, + 0xbf8a0000, 0xbe801f6c, + 0xbf9b0000, 0x00000000, }; static const uint32_t cwsr_trap_nv1x_hex[] = { - 0xbf820001, 0xbf820394, + 0xbf820001, 0xbf820393, 0xb0804004, 0xb978f802, 0x8a78ff78, 0x00020006, 0xb97bf803, 0x876eff78, @@ -711,19 +712,19 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0xbf0d8f7b, 0xbf840002, 0x887bff7b, 0xffff0000, 0xf4011bbd, 0xfa000010, - 0xbf8cc07f, 0x8f6e976e, + 0xbf8c0000, 0x8f6e976e, 0x8a77ff77, 0x00800000, 0x88776e77, 0xf4051bbd, - 0xfa000000, 0xbf8cc07f, + 0xfa000000, 0xbf8c0000, 0xf4051ebd, 0xfa000008, - 0xbf8cc07f, 0x87ee6e6e, + 0xbf8c0000, 0x87ee6e6e, 0xbf840001, 0xbe80206e, - 0x876eff6d, 0x01ff0000, - 0xbf850005, 0x8878ff78, - 0x00002000, 0x80ec886c, - 0x82ed806d, 0xbf820005, - 0x876eff6d, 0x01000000, - 0xbf850002, 0x806c846c, + 0x876eff6d, 0x00ff0000, + 0xbf850008, 0x876eff6d, + 0x01000000, 0xbf850007, + 0x8878ff78, 0x00002000, + 0x80ec886c, 0x82ed806d, + 0xbf820002, 0x806c846c, 0x826d806d, 0x876dff6d, 0x0000ffff, 0x907a8977, 0x877bff7a, 0x003f8000, @@ -932,23 +933,48 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0xbf850002, 0xbeff0380, 0xbf820001, 0xbeff03c1, 0xb97b4306, 0x877bc17b, - 0xbf840086, 0xbf8a0000, + 
0xbf840085, 0xbf8a0000, 0x877aff6d, 0x80000000, - 0xbf840082, 0x8f7b867b, - 0x8f7b827b, 0xbef6037b, - 0xb9703a05, 0x80708170, - 0xbf0d9973, 0xbf850002, - 0x8f708970, 0xbf820001, - 0x8f708a70, 0xb97a1e06, - 0x8f7a8a7a, 0x80707a70, - 0x8070ff70, 0x00000200, - 0x8070ff70, 0x00000080, - 0xbef603ff, 0x01000000, - 0xd7650000, 0x000100c1, - 0xd7660000, 0x000200c1, - 0x16000084, 0x907c9973, - 0x877c817c, 0xbf06817c, - 0xbefc0380, 0xbf850033, + 0xbf840081, 0x8f7b887b, + 0xbef6037b, 0xb9703a05, + 0x80708170, 0xbf0d9973, + 0xbf850002, 0x8f708970, + 0xbf820001, 0x8f708a70, + 0xb97a1e06, 0x8f7a8a7a, + 0x80707a70, 0x8070ff70, + 0x00000200, 0x8070ff70, + 0x00000080, 0xbef603ff, + 0x01000000, 0xd7650000, + 0x000100c1, 0xd7660000, + 0x000200c1, 0x16000084, + 0x907c9973, 0x877c817c, + 0xbf06817c, 0xbefc0380, + 0xbf850033, 0xb97af803, + 0x8a7a7aff, 0x10000000, + 0xbf85001d, 0xd8d80000, + 0x01000000, 0xbf8c0000, + 0xbe840380, 0xd7600000, + 0x00000901, 0x80048104, + 0xd7600001, 0x00000901, + 0x80048104, 0xd7600002, + 0x00000901, 0x80048104, + 0xd7600003, 0x00000901, + 0x80048104, 0xf469003a, + 0xe0000000, 0x80709070, + 0xbf06a004, 0xbf84ffef, + 0x807cff7c, 0x00000080, + 0xd5250000, 0x0001ff00, + 0x00000080, 0xbf0a7b7c, + 0xbf85ffe4, 0xbf820044, + 0xbe8303ff, 0x00000080, + 0xbf800000, 0xbf800000, + 0xbf800000, 0xd8d80000, + 0x01000000, 0xbf8c0000, + 0xe0704000, 0x705d0100, + 0x807c037c, 0x80700370, + 0xd5250000, 0x0001ff00, + 0x00000080, 0xbf0a7b7c, + 0xbf85fff4, 0xbf820032, 0xb97af803, 0x8a7a7aff, 0x10000000, 0xbf85001d, 0xd8d80000, 0x01000000, @@ -960,24 +986,45 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0x80048104, 0xd7600003, 0x00000901, 0x80048104, 0xf469003a, 0xe0000000, - 0x80709070, 0xbf06a004, + 0x80709070, 0xbf06c004, 0xbf84ffef, 0x807cff7c, - 0x00000080, 0xd5250000, - 0x0001ff00, 0x00000080, + 0x00000100, 0xd5250000, + 0x0001ff00, 0x00000100, 0xbf0a7b7c, 0xbf85ffe4, - 0xbf820044, 0xbe8303ff, - 0x00000080, 0xbf800000, + 0xbf820011, 0xbe8303ff, + 0x00000100, 0xbf800000, 0xbf800000, 0xbf800000, 0xd8d80000, 0x01000000, 0xbf8c0000, 0xe0704000, 0x705d0100, 0x807c037c, 0x80700370, 0xd5250000, - 0x0001ff00, 0x00000080, + 0x0001ff00, 0x00000100, 0xbf0a7b7c, 0xbf85fff4, - 0xbf820032, 0xb97af803, - 0x8a7a7aff, 0x10000000, - 0xbf85001d, 0xd8d80000, - 0x01000000, 0xbf8c0000, + 0xbefe03c1, 0x907c9973, + 0x877c817c, 0xbf06817c, + 0xbf850004, 0xbef003ff, + 0x00000200, 0xbeff0380, + 0xbf820003, 0xbef003ff, + 0x00000400, 0xbeff03c1, + 0xb97b3a05, 0x807b817b, + 0x8f7b827b, 0x907c9973, + 0x877c817c, 0xbf06817c, + 0xbf85006b, 0xbef603ff, + 0x01000000, 0xbefc0384, + 0xbf0a7b7c, 0xbf8400fa, + 0xb97af803, 0x8a7a7aff, + 0x10000000, 0xbf850050, + 0x7e008700, 0x7e028701, + 0x7e048702, 0x7e068703, + 0xbe840380, 0xd7600000, + 0x00000900, 0x80048104, + 0xd7600001, 0x00000900, + 0x80048104, 0xd7600002, + 0x00000900, 0x80048104, + 0xd7600003, 0x00000900, + 0x80048104, 0xf469003a, + 0xe0000000, 0x80709070, + 0xbf06a004, 0xbf84ffef, 0xbe840380, 0xd7600000, 0x00000901, 0x80048104, 0xd7600001, 0x00000901, @@ -986,32 +1033,39 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0xd7600003, 0x00000901, 0x80048104, 0xf469003a, 0xe0000000, 0x80709070, - 0xbf06c004, 0xbf84ffef, - 0x807cff7c, 0x00000100, - 0xd5250000, 0x0001ff00, - 0x00000100, 0xbf0a7b7c, - 0xbf85ffe4, 0xbf820011, - 0xbe8303ff, 0x00000100, - 0xbf800000, 0xbf800000, - 0xbf800000, 0xd8d80000, - 0x01000000, 0xbf8c0000, - 0xe0704000, 0x705d0100, - 0x807c037c, 0x80700370, - 0xd5250000, 0x0001ff00, - 0x00000100, 0xbf0a7b7c, - 0xbf85fff4, 0xbefe03c1, - 0x907c9973, 0x877c817c, - 
0xbf06817c, 0xbf850004, - 0xbef003ff, 0x00000200, - 0xbeff0380, 0xbf820003, - 0xbef003ff, 0x00000400, - 0xbeff03c1, 0xb97b3a05, - 0x807b817b, 0x8f7b827b, - 0x907c9973, 0x877c817c, - 0xbf06817c, 0xbf85006b, + 0xbf06a004, 0xbf84ffef, + 0xbe840380, 0xd7600000, + 0x00000902, 0x80048104, + 0xd7600001, 0x00000902, + 0x80048104, 0xd7600002, + 0x00000902, 0x80048104, + 0xd7600003, 0x00000902, + 0x80048104, 0xf469003a, + 0xe0000000, 0x80709070, + 0xbf06a004, 0xbf84ffef, + 0xbe840380, 0xd7600000, + 0x00000903, 0x80048104, + 0xd7600001, 0x00000903, + 0x80048104, 0xd7600002, + 0x00000903, 0x80048104, + 0xd7600003, 0x00000903, + 0x80048104, 0xf469003a, + 0xe0000000, 0x80709070, + 0xbf06a004, 0xbf84ffef, + 0x807c847c, 0xbf0a7b7c, + 0xbf85ffb1, 0xbf8200a6, + 0x7e008700, 0x7e028701, + 0x7e048702, 0x7e068703, + 0xe0704000, 0x705d0000, + 0xe0704080, 0x705d0100, + 0xe0704100, 0x705d0200, + 0xe0704180, 0x705d0300, + 0x807c847c, 0x8070ff70, + 0x00000200, 0xbf0a7b7c, + 0xbf85ffef, 0xbf820094, 0xbef603ff, 0x01000000, 0xbefc0384, 0xbf0a7b7c, - 0xbf8400fa, 0xb97af803, + 0xbf840065, 0xb97af803, 0x8a7a7aff, 0x10000000, 0xbf850050, 0x7e008700, 0x7e028701, 0x7e048702, @@ -1023,7 +1077,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0x80048104, 0xd7600003, 0x00000900, 0x80048104, 0xf469003a, 0xe0000000, - 0x80709070, 0xbf06a004, + 0x80709070, 0xbf06c004, 0xbf84ffef, 0xbe840380, 0xd7600000, 0x00000901, 0x80048104, 0xd7600001, @@ -1032,7 +1086,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0x80048104, 0xd7600003, 0x00000901, 0x80048104, 0xf469003a, 0xe0000000, - 0x80709070, 0xbf06a004, + 0x80709070, 0xbf06c004, 0xbf84ffef, 0xbe840380, 0xd7600000, 0x00000902, 0x80048104, 0xd7600001, @@ -1041,7 +1095,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0x80048104, 0xd7600003, 0x00000902, 0x80048104, 0xf469003a, 0xe0000000, - 0x80709070, 0xbf06a004, + 0x80709070, 0xbf06c004, 0xbf84ffef, 0xbe840380, 0xd7600000, 0x00000903, 0x80048104, 0xd7600001, @@ -1050,25 +1104,24 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0x80048104, 0xd7600003, 0x00000903, 0x80048104, 0xf469003a, 0xe0000000, - 0x80709070, 0xbf06a004, + 0x80709070, 0xbf06c004, 0xbf84ffef, 0x807c847c, 0xbf0a7b7c, 0xbf85ffb1, - 0xbf8200a6, 0x7e008700, + 0xbf82003b, 0x7e008700, 0x7e028701, 0x7e048702, 0x7e068703, 0xe0704000, - 0x705d0000, 0xe0704080, - 0x705d0100, 0xe0704100, - 0x705d0200, 0xe0704180, + 0x705d0000, 0xe0704100, + 0x705d0100, 0xe0704200, + 0x705d0200, 0xe0704300, 0x705d0300, 0x807c847c, - 0x8070ff70, 0x00000200, + 0x8070ff70, 0x00000400, 0xbf0a7b7c, 0xbf85ffef, - 0xbf820094, 0xbef603ff, - 0x01000000, 0xbefc0384, - 0xbf0a7b7c, 0xbf840065, - 0xb97af803, 0x8a7a7aff, - 0x10000000, 0xbf850050, - 0x7e008700, 0x7e028701, - 0x7e048702, 0x7e068703, + 0xb97b1e06, 0x877bc17b, + 0xbf840027, 0x8f7b837b, + 0x807b7c7b, 0xbefe03c1, + 0xbeff0380, 0xb97af803, + 0x8a7a7aff, 0x10000000, + 0xbf850017, 0x7e008700, 0xbe840380, 0xd7600000, 0x00000900, 0x80048104, 0xd7600001, 0x00000900, @@ -1078,78 +1131,25 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0x80048104, 0xf469003a, 0xe0000000, 0x80709070, 0xbf06c004, 0xbf84ffef, - 0xbe840380, 0xd7600000, - 0x00000901, 0x80048104, - 0xd7600001, 0x00000901, - 0x80048104, 0xd7600002, - 0x00000901, 0x80048104, - 0xd7600003, 0x00000901, - 0x80048104, 0xf469003a, - 0xe0000000, 0x80709070, - 0xbf06c004, 0xbf84ffef, - 0xbe840380, 0xd7600000, - 0x00000902, 0x80048104, - 0xd7600001, 0x00000902, - 0x80048104, 0xd7600002, - 0x00000902, 0x80048104, - 0xd7600003, 0x00000902, - 0x80048104, 0xf469003a, - 0xe0000000, 0x80709070, - 
0xbf06c004, 0xbf84ffef, - 0xbe840380, 0xd7600000, - 0x00000903, 0x80048104, - 0xd7600001, 0x00000903, - 0x80048104, 0xd7600002, - 0x00000903, 0x80048104, - 0xd7600003, 0x00000903, - 0x80048104, 0xf469003a, - 0xe0000000, 0x80709070, - 0xbf06c004, 0xbf84ffef, - 0x807c847c, 0xbf0a7b7c, - 0xbf85ffb1, 0xbf82003b, - 0x7e008700, 0x7e028701, - 0x7e048702, 0x7e068703, - 0xe0704000, 0x705d0000, - 0xe0704100, 0x705d0100, - 0xe0704200, 0x705d0200, - 0xe0704300, 0x705d0300, - 0x807c847c, 0x8070ff70, - 0x00000400, 0xbf0a7b7c, - 0xbf85ffef, 0xb97b1e06, - 0x877bc17b, 0xbf840027, - 0x8f7b837b, 0x807b7c7b, - 0xbefe03c1, 0xbeff0380, - 0xb97af803, 0x8a7a7aff, - 0x10000000, 0xbf850017, - 0x7e008700, 0xbe840380, - 0xd7600000, 0x00000900, - 0x80048104, 0xd7600001, - 0x00000900, 0x80048104, - 0xd7600002, 0x00000900, - 0x80048104, 0xd7600003, - 0x00000900, 0x80048104, - 0xf469003a, 0xe0000000, - 0x80709070, 0xbf06c004, - 0xbf84ffef, 0x807c817c, - 0xbf0a7b7c, 0xbf85ffea, - 0xbf820008, 0x7e008700, - 0xe0704000, 0x705d0000, - 0x807c817c, 0x8070ff70, - 0x00000080, 0xbf0a7b7c, - 0xbf85fff8, 0xbf820144, - 0xbef4037e, 0x8775ff7f, - 0x0000ffff, 0x8875ff75, - 0x00040000, 0xbef60380, - 0xbef703ff, 0x10807fac, - 0xb97202dc, 0x8f729972, - 0x876eff7f, 0x04000000, - 0xbf840034, 0xbefe03c1, - 0x907c9972, 0x877c817c, - 0xbf06817c, 0xbf850002, - 0xbeff0380, 0xbf820001, - 0xbeff03c1, 0xb96f4306, - 0x876fc16f, 0xbf840029, - 0x8f6f866f, 0x8f6f826f, + 0x807c817c, 0xbf0a7b7c, + 0xbf85ffea, 0xbf820008, + 0x7e008700, 0xe0704000, + 0x705d0000, 0x807c817c, + 0x8070ff70, 0x00000080, + 0xbf0a7b7c, 0xbf85fff8, + 0xbf82013f, 0xbef4037e, + 0x8775ff7f, 0x0000ffff, + 0x8875ff75, 0x00040000, + 0xbef60380, 0xbef703ff, + 0x10807fac, 0xb97202dc, + 0x8f729972, 0x876eff7f, + 0x04000000, 0xbf840033, + 0xbefe03c1, 0x907c9972, + 0x877c817c, 0xbf06817c, + 0xbf850002, 0xbeff0380, + 0xbf820001, 0xbeff03c1, + 0xb96f4306, 0x876fc16f, + 0xbf840028, 0x8f6f886f, 0xbef6036f, 0xb9783a05, 0x80788178, 0xbf0d9972, 0xbf850002, 0x8f788978, @@ -1185,7 +1185,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0x785d0000, 0xe0304080, 0x785d0100, 0xe0304100, 0x785d0200, 0xe0304180, - 0x785d0300, 0xbf8c3f70, + 0x785d0300, 0xbf8c0000, 0x7e008500, 0x7e028501, 0x7e048502, 0x7e068503, 0x807c847c, 0x8078ff78, @@ -1194,7 +1194,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0x6e5d0000, 0xe0304080, 0x6e5d0100, 0xe0304100, 0x6e5d0200, 0xe0304180, - 0x6e5d0300, 0xbf8c3f70, + 0x6e5d0300, 0xbf8c0000, 0xbf820034, 0xbef603ff, 0x01000000, 0xbeee0378, 0x8078ff78, 0x00000400, @@ -1203,7 +1203,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0x785d0000, 0xe0304100, 0x785d0100, 0xe0304200, 0x785d0200, 0xe0304300, - 0x785d0300, 0xbf8c3f70, + 0x785d0300, 0xbf8c0000, 0x7e008500, 0x7e028501, 0x7e048502, 0x7e068503, 0x807c847c, 0x8078ff78, @@ -1213,7 +1213,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0x8f6f836f, 0x806f7c6f, 0xbefe03c1, 0xbeff0380, 0xe0304000, 0x785d0000, - 0xbf8c3f70, 0x7e008500, + 0xbf8c0000, 0x7e008500, 0x807c817c, 0x8078ff78, 0x00000080, 0xbf0a6f7c, 0xbf85fff7, 0xbeff03c1, @@ -1221,7 +1221,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0xe0304100, 0x6e5d0100, 0xe0304200, 0x6e5d0200, 0xe0304300, 0x6e5d0300, - 0xbf8c3f70, 0xb9783a05, + 0xbf8c0000, 0xb9783a05, 0x80788178, 0xbf0d9972, 0xbf850002, 0x8f788978, 0xbf820001, 0x8f788a78, @@ -1232,16 +1232,16 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0x01000000, 0xbefc03ff, 0x0000006c, 0x80f89078, 0xf429003a, 0xf0000000, - 0xbf8cc07f, 0x80fc847c, + 0xbf8c0000, 0x80fc847c, 0xbf800000, 0xbe803100, 0xbe823102, 
0x80f8a078, 0xf42d003a, 0xf0000000, - 0xbf8cc07f, 0x80fc887c, + 0xbf8c0000, 0x80fc887c, 0xbf800000, 0xbe803100, 0xbe823102, 0xbe843104, 0xbe863106, 0x80f8c078, 0xf431003a, 0xf0000000, - 0xbf8cc07f, 0x80fc907c, + 0xbf8c0000, 0x80fc907c, 0xbf800000, 0xbe803100, 0xbe823102, 0xbe843104, 0xbe863106, 0xbe883108, @@ -1271,15 +1271,13 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0xf4211cfa, 0xf0000000, 0x80788478, 0xf4211bba, 0xf0000000, 0x80788478, - 0xbf8cc07f, 0xb9eef814, + 0xbf8c0000, 0xb9eef814, 0xf4211bba, 0xf0000000, - 0x80788478, 0xbf8cc07f, + 0x80788478, 0xbf8c0000, 0xb9eef815, 0xbefc036f, 0xbefe0370, 0xbeff0371, - 0x876f7bff, 0x000003ff, - 0xb9ef4803, 0xb9f9f816, - 0x876f7bff, 0xfffff800, - 0x906f8b6f, 0xb9efa2c3, + 0xb9f9f816, 0xb9fb4803, + 0x907b8b7b, 0xb9fba2c3, 0xb9f3f801, 0xb96e3a05, 0x806e816e, 0xbf0d9972, 0xbf850002, 0x8f6e896e, @@ -1291,7 +1289,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0x0000ffff, 0xf4091c37, 0xfa000050, 0xf4091d37, 0xfa000060, 0xf4011e77, - 0xfa000074, 0xbf8cc07f, + 0xfa000074, 0xbf8c0000, 0x906e8977, 0x876fff6e, 0x003f8000, 0x906e8677, 0x876eff6e, 0x02000000, @@ -1305,7 +1303,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { }; static const uint32_t cwsr_trap_arcturus_hex[] = { - 0xbf820001, 0xbf8202d4, + 0xbf820001, 0xbf8202d5, 0xb8f8f802, 0x8978ff78, 0x00020006, 0xb8fbf803, 0x866eff78, 0x00002000, @@ -1422,99 +1420,37 @@ static const uint32_t cwsr_trap_arcturus_hex[] = { 0xbefe007c, 0xbefc0070, 0xc0611c7a, 0x0000007c, 0xbf8cc07f, 0x80708470, - 0xbefc007e, 0x867aff7f, - 0x04000000, 0xbeef0080, - 0x876f6f7a, 0xb8f02a05, - 0x80708170, 0x8e708a70, - 0x8e708170, 0xb8fb1605, - 0x807b817b, 0x8e7b847b, - 0x8e76827b, 0xbef600ff, - 0x01000000, 0xbef20174, - 0x80747074, 0x82758075, - 0xbefc0080, 0xbf800000, - 0xbe802b00, 0xbe822b02, - 0xbe842b04, 0xbe862b06, - 0xbe882b08, 0xbe8a2b0a, - 0xbe8c2b0c, 0xbe8e2b0e, - 0xc06b003a, 0x00000000, - 0xbf8cc07f, 0xc06b013a, - 0x00000010, 0xbf8cc07f, - 0xc06b023a, 0x00000020, - 0xbf8cc07f, 0xc06b033a, - 0x00000030, 0xbf8cc07f, - 0x8074c074, 0x82758075, - 0x807c907c, 0xbf0a7b7c, - 0xbf85ffe7, 0xbef40172, - 0xbef00080, 0xbefe00c1, - 0xbeff00c1, 0xbee80080, - 0xbee90080, 0xbef600ff, - 0x01000000, 0x867aff78, - 0x00400000, 0xbf850003, - 0xb8faf803, 0x897a7aff, - 0x10000000, 0xbf85004d, - 0xbe840080, 0xd2890000, - 0x00000900, 0x80048104, - 0xd2890001, 0x00000900, - 0x80048104, 0xd2890002, - 0x00000900, 0x80048104, - 0xd2890003, 0x00000900, - 0x80048104, 0xc069003a, - 0x00000070, 0xbf8cc07f, - 0x80709070, 0xbf06c004, - 0xbf84ffee, 0xbe840080, - 0xd2890000, 0x00000901, - 0x80048104, 0xd2890001, - 0x00000901, 0x80048104, - 0xd2890002, 0x00000901, - 0x80048104, 0xd2890003, - 0x00000901, 0x80048104, - 0xc069003a, 0x00000070, - 0xbf8cc07f, 0x80709070, - 0xbf06c004, 0xbf84ffee, - 0xbe840080, 0xd2890000, - 0x00000902, 0x80048104, - 0xd2890001, 0x00000902, - 0x80048104, 0xd2890002, - 0x00000902, 0x80048104, - 0xd2890003, 0x00000902, - 0x80048104, 0xc069003a, - 0x00000070, 0xbf8cc07f, - 0x80709070, 0xbf06c004, - 0xbf84ffee, 0xbe840080, - 0xd2890000, 0x00000903, - 0x80048104, 0xd2890001, - 0x00000903, 0x80048104, - 0xd2890002, 0x00000903, - 0x80048104, 0xd2890003, - 0x00000903, 0x80048104, - 0xc069003a, 0x00000070, - 0xbf8cc07f, 0x80709070, - 0xbf06c004, 0xbf84ffee, - 0xbf820008, 0xe0724000, - 0x701d0000, 0xe0724100, - 0x701d0100, 0xe0724200, - 0x701d0200, 0xe0724300, - 0x701d0300, 0xbefe00c1, - 0xbeff00c1, 0xb8fb4306, - 0x867bc17b, 0xbf840064, - 0xbf8a0000, 0x867aff6f, - 0x04000000, 0xbf840060, - 0x8e7b867b, 0x8e7b827b, - 
0xbef6007b, 0xb8f02a05, - 0x80708170, 0x8e708a70, - 0x8e708170, 0xb8fa1605, - 0x807a817a, 0x8e7a867a, - 0x80707a70, 0x8070ff70, - 0x00000080, 0xbef600ff, - 0x01000000, 0xbefc0080, - 0xd28c0002, 0x000100c1, - 0xd28d0003, 0x000204c1, + 0xbefc007e, 0xbf108080, + 0x867aff7f, 0x04000000, + 0xbeef0080, 0x876f6f7a, + 0xb8f02a05, 0x80708170, + 0x8e708a70, 0x8e708170, + 0xb8fb1605, 0x807b817b, + 0x8e7b847b, 0x8e76827b, + 0xbef600ff, 0x01000000, + 0xbef20174, 0x80747074, + 0x82758075, 0xbefc0080, + 0xbf800000, 0xbe802b00, + 0xbe822b02, 0xbe842b04, + 0xbe862b06, 0xbe882b08, + 0xbe8a2b0a, 0xbe8c2b0c, + 0xbe8e2b0e, 0xc06b003a, + 0x00000000, 0xbf8cc07f, + 0xc06b013a, 0x00000010, + 0xbf8cc07f, 0xc06b023a, + 0x00000020, 0xbf8cc07f, + 0xc06b033a, 0x00000030, + 0xbf8cc07f, 0x8074c074, + 0x82758075, 0x807c907c, + 0xbf0a7b7c, 0xbf85ffe7, + 0xbef40172, 0xbef00080, + 0xbefe00c1, 0xbeff00c1, + 0xbee80080, 0xbee90080, + 0xbef600ff, 0x01000000, 0x867aff78, 0x00400000, 0xbf850003, 0xb8faf803, 0x897a7aff, 0x10000000, - 0xbf850030, 0x24040682, - 0xd86e4000, 0x00000002, - 0xbf8cc07f, 0xbe840080, + 0xbf85004d, 0xbe840080, 0xd2890000, 0x00000900, 0x80048104, 0xd2890001, 0x00000900, 0x80048104, @@ -1533,31 +1469,50 @@ static const uint32_t cwsr_trap_arcturus_hex[] = { 0x80048104, 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, - 0xbf84ffee, 0x680404ff, - 0x00000200, 0xd0c9006a, - 0x0000f702, 0xbf87ffd2, - 0xbf820015, 0xd1060002, - 0x00011103, 0x7e0602ff, - 0x00000200, 0xbefc00ff, - 0x00010000, 0xbe800077, - 0x8677ff77, 0xff7fffff, - 0x8777ff77, 0x00058000, - 0xd8ec0000, 0x00000002, - 0xbf8cc07f, 0xe0765000, - 0x701d0002, 0x68040702, - 0xd0c9006a, 0x0000f702, - 0xbf87fff7, 0xbef70000, - 0xbef000ff, 0x00000400, + 0xbf84ffee, 0xbe840080, + 0xd2890000, 0x00000902, + 0x80048104, 0xd2890001, + 0x00000902, 0x80048104, + 0xd2890002, 0x00000902, + 0x80048104, 0xd2890003, + 0x00000902, 0x80048104, + 0xc069003a, 0x00000070, + 0xbf8cc07f, 0x80709070, + 0xbf06c004, 0xbf84ffee, + 0xbe840080, 0xd2890000, + 0x00000903, 0x80048104, + 0xd2890001, 0x00000903, + 0x80048104, 0xd2890002, + 0x00000903, 0x80048104, + 0xd2890003, 0x00000903, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0xbf820008, + 0xe0724000, 0x701d0000, + 0xe0724100, 0x701d0100, + 0xe0724200, 0x701d0200, + 0xe0724300, 0x701d0300, 0xbefe00c1, 0xbeff00c1, - 0xb8fb2a05, 0x807b817b, - 0x8e7b827b, 0xbef600ff, - 0x01000000, 0xbefc0084, - 0xbf0a7b7c, 0xbf84006d, - 0xbf11017c, 0x807bff7b, - 0x00001000, 0x867aff78, + 0xb8fb4306, 0x867bc17b, + 0xbf840064, 0xbf8a0000, + 0x867aff6f, 0x04000000, + 0xbf840060, 0x8e7b867b, + 0x8e7b827b, 0xbef6007b, + 0xb8f02a05, 0x80708170, + 0x8e708a70, 0x8e708170, + 0xb8fa1605, 0x807a817a, + 0x8e7a867a, 0x80707a70, + 0x8070ff70, 0x00000080, + 0xbef600ff, 0x01000000, + 0xbefc0080, 0xd28c0002, + 0x000100c1, 0xd28d0003, + 0x000204c1, 0x867aff78, 0x00400000, 0xbf850003, 0xb8faf803, 0x897a7aff, - 0x10000000, 0xbf850051, + 0x10000000, 0xbf850030, + 0x24040682, 0xd86e4000, + 0x00000002, 0xbf8cc07f, 0xbe840080, 0xd2890000, 0x00000900, 0x80048104, 0xd2890001, 0x00000900, @@ -1577,215 +1532,259 @@ static const uint32_t cwsr_trap_arcturus_hex[] = { 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, 0xbf84ffee, + 0x680404ff, 0x00000200, + 0xd0c9006a, 0x0000f702, + 0xbf87ffd2, 0xbf820015, + 0xd1060002, 0x00011103, + 0x7e0602ff, 0x00000200, + 0xbefc00ff, 0x00010000, + 0xbe800077, 0x8677ff77, + 0xff7fffff, 0x8777ff77, + 0x00058000, 0xd8ec0000, + 0x00000002, 0xbf8cc07f, + 0xe0765000, 0x701d0002, + 
0x68040702, 0xd0c9006a, + 0x0000f702, 0xbf87fff7, + 0xbef70000, 0xbef000ff, + 0x00000400, 0xbefe00c1, + 0xbeff00c1, 0xb8fb2a05, + 0x807b817b, 0x8e7b827b, + 0xbef600ff, 0x01000000, + 0xbefc0084, 0xbf0a7b7c, + 0xbf84006d, 0xbf11017c, + 0x807bff7b, 0x00001000, + 0x867aff78, 0x00400000, + 0xbf850003, 0xb8faf803, + 0x897a7aff, 0x10000000, + 0xbf850051, 0xbe840080, + 0xd2890000, 0x00000900, + 0x80048104, 0xd2890001, + 0x00000900, 0x80048104, + 0xd2890002, 0x00000900, + 0x80048104, 0xd2890003, + 0x00000900, 0x80048104, + 0xc069003a, 0x00000070, + 0xbf8cc07f, 0x80709070, + 0xbf06c004, 0xbf84ffee, 0xbe840080, 0xd2890000, - 0x00000902, 0x80048104, - 0xd2890001, 0x00000902, + 0x00000901, 0x80048104, + 0xd2890001, 0x00000901, 0x80048104, 0xd2890002, - 0x00000902, 0x80048104, - 0xd2890003, 0x00000902, + 0x00000901, 0x80048104, + 0xd2890003, 0x00000901, 0x80048104, 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, 0xbf84ffee, 0xbe840080, - 0xd2890000, 0x00000903, + 0xd2890000, 0x00000902, 0x80048104, 0xd2890001, - 0x00000903, 0x80048104, - 0xd2890002, 0x00000903, + 0x00000902, 0x80048104, + 0xd2890002, 0x00000902, 0x80048104, 0xd2890003, - 0x00000903, 0x80048104, + 0x00000902, 0x80048104, 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, 0xbf84ffee, - 0x807c847c, 0xbf0a7b7c, - 0xbf85ffb1, 0xbf9c0000, - 0xbf820012, 0x7e000300, - 0x7e020301, 0x7e040302, - 0x7e060303, 0xe0724000, - 0x701d0000, 0xe0724100, - 0x701d0100, 0xe0724200, - 0x701d0200, 0xe0724300, - 0x701d0300, 0x807c847c, - 0x8070ff70, 0x00000400, - 0xbf0a7b7c, 0xbf85ffef, - 0xbf9c0000, 0xbefc0080, - 0xbf11017c, 0x867aff78, - 0x00400000, 0xbf850003, - 0xb8faf803, 0x897a7aff, - 0x10000000, 0xbf850059, - 0xd3d84000, 0x18000100, - 0xd3d84001, 0x18000101, - 0xd3d84002, 0x18000102, - 0xd3d84003, 0x18000103, 0xbe840080, 0xd2890000, - 0x00000900, 0x80048104, - 0xd2890001, 0x00000900, + 0x00000903, 0x80048104, + 0xd2890001, 0x00000903, 0x80048104, 0xd2890002, - 0x00000900, 0x80048104, - 0xd2890003, 0x00000900, + 0x00000903, 0x80048104, + 0xd2890003, 0x00000903, 0x80048104, 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, - 0xbf84ffee, 0xbe840080, - 0xd2890000, 0x00000901, + 0xbf84ffee, 0x807c847c, + 0xbf0a7b7c, 0xbf85ffb1, + 0xbf9c0000, 0xbf820012, + 0x7e000300, 0x7e020301, + 0x7e040302, 0x7e060303, + 0xe0724000, 0x701d0000, + 0xe0724100, 0x701d0100, + 0xe0724200, 0x701d0200, + 0xe0724300, 0x701d0300, + 0x807c847c, 0x8070ff70, + 0x00000400, 0xbf0a7b7c, + 0xbf85ffef, 0xbf9c0000, + 0xbefc0080, 0xbf11017c, + 0x867aff78, 0x00400000, + 0xbf850003, 0xb8faf803, + 0x897a7aff, 0x10000000, + 0xbf850059, 0xd3d84000, + 0x18000100, 0xd3d84001, + 0x18000101, 0xd3d84002, + 0x18000102, 0xd3d84003, + 0x18000103, 0xbe840080, + 0xd2890000, 0x00000900, 0x80048104, 0xd2890001, - 0x00000901, 0x80048104, - 0xd2890002, 0x00000901, + 0x00000900, 0x80048104, + 0xd2890002, 0x00000900, 0x80048104, 0xd2890003, - 0x00000901, 0x80048104, + 0x00000900, 0x80048104, 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, 0xbf84ffee, 0xbe840080, 0xd2890000, - 0x00000902, 0x80048104, - 0xd2890001, 0x00000902, + 0x00000901, 0x80048104, + 0xd2890001, 0x00000901, 0x80048104, 0xd2890002, - 0x00000902, 0x80048104, - 0xd2890003, 0x00000902, + 0x00000901, 0x80048104, + 0xd2890003, 0x00000901, 0x80048104, 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, 0xbf84ffee, 0xbe840080, - 0xd2890000, 0x00000903, + 0xd2890000, 0x00000902, 0x80048104, 0xd2890001, - 0x00000903, 0x80048104, - 0xd2890002, 0x00000903, + 0x00000902, 0x80048104, + 0xd2890002, 
0x00000902, 0x80048104, 0xd2890003, - 0x00000903, 0x80048104, + 0x00000902, 0x80048104, 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, 0xbf84ffee, - 0x807c847c, 0xbf0a7b7c, - 0xbf85ffa9, 0xbf9c0000, - 0xbf820016, 0xd3d84000, - 0x18000100, 0xd3d84001, - 0x18000101, 0xd3d84002, - 0x18000102, 0xd3d84003, - 0x18000103, 0xe0724000, - 0x701d0000, 0xe0724100, - 0x701d0100, 0xe0724200, - 0x701d0200, 0xe0724300, - 0x701d0300, 0x807c847c, - 0x8070ff70, 0x00000400, - 0xbf0a7b7c, 0xbf85ffeb, - 0xbf9c0000, 0xbf8200e3, - 0xbef4007e, 0x8675ff7f, - 0x0000ffff, 0x8775ff75, - 0x00040000, 0xbef60080, - 0xbef700ff, 0x00807fac, - 0x866eff7f, 0x04000000, - 0xbf84001f, 0xbefe00c1, - 0xbeff00c1, 0xb8ef4306, - 0x866fc16f, 0xbf84001a, - 0x8e6f866f, 0x8e6f826f, - 0xbef6006f, 0xb8f82a05, - 0x80788178, 0x8e788a78, - 0x8e788178, 0xb8ee1605, - 0x806e816e, 0x8e6e866e, - 0x80786e78, 0x8078ff78, - 0x00000080, 0xbef600ff, - 0x01000000, 0xbefc0080, - 0xe0510000, 0x781d0000, - 0xe0510100, 0x781d0000, - 0x807cff7c, 0x00000200, - 0x8078ff78, 0x00000200, - 0xbf0a6f7c, 0xbf85fff6, + 0xbe840080, 0xd2890000, + 0x00000903, 0x80048104, + 0xd2890001, 0x00000903, + 0x80048104, 0xd2890002, + 0x00000903, 0x80048104, + 0xd2890003, 0x00000903, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0x807c847c, + 0xbf0a7b7c, 0xbf85ffa9, + 0xbf9c0000, 0xbf820016, + 0xd3d84000, 0x18000100, + 0xd3d84001, 0x18000101, + 0xd3d84002, 0x18000102, + 0xd3d84003, 0x18000103, + 0xe0724000, 0x701d0000, + 0xe0724100, 0x701d0100, + 0xe0724200, 0x701d0200, + 0xe0724300, 0x701d0300, + 0x807c847c, 0x8070ff70, + 0x00000400, 0xbf0a7b7c, + 0xbf85ffeb, 0xbf9c0000, + 0xbf8200e3, 0xbef4007e, + 0x8675ff7f, 0x0000ffff, + 0x8775ff75, 0x00040000, + 0xbef60080, 0xbef700ff, + 0x00807fac, 0x866eff7f, + 0x04000000, 0xbf84001f, 0xbefe00c1, 0xbeff00c1, + 0xb8ef4306, 0x866fc16f, + 0xbf84001a, 0x8e6f866f, + 0x8e6f826f, 0xbef6006f, + 0xb8f82a05, 0x80788178, + 0x8e788a78, 0x8e788178, + 0xb8ee1605, 0x806e816e, + 0x8e6e866e, 0x80786e78, + 0x8078ff78, 0x00000080, 0xbef600ff, 0x01000000, - 0xb8ef2a05, 0x806f816f, - 0x8e6f826f, 0x806fff6f, - 0x00008000, 0xbef80080, - 0xbeee0078, 0x8078ff78, - 0x00000400, 0xbefc0084, - 0xbf11087c, 0xe0524000, - 0x781d0000, 0xe0524100, - 0x781d0100, 0xe0524200, - 0x781d0200, 0xe0524300, - 0x781d0300, 0xbf8c0f70, - 0x7e000300, 0x7e020301, - 0x7e040302, 0x7e060303, - 0x807c847c, 0x8078ff78, - 0x00000400, 0xbf0a6f7c, - 0xbf85ffee, 0xbefc0080, - 0xbf11087c, 0xe0524000, - 0x781d0000, 0xe0524100, - 0x781d0100, 0xe0524200, - 0x781d0200, 0xe0524300, - 0x781d0300, 0xbf8c0f70, - 0xd3d94000, 0x18000100, - 0xd3d94001, 0x18000101, - 0xd3d94002, 0x18000102, - 0xd3d94003, 0x18000103, - 0x807c847c, 0x8078ff78, - 0x00000400, 0xbf0a6f7c, - 0xbf85ffea, 0xbf9c0000, - 0xe0524000, 0x6e1d0000, - 0xe0524100, 0x6e1d0100, - 0xe0524200, 0x6e1d0200, - 0xe0524300, 0x6e1d0300, - 0xbf8c0f70, 0xb8f82a05, - 0x80788178, 0x8e788a78, - 0x8e788178, 0xb8ee1605, - 0x806e816e, 0x8e6e866e, - 0x80786e78, 0x80f8c078, - 0xb8ef1605, 0x806f816f, - 0x8e6f846f, 0x8e76826f, - 0xbef600ff, 0x01000000, - 0xbefc006f, 0xc031003a, - 0x00000078, 0x80f8c078, - 0xbf8cc07f, 0x80fc907c, - 0xbf800000, 0xbe802d00, - 0xbe822d02, 0xbe842d04, - 0xbe862d06, 0xbe882d08, - 0xbe8a2d0a, 0xbe8c2d0c, - 0xbe8e2d0e, 0xbf06807c, - 0xbf84fff0, 0xb8f82a05, - 0x80788178, 0x8e788a78, - 0x8e788178, 0xb8ee1605, - 0x806e816e, 0x8e6e866e, - 0x80786e78, 0xbef60084, - 0xbef600ff, 0x01000000, - 0xc0211bfa, 0x00000078, - 0x80788478, 0xc0211b3a, + 0xbefc0080, 0xe0510000, + 0x781d0000, 
0xe0510100, + 0x781d0000, 0x807cff7c, + 0x00000200, 0x8078ff78, + 0x00000200, 0xbf0a6f7c, + 0xbf85fff6, 0xbefe00c1, + 0xbeff00c1, 0xbef600ff, + 0x01000000, 0xb8ef2a05, + 0x806f816f, 0x8e6f826f, + 0x806fff6f, 0x00008000, + 0xbef80080, 0xbeee0078, + 0x8078ff78, 0x00000400, + 0xbefc0084, 0xbf11087c, + 0xe0524000, 0x781d0000, + 0xe0524100, 0x781d0100, + 0xe0524200, 0x781d0200, + 0xe0524300, 0x781d0300, + 0xbf8c0f70, 0x7e000300, + 0x7e020301, 0x7e040302, + 0x7e060303, 0x807c847c, + 0x8078ff78, 0x00000400, + 0xbf0a6f7c, 0xbf85ffee, + 0xbefc0080, 0xbf11087c, + 0xe0524000, 0x781d0000, + 0xe0524100, 0x781d0100, + 0xe0524200, 0x781d0200, + 0xe0524300, 0x781d0300, + 0xbf8c0f70, 0xd3d94000, + 0x18000100, 0xd3d94001, + 0x18000101, 0xd3d94002, + 0x18000102, 0xd3d94003, + 0x18000103, 0x807c847c, + 0x8078ff78, 0x00000400, + 0xbf0a6f7c, 0xbf85ffea, + 0xbf9c0000, 0xe0524000, + 0x6e1d0000, 0xe0524100, + 0x6e1d0100, 0xe0524200, + 0x6e1d0200, 0xe0524300, + 0x6e1d0300, 0xbf8c0f70, + 0xb8f82a05, 0x80788178, + 0x8e788a78, 0x8e788178, + 0xb8ee1605, 0x806e816e, + 0x8e6e866e, 0x80786e78, + 0x80f8c078, 0xb8ef1605, + 0x806f816f, 0x8e6f846f, + 0x8e76826f, 0xbef600ff, + 0x01000000, 0xbefc006f, + 0xc031003a, 0x00000078, + 0x80f8c078, 0xbf8cc07f, + 0x80fc907c, 0xbf800000, + 0xbe802d00, 0xbe822d02, + 0xbe842d04, 0xbe862d06, + 0xbe882d08, 0xbe8a2d0a, + 0xbe8c2d0c, 0xbe8e2d0e, + 0xbf06807c, 0xbf84fff0, + 0xb8f82a05, 0x80788178, + 0x8e788a78, 0x8e788178, + 0xb8ee1605, 0x806e816e, + 0x8e6e866e, 0x80786e78, + 0xbef60084, 0xbef600ff, + 0x01000000, 0xc0211bfa, 0x00000078, 0x80788478, - 0xc0211b7a, 0x00000078, - 0x80788478, 0xc0211c3a, + 0xc0211b3a, 0x00000078, + 0x80788478, 0xc0211b7a, 0x00000078, 0x80788478, - 0xc0211c7a, 0x00000078, - 0x80788478, 0xc0211eba, + 0xc0211c3a, 0x00000078, + 0x80788478, 0xc0211c7a, 0x00000078, 0x80788478, - 0xc0211efa, 0x00000078, - 0x80788478, 0xc0211a3a, + 0xc0211eba, 0x00000078, + 0x80788478, 0xc0211efa, 0x00000078, 0x80788478, - 0xc0211a7a, 0x00000078, - 0x80788478, 0xc0211cfa, + 0xc0211a3a, 0x00000078, + 0x80788478, 0xc0211a7a, 0x00000078, 0x80788478, - 0xbf8cc07f, 0xbefc006f, - 0xbefe0070, 0xbeff0071, - 0x866f7bff, 0x000003ff, - 0xb96f4803, 0x866f7bff, - 0xfffff800, 0x8f6f8b6f, - 0xb96fa2c3, 0xb973f801, - 0xb8ee2a05, 0x806e816e, - 0x8e6e8a6e, 0x8e6e816e, - 0xb8ef1605, 0x806f816f, - 0x8e6f866f, 0x806e6f6e, - 0x806e746e, 0x826f8075, - 0x866fff6f, 0x0000ffff, - 0xc00b1c37, 0x00000050, - 0xc00b1d37, 0x00000060, - 0xc0031e77, 0x00000074, - 0xbf8cc07f, 0x8f6e8b77, - 0x866eff6e, 0x001f8000, - 0xb96ef807, 0x866dff6d, - 0x0000ffff, 0x86fe7e7e, - 0x86ea6a6a, 0x8f6e837a, - 0xb96ee0c2, 0xbf800002, - 0xb97a0002, 0xbf8a0000, - 0xbe801f6c, 0xbf9b0000, + 0xc0211cfa, 0x00000078, + 0x80788478, 0xbf8cc07f, + 0xbefc006f, 0xbefe0070, + 0xbeff0071, 0x866f7bff, + 0x000003ff, 0xb96f4803, + 0x866f7bff, 0xfffff800, + 0x8f6f8b6f, 0xb96fa2c3, + 0xb973f801, 0xb8ee2a05, + 0x806e816e, 0x8e6e8a6e, + 0x8e6e816e, 0xb8ef1605, + 0x806f816f, 0x8e6f866f, + 0x806e6f6e, 0x806e746e, + 0x826f8075, 0x866fff6f, + 0x0000ffff, 0xc00b1c37, + 0x00000050, 0xc00b1d37, + 0x00000060, 0xc0031e77, + 0x00000074, 0xbf8cc07f, + 0x8f6e8b77, 0x866eff6e, + 0x001f8000, 0xb96ef807, + 0x866dff6d, 0x0000ffff, + 0x86fe7e7e, 0x86ea6a6a, + 0x8f6e837a, 0xb96ee0c2, + 0xbf800002, 0xb97a0002, + 0xbf8a0000, 0xbe801f6c, + 0xbf9b0000, 0x00000000, }; static const uint32_t cwsr_trap_aldebaran_hex[] = { - 0xbf820001, 0xbf8202df, + 0xbf820001, 0xbf8202e0, 0xb8f8f802, 0x8978ff78, 0x00020006, 0xb8fbf803, 0x866eff78, 0x00002000, @@ -1902,99 +1901,37 @@ static const uint32_t 
cwsr_trap_aldebaran_hex[] = { 0xbefe007c, 0xbefc0070, 0xc0611c7a, 0x0000007c, 0xbf8cc07f, 0x80708470, - 0xbefc007e, 0x867aff7f, - 0x04000000, 0xbeef0080, - 0x876f6f7a, 0xb8f02985, - 0x80708170, 0x8e708a70, - 0x8e708170, 0xb8fb1605, - 0x807b817b, 0x8e7b847b, - 0x8e76827b, 0xbef600ff, - 0x01000000, 0xbef20174, - 0x80747074, 0x82758075, - 0xbefc0080, 0xbf800000, - 0xbe802b00, 0xbe822b02, - 0xbe842b04, 0xbe862b06, - 0xbe882b08, 0xbe8a2b0a, - 0xbe8c2b0c, 0xbe8e2b0e, - 0xc06b003a, 0x00000000, - 0xbf8cc07f, 0xc06b013a, - 0x00000010, 0xbf8cc07f, - 0xc06b023a, 0x00000020, - 0xbf8cc07f, 0xc06b033a, - 0x00000030, 0xbf8cc07f, - 0x8074c074, 0x82758075, - 0x807c907c, 0xbf0a7b7c, - 0xbf85ffe7, 0xbef40172, - 0xbef00080, 0xbefe00c1, - 0xbeff00c1, 0xbee80080, - 0xbee90080, 0xbef600ff, - 0x01000000, 0x867aff78, - 0x00400000, 0xbf850003, - 0xb8faf803, 0x897a7aff, - 0x10000000, 0xbf85004d, - 0xbe840080, 0xd2890000, - 0x00000900, 0x80048104, - 0xd2890001, 0x00000900, - 0x80048104, 0xd2890002, - 0x00000900, 0x80048104, - 0xd2890003, 0x00000900, - 0x80048104, 0xc069003a, - 0x00000070, 0xbf8cc07f, - 0x80709070, 0xbf06c004, - 0xbf84ffee, 0xbe840080, - 0xd2890000, 0x00000901, - 0x80048104, 0xd2890001, - 0x00000901, 0x80048104, - 0xd2890002, 0x00000901, - 0x80048104, 0xd2890003, - 0x00000901, 0x80048104, - 0xc069003a, 0x00000070, - 0xbf8cc07f, 0x80709070, - 0xbf06c004, 0xbf84ffee, - 0xbe840080, 0xd2890000, - 0x00000902, 0x80048104, - 0xd2890001, 0x00000902, - 0x80048104, 0xd2890002, - 0x00000902, 0x80048104, - 0xd2890003, 0x00000902, - 0x80048104, 0xc069003a, - 0x00000070, 0xbf8cc07f, - 0x80709070, 0xbf06c004, - 0xbf84ffee, 0xbe840080, - 0xd2890000, 0x00000903, - 0x80048104, 0xd2890001, - 0x00000903, 0x80048104, - 0xd2890002, 0x00000903, - 0x80048104, 0xd2890003, - 0x00000903, 0x80048104, - 0xc069003a, 0x00000070, - 0xbf8cc07f, 0x80709070, - 0xbf06c004, 0xbf84ffee, - 0xbf820008, 0xe0724000, - 0x701d0000, 0xe0724100, - 0x701d0100, 0xe0724200, - 0x701d0200, 0xe0724300, - 0x701d0300, 0xbefe00c1, - 0xbeff00c1, 0xb8fb4306, - 0x867bc17b, 0xbf840064, - 0xbf8a0000, 0x867aff6f, - 0x04000000, 0xbf840060, - 0x8e7b867b, 0x8e7b827b, - 0xbef6007b, 0xb8f02985, - 0x80708170, 0x8e708a70, - 0x8e708170, 0xb8fa1605, - 0x807a817a, 0x8e7a867a, - 0x80707a70, 0x8070ff70, - 0x00000080, 0xbef600ff, - 0x01000000, 0xbefc0080, - 0xd28c0002, 0x000100c1, - 0xd28d0003, 0x000204c1, + 0xbefc007e, 0xbf108080, + 0x867aff7f, 0x04000000, + 0xbeef0080, 0x876f6f7a, + 0xb8f02985, 0x80708170, + 0x8e708a70, 0x8e708170, + 0xb8fb1605, 0x807b817b, + 0x8e7b847b, 0x8e76827b, + 0xbef600ff, 0x01000000, + 0xbef20174, 0x80747074, + 0x82758075, 0xbefc0080, + 0xbf800000, 0xbe802b00, + 0xbe822b02, 0xbe842b04, + 0xbe862b06, 0xbe882b08, + 0xbe8a2b0a, 0xbe8c2b0c, + 0xbe8e2b0e, 0xc06b003a, + 0x00000000, 0xbf8cc07f, + 0xc06b013a, 0x00000010, + 0xbf8cc07f, 0xc06b023a, + 0x00000020, 0xbf8cc07f, + 0xc06b033a, 0x00000030, + 0xbf8cc07f, 0x8074c074, + 0x82758075, 0x807c907c, + 0xbf0a7b7c, 0xbf85ffe7, + 0xbef40172, 0xbef00080, + 0xbefe00c1, 0xbeff00c1, + 0xbee80080, 0xbee90080, + 0xbef600ff, 0x01000000, 0x867aff78, 0x00400000, 0xbf850003, 0xb8faf803, 0x897a7aff, 0x10000000, - 0xbf850030, 0x24040682, - 0xd86e4000, 0x00000002, - 0xbf8cc07f, 0xbe840080, + 0xbf85004d, 0xbe840080, 0xd2890000, 0x00000900, 0x80048104, 0xd2890001, 0x00000900, 0x80048104, @@ -2013,31 +1950,50 @@ static const uint32_t cwsr_trap_aldebaran_hex[] = { 0x80048104, 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, - 0xbf84ffee, 0x680404ff, - 0x00000200, 0xd0c9006a, - 0x0000f702, 0xbf87ffd2, - 0xbf820015, 
0xd1060002, - 0x00011103, 0x7e0602ff, - 0x00000200, 0xbefc00ff, - 0x00010000, 0xbe800077, - 0x8677ff77, 0xff7fffff, - 0x8777ff77, 0x00058000, - 0xd8ec0000, 0x00000002, - 0xbf8cc07f, 0xe0765000, - 0x701d0002, 0x68040702, - 0xd0c9006a, 0x0000f702, - 0xbf87fff7, 0xbef70000, - 0xbef000ff, 0x00000400, + 0xbf84ffee, 0xbe840080, + 0xd2890000, 0x00000902, + 0x80048104, 0xd2890001, + 0x00000902, 0x80048104, + 0xd2890002, 0x00000902, + 0x80048104, 0xd2890003, + 0x00000902, 0x80048104, + 0xc069003a, 0x00000070, + 0xbf8cc07f, 0x80709070, + 0xbf06c004, 0xbf84ffee, + 0xbe840080, 0xd2890000, + 0x00000903, 0x80048104, + 0xd2890001, 0x00000903, + 0x80048104, 0xd2890002, + 0x00000903, 0x80048104, + 0xd2890003, 0x00000903, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0xbf820008, + 0xe0724000, 0x701d0000, + 0xe0724100, 0x701d0100, + 0xe0724200, 0x701d0200, + 0xe0724300, 0x701d0300, 0xbefe00c1, 0xbeff00c1, - 0xb8fb2b05, 0x807b817b, - 0x8e7b827b, 0xbef600ff, - 0x01000000, 0xbefc0084, - 0xbf0a7b7c, 0xbf84006d, - 0xbf11017c, 0x807bff7b, - 0x00001000, 0x867aff78, + 0xb8fb4306, 0x867bc17b, + 0xbf840064, 0xbf8a0000, + 0x867aff6f, 0x04000000, + 0xbf840060, 0x8e7b867b, + 0x8e7b827b, 0xbef6007b, + 0xb8f02985, 0x80708170, + 0x8e708a70, 0x8e708170, + 0xb8fa1605, 0x807a817a, + 0x8e7a867a, 0x80707a70, + 0x8070ff70, 0x00000080, + 0xbef600ff, 0x01000000, + 0xbefc0080, 0xd28c0002, + 0x000100c1, 0xd28d0003, + 0x000204c1, 0x867aff78, 0x00400000, 0xbf850003, 0xb8faf803, 0x897a7aff, - 0x10000000, 0xbf850051, + 0x10000000, 0xbf850030, + 0x24040682, 0xd86e4000, + 0x00000002, 0xbf8cc07f, 0xbe840080, 0xd2890000, 0x00000900, 0x80048104, 0xd2890001, 0x00000900, @@ -2057,51 +2013,31 @@ static const uint32_t cwsr_trap_aldebaran_hex[] = { 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, 0xbf84ffee, - 0xbe840080, 0xd2890000, - 0x00000902, 0x80048104, - 0xd2890001, 0x00000902, - 0x80048104, 0xd2890002, - 0x00000902, 0x80048104, - 0xd2890003, 0x00000902, - 0x80048104, 0xc069003a, - 0x00000070, 0xbf8cc07f, - 0x80709070, 0xbf06c004, - 0xbf84ffee, 0xbe840080, - 0xd2890000, 0x00000903, - 0x80048104, 0xd2890001, - 0x00000903, 0x80048104, - 0xd2890002, 0x00000903, - 0x80048104, 0xd2890003, - 0x00000903, 0x80048104, - 0xc069003a, 0x00000070, - 0xbf8cc07f, 0x80709070, - 0xbf06c004, 0xbf84ffee, - 0x807c847c, 0xbf0a7b7c, - 0xbf85ffb1, 0xbf9c0000, - 0xbf820012, 0x7e000300, - 0x7e020301, 0x7e040302, - 0x7e060303, 0xe0724000, - 0x701d0000, 0xe0724100, - 0x701d0100, 0xe0724200, - 0x701d0200, 0xe0724300, - 0x701d0300, 0x807c847c, - 0x8070ff70, 0x00000400, - 0xbf0a7b7c, 0xbf85ffef, - 0xbf9c0000, 0xb8fb2985, - 0x807b817b, 0x8e7b837b, - 0xb8fa2b05, 0x807a817a, - 0x8e7a827a, 0x80fb7a7b, - 0x867b7b7b, 0xbf84007a, + 0x680404ff, 0x00000200, + 0xd0c9006a, 0x0000f702, + 0xbf87ffd2, 0xbf820015, + 0xd1060002, 0x00011103, + 0x7e0602ff, 0x00000200, + 0xbefc00ff, 0x00010000, + 0xbe800077, 0x8677ff77, + 0xff7fffff, 0x8777ff77, + 0x00058000, 0xd8ec0000, + 0x00000002, 0xbf8cc07f, + 0xe0765000, 0x701d0002, + 0x68040702, 0xd0c9006a, + 0x0000f702, 0xbf87fff7, + 0xbef70000, 0xbef000ff, + 0x00000400, 0xbefe00c1, + 0xbeff00c1, 0xb8fb2b05, + 0x807b817b, 0x8e7b827b, + 0xbef600ff, 0x01000000, + 0xbefc0084, 0xbf0a7b7c, + 0xbf84006d, 0xbf11017c, 0x807bff7b, 0x00001000, - 0xbefc0080, 0xbf11017c, 0x867aff78, 0x00400000, 0xbf850003, 0xb8faf803, 0x897a7aff, 0x10000000, - 0xbf850059, 0xd3d84000, - 0x18000100, 0xd3d84001, - 0x18000101, 0xd3d84002, - 0x18000102, 0xd3d84003, - 0x18000103, 0xbe840080, + 0xbf850051, 0xbe840080, 
0xd2890000, 0x00000900, 0x80048104, 0xd2890001, 0x00000900, 0x80048104, @@ -2140,143 +2076,207 @@ static const uint32_t cwsr_trap_aldebaran_hex[] = { 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, 0xbf84ffee, 0x807c847c, - 0xbf0a7b7c, 0xbf85ffa9, - 0xbf9c0000, 0xbf820016, - 0xd3d84000, 0x18000100, - 0xd3d84001, 0x18000101, - 0xd3d84002, 0x18000102, - 0xd3d84003, 0x18000103, + 0xbf0a7b7c, 0xbf85ffb1, + 0xbf9c0000, 0xbf820012, + 0x7e000300, 0x7e020301, + 0x7e040302, 0x7e060303, 0xe0724000, 0x701d0000, 0xe0724100, 0x701d0100, 0xe0724200, 0x701d0200, 0xe0724300, 0x701d0300, 0x807c847c, 0x8070ff70, 0x00000400, 0xbf0a7b7c, - 0xbf85ffeb, 0xbf9c0000, - 0xbf8200ee, 0xbef4007e, - 0x8675ff7f, 0x0000ffff, - 0x8775ff75, 0x00040000, - 0xbef60080, 0xbef700ff, - 0x00807fac, 0x866eff7f, - 0x04000000, 0xbf84001f, + 0xbf85ffef, 0xbf9c0000, + 0xb8fb2985, 0x807b817b, + 0x8e7b837b, 0xb8fa2b05, + 0x807a817a, 0x8e7a827a, + 0x80fb7a7b, 0x867b7b7b, + 0xbf84007a, 0x807bff7b, + 0x00001000, 0xbefc0080, + 0xbf11017c, 0x867aff78, + 0x00400000, 0xbf850003, + 0xb8faf803, 0x897a7aff, + 0x10000000, 0xbf850059, + 0xd3d84000, 0x18000100, + 0xd3d84001, 0x18000101, + 0xd3d84002, 0x18000102, + 0xd3d84003, 0x18000103, + 0xbe840080, 0xd2890000, + 0x00000900, 0x80048104, + 0xd2890001, 0x00000900, + 0x80048104, 0xd2890002, + 0x00000900, 0x80048104, + 0xd2890003, 0x00000900, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0xbe840080, + 0xd2890000, 0x00000901, + 0x80048104, 0xd2890001, + 0x00000901, 0x80048104, + 0xd2890002, 0x00000901, + 0x80048104, 0xd2890003, + 0x00000901, 0x80048104, + 0xc069003a, 0x00000070, + 0xbf8cc07f, 0x80709070, + 0xbf06c004, 0xbf84ffee, + 0xbe840080, 0xd2890000, + 0x00000902, 0x80048104, + 0xd2890001, 0x00000902, + 0x80048104, 0xd2890002, + 0x00000902, 0x80048104, + 0xd2890003, 0x00000902, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0xbe840080, + 0xd2890000, 0x00000903, + 0x80048104, 0xd2890001, + 0x00000903, 0x80048104, + 0xd2890002, 0x00000903, + 0x80048104, 0xd2890003, + 0x00000903, 0x80048104, + 0xc069003a, 0x00000070, + 0xbf8cc07f, 0x80709070, + 0xbf06c004, 0xbf84ffee, + 0x807c847c, 0xbf0a7b7c, + 0xbf85ffa9, 0xbf9c0000, + 0xbf820016, 0xd3d84000, + 0x18000100, 0xd3d84001, + 0x18000101, 0xd3d84002, + 0x18000102, 0xd3d84003, + 0x18000103, 0xe0724000, + 0x701d0000, 0xe0724100, + 0x701d0100, 0xe0724200, + 0x701d0200, 0xe0724300, + 0x701d0300, 0x807c847c, + 0x8070ff70, 0x00000400, + 0xbf0a7b7c, 0xbf85ffeb, + 0xbf9c0000, 0xbf8200ee, + 0xbef4007e, 0x8675ff7f, + 0x0000ffff, 0x8775ff75, + 0x00040000, 0xbef60080, + 0xbef700ff, 0x00807fac, + 0x866eff7f, 0x04000000, + 0xbf84001f, 0xbefe00c1, + 0xbeff00c1, 0xb8ef4306, + 0x866fc16f, 0xbf84001a, + 0x8e6f866f, 0x8e6f826f, + 0xbef6006f, 0xb8f82985, + 0x80788178, 0x8e788a78, + 0x8e788178, 0xb8ee1605, + 0x806e816e, 0x8e6e866e, + 0x80786e78, 0x8078ff78, + 0x00000080, 0xbef600ff, + 0x01000000, 0xbefc0080, + 0xe0510000, 0x781d0000, + 0xe0510100, 0x781d0000, + 0x807cff7c, 0x00000200, + 0x8078ff78, 0x00000200, + 0xbf0a6f7c, 0xbf85fff6, 0xbefe00c1, 0xbeff00c1, - 0xb8ef4306, 0x866fc16f, - 0xbf84001a, 0x8e6f866f, - 0x8e6f826f, 0xbef6006f, - 0xb8f82985, 0x80788178, - 0x8e788a78, 0x8e788178, - 0xb8ee1605, 0x806e816e, - 0x8e6e866e, 0x80786e78, - 0x8078ff78, 0x00000080, 0xbef600ff, 0x01000000, - 0xbefc0080, 0xe0510000, - 0x781d0000, 0xe0510100, - 0x781d0000, 0x807cff7c, - 0x00000200, 0x8078ff78, - 0x00000200, 0xbf0a6f7c, - 0xbf85fff6, 0xbefe00c1, - 0xbeff00c1, 0xbef600ff, - 0x01000000, 
0xb8ef2b05, - 0x806f816f, 0x8e6f826f, - 0x806fff6f, 0x00008000, - 0xbef80080, 0xbeee0078, - 0x8078ff78, 0x00000400, - 0xbefc0084, 0xbf11087c, - 0xe0524000, 0x781d0000, - 0xe0524100, 0x781d0100, - 0xe0524200, 0x781d0200, - 0xe0524300, 0x781d0300, - 0xbf8c0f70, 0x7e000300, - 0x7e020301, 0x7e040302, - 0x7e060303, 0x807c847c, - 0x8078ff78, 0x00000400, - 0xbf0a6f7c, 0xbf85ffee, - 0xb8ef2985, 0x806f816f, - 0x8e6f836f, 0xb8f92b05, - 0x80798179, 0x8e798279, - 0x80ef796f, 0x866f6f6f, - 0xbf84001a, 0x806fff6f, - 0x00008000, 0xbefc0080, + 0xb8ef2b05, 0x806f816f, + 0x8e6f826f, 0x806fff6f, + 0x00008000, 0xbef80080, + 0xbeee0078, 0x8078ff78, + 0x00000400, 0xbefc0084, 0xbf11087c, 0xe0524000, 0x781d0000, 0xe0524100, 0x781d0100, 0xe0524200, 0x781d0200, 0xe0524300, 0x781d0300, 0xbf8c0f70, - 0xd3d94000, 0x18000100, - 0xd3d94001, 0x18000101, - 0xd3d94002, 0x18000102, - 0xd3d94003, 0x18000103, + 0x7e000300, 0x7e020301, + 0x7e040302, 0x7e060303, 0x807c847c, 0x8078ff78, 0x00000400, 0xbf0a6f7c, - 0xbf85ffea, 0xbf9c0000, - 0xe0524000, 0x6e1d0000, - 0xe0524100, 0x6e1d0100, - 0xe0524200, 0x6e1d0200, - 0xe0524300, 0x6e1d0300, - 0xbf8c0f70, 0xb8f82985, - 0x80788178, 0x8e788a78, - 0x8e788178, 0xb8ee1605, - 0x806e816e, 0x8e6e866e, - 0x80786e78, 0x80f8c078, - 0xb8ef1605, 0x806f816f, - 0x8e6f846f, 0x8e76826f, - 0xbef600ff, 0x01000000, - 0xbefc006f, 0xc031003a, - 0x00000078, 0x80f8c078, - 0xbf8cc07f, 0x80fc907c, - 0xbf800000, 0xbe802d00, - 0xbe822d02, 0xbe842d04, - 0xbe862d06, 0xbe882d08, - 0xbe8a2d0a, 0xbe8c2d0c, - 0xbe8e2d0e, 0xbf06807c, - 0xbf84fff0, 0xb8f82985, - 0x80788178, 0x8e788a78, - 0x8e788178, 0xb8ee1605, - 0x806e816e, 0x8e6e866e, - 0x80786e78, 0xbef60084, - 0xbef600ff, 0x01000000, - 0xc0211bfa, 0x00000078, - 0x80788478, 0xc0211b3a, + 0xbf85ffee, 0xb8ef2985, + 0x806f816f, 0x8e6f836f, + 0xb8f92b05, 0x80798179, + 0x8e798279, 0x80ef796f, + 0x866f6f6f, 0xbf84001a, + 0x806fff6f, 0x00008000, + 0xbefc0080, 0xbf11087c, + 0xe0524000, 0x781d0000, + 0xe0524100, 0x781d0100, + 0xe0524200, 0x781d0200, + 0xe0524300, 0x781d0300, + 0xbf8c0f70, 0xd3d94000, + 0x18000100, 0xd3d94001, + 0x18000101, 0xd3d94002, + 0x18000102, 0xd3d94003, + 0x18000103, 0x807c847c, + 0x8078ff78, 0x00000400, + 0xbf0a6f7c, 0xbf85ffea, + 0xbf9c0000, 0xe0524000, + 0x6e1d0000, 0xe0524100, + 0x6e1d0100, 0xe0524200, + 0x6e1d0200, 0xe0524300, + 0x6e1d0300, 0xbf8c0f70, + 0xb8f82985, 0x80788178, + 0x8e788a78, 0x8e788178, + 0xb8ee1605, 0x806e816e, + 0x8e6e866e, 0x80786e78, + 0x80f8c078, 0xb8ef1605, + 0x806f816f, 0x8e6f846f, + 0x8e76826f, 0xbef600ff, + 0x01000000, 0xbefc006f, + 0xc031003a, 0x00000078, + 0x80f8c078, 0xbf8cc07f, + 0x80fc907c, 0xbf800000, + 0xbe802d00, 0xbe822d02, + 0xbe842d04, 0xbe862d06, + 0xbe882d08, 0xbe8a2d0a, + 0xbe8c2d0c, 0xbe8e2d0e, + 0xbf06807c, 0xbf84fff0, + 0xb8f82985, 0x80788178, + 0x8e788a78, 0x8e788178, + 0xb8ee1605, 0x806e816e, + 0x8e6e866e, 0x80786e78, + 0xbef60084, 0xbef600ff, + 0x01000000, 0xc0211bfa, 0x00000078, 0x80788478, - 0xc0211b7a, 0x00000078, - 0x80788478, 0xc0211c3a, + 0xc0211b3a, 0x00000078, + 0x80788478, 0xc0211b7a, 0x00000078, 0x80788478, - 0xc0211c7a, 0x00000078, - 0x80788478, 0xc0211eba, + 0xc0211c3a, 0x00000078, + 0x80788478, 0xc0211c7a, 0x00000078, 0x80788478, - 0xc0211efa, 0x00000078, - 0x80788478, 0xc0211a3a, + 0xc0211eba, 0x00000078, + 0x80788478, 0xc0211efa, 0x00000078, 0x80788478, - 0xc0211a7a, 0x00000078, - 0x80788478, 0xc0211cfa, + 0xc0211a3a, 0x00000078, + 0x80788478, 0xc0211a7a, 0x00000078, 0x80788478, - 0xbf8cc07f, 0xbefc006f, - 0xbefe0070, 0xbeff0071, - 0x866f7bff, 0x000003ff, - 0xb96f4803, 0x866f7bff, - 
0xfffff800, 0x8f6f8b6f, - 0xb96fa2c3, 0xb973f801, - 0xb8ee2985, 0x806e816e, - 0x8e6e8a6e, 0x8e6e816e, - 0xb8ef1605, 0x806f816f, - 0x8e6f866f, 0x806e6f6e, - 0x806e746e, 0x826f8075, - 0x866fff6f, 0x0000ffff, - 0xc00b1c37, 0x00000050, - 0xc00b1d37, 0x00000060, - 0xc0031e77, 0x00000074, - 0xbf8cc07f, 0x8f6e8b77, - 0x866eff6e, 0x001f8000, - 0xb96ef807, 0x866dff6d, - 0x0000ffff, 0x86fe7e7e, - 0x86ea6a6a, 0x8f6e837a, - 0xb96ee0c2, 0xbf800002, - 0xb97a0002, 0xbf8a0000, - 0xbe801f6c, 0xbf9b0000, + 0xc0211cfa, 0x00000078, + 0x80788478, 0xbf8cc07f, + 0xbefc006f, 0xbefe0070, + 0xbeff0071, 0x866f7bff, + 0x000003ff, 0xb96f4803, + 0x866f7bff, 0xfffff800, + 0x8f6f8b6f, 0xb96fa2c3, + 0xb973f801, 0xb8ee2985, + 0x806e816e, 0x8e6e8a6e, + 0x8e6e816e, 0xb8ef1605, + 0x806f816f, 0x8e6f866f, + 0x806e6f6e, 0x806e746e, + 0x826f8075, 0x866fff6f, + 0x0000ffff, 0xc00b1c37, + 0x00000050, 0xc00b1d37, + 0x00000060, 0xc0031e77, + 0x00000074, 0xbf8cc07f, + 0x8f6e8b77, 0x866eff6e, + 0x001f8000, 0xb96ef807, + 0x866dff6d, 0x0000ffff, + 0x86fe7e7e, 0x86ea6a6a, + 0x8f6e837a, 0xb96ee0c2, + 0xbf800002, 0xb97a0002, + 0xbf8a0000, 0xbe801f6c, + 0xbf9b0000, 0x00000000, }; static const uint32_t cwsr_trap_gfx10_hex[] = { - 0xbf820001, 0xbf820221, + 0xbf820001, 0xbf820220, 0xb0804004, 0xb978f802, 0x8a78ff78, 0x00020006, 0xb97bf803, 0x876eff78, @@ -2302,19 +2302,19 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0xbf0d8f7b, 0xbf840002, 0x887bff7b, 0xffff0000, 0xf4011bbd, 0xfa000010, - 0xbf8cc07f, 0x8f6e976e, + 0xbf8c0000, 0x8f6e976e, 0x8a77ff77, 0x00800000, 0x88776e77, 0xf4051bbd, - 0xfa000000, 0xbf8cc07f, + 0xfa000000, 0xbf8c0000, 0xf4051ebd, 0xfa000008, - 0xbf8cc07f, 0x87ee6e6e, + 0xbf8c0000, 0x87ee6e6e, 0xbf840001, 0xbe80206e, - 0x876eff6d, 0x01ff0000, - 0xbf850005, 0x8878ff78, - 0x00002000, 0x80ec886c, - 0x82ed806d, 0xbf820005, - 0x876eff6d, 0x01000000, - 0xbf850002, 0x806c846c, + 0x876eff6d, 0x00ff0000, + 0xbf850008, 0x876eff6d, + 0x01000000, 0xbf850007, + 0x8878ff78, 0x00002000, + 0x80ec886c, 0x82ed806d, + 0xbf820002, 0x806c846c, 0x826d806d, 0x876dff6d, 0x0000ffff, 0x87fe7e7e, 0x87ea6a6a, 0xb9f8f802, @@ -2322,7 +2322,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0x0000ffff, 0xbefa0380, 0xb9fa0283, 0xbeee037e, 0xbeef037f, 0xbefe0480, - 0xbf900004, 0xbf8cc07f, + 0xbf900004, 0xbf8c0000, 0x877aff7f, 0x04000000, 0x8f7a857a, 0x886d7a6d, 0x7e008200, 0xbefa037e, @@ -2475,94 +2475,93 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0xbf850002, 0xbeff0380, 0xbf820001, 0xbeff03c1, 0xb97b4306, 0x877bc17b, - 0xbf840044, 0xbf8a0000, + 0xbf840043, 0xbf8a0000, 0x877aff6d, 0x80000000, - 0xbf840040, 0x8f7b867b, - 0x8f7b827b, 0xbef6037b, - 0xb9703a05, 0x80708170, - 0xbf0d9973, 0xbf850002, - 0x8f708970, 0xbf820001, - 0x8f708a70, 0xb97a1e06, - 0x8f7a8a7a, 0x80707a70, - 0x8070ff70, 0x00000200, - 0x8070ff70, 0x00000080, - 0xbef603ff, 0x01000000, - 0xd7650000, 0x000100c1, - 0xd7660000, 0x000200c1, - 0x16000084, 0x907c9973, - 0x877c817c, 0xbf06817c, - 0xbefc0380, 0xbf850012, - 0xbe8303ff, 0x00000080, + 0xbf84003f, 0x8f7b887b, + 0xbef6037b, 0xb9703a05, + 0x80708170, 0xbf0d9973, + 0xbf850002, 0x8f708970, + 0xbf820001, 0x8f708a70, + 0xb97a1e06, 0x8f7a8a7a, + 0x80707a70, 0x8070ff70, + 0x00000200, 0x8070ff70, + 0x00000080, 0xbef603ff, + 0x01000000, 0xd7650000, + 0x000100c1, 0xd7660000, + 0x000200c1, 0x16000084, + 0x907c9973, 0x877c817c, + 0xbf06817c, 0xbefc0380, + 0xbf850012, 0xbe8303ff, + 0x00000080, 0xbf800000, 0xbf800000, 0xbf800000, - 0xbf800000, 0xd8d80000, - 0x01000000, 0xbf8c0000, - 0xe0704000, 0x705d0100, - 0x807c037c, 0x80700370, - 
0xd5250000, 0x0001ff00, - 0x00000080, 0xbf0a7b7c, - 0xbf85fff4, 0xbf820011, - 0xbe8303ff, 0x00000100, + 0xd8d80000, 0x01000000, + 0xbf8c0000, 0xe0704000, + 0x705d0100, 0x807c037c, + 0x80700370, 0xd5250000, + 0x0001ff00, 0x00000080, + 0xbf0a7b7c, 0xbf85fff4, + 0xbf820011, 0xbe8303ff, + 0x00000100, 0xbf800000, 0xbf800000, 0xbf800000, - 0xbf800000, 0xd8d80000, - 0x01000000, 0xbf8c0000, - 0xe0704000, 0x705d0100, - 0x807c037c, 0x80700370, - 0xd5250000, 0x0001ff00, - 0x00000100, 0xbf0a7b7c, - 0xbf85fff4, 0xbefe03c1, - 0x907c9973, 0x877c817c, - 0xbf06817c, 0xbf850004, - 0xbef003ff, 0x00000200, - 0xbeff0380, 0xbf820003, - 0xbef003ff, 0x00000400, - 0xbeff03c1, 0xb97b3a05, - 0x807b817b, 0x8f7b827b, - 0x907c9973, 0x877c817c, - 0xbf06817c, 0xbf850017, + 0xd8d80000, 0x01000000, + 0xbf8c0000, 0xe0704000, + 0x705d0100, 0x807c037c, + 0x80700370, 0xd5250000, + 0x0001ff00, 0x00000100, + 0xbf0a7b7c, 0xbf85fff4, + 0xbefe03c1, 0x907c9973, + 0x877c817c, 0xbf06817c, + 0xbf850004, 0xbef003ff, + 0x00000200, 0xbeff0380, + 0xbf820003, 0xbef003ff, + 0x00000400, 0xbeff03c1, + 0xb97b3a05, 0x807b817b, + 0x8f7b827b, 0x907c9973, + 0x877c817c, 0xbf06817c, + 0xbf850017, 0xbef603ff, + 0x01000000, 0xbefc0384, + 0xbf0a7b7c, 0xbf840037, + 0x7e008700, 0x7e028701, + 0x7e048702, 0x7e068703, + 0xe0704000, 0x705d0000, + 0xe0704080, 0x705d0100, + 0xe0704100, 0x705d0200, + 0xe0704180, 0x705d0300, + 0x807c847c, 0x8070ff70, + 0x00000200, 0xbf0a7b7c, + 0xbf85ffef, 0xbf820025, 0xbef603ff, 0x01000000, 0xbefc0384, 0xbf0a7b7c, - 0xbf840037, 0x7e008700, + 0xbf840011, 0x7e008700, 0x7e028701, 0x7e048702, 0x7e068703, 0xe0704000, - 0x705d0000, 0xe0704080, - 0x705d0100, 0xe0704100, - 0x705d0200, 0xe0704180, + 0x705d0000, 0xe0704100, + 0x705d0100, 0xe0704200, + 0x705d0200, 0xe0704300, 0x705d0300, 0x807c847c, - 0x8070ff70, 0x00000200, + 0x8070ff70, 0x00000400, 0xbf0a7b7c, 0xbf85ffef, - 0xbf820025, 0xbef603ff, - 0x01000000, 0xbefc0384, - 0xbf0a7b7c, 0xbf840011, - 0x7e008700, 0x7e028701, - 0x7e048702, 0x7e068703, + 0xb97b1e06, 0x877bc17b, + 0xbf84000c, 0x8f7b837b, + 0x807b7c7b, 0xbefe03c1, + 0xbeff0380, 0x7e008700, 0xe0704000, 0x705d0000, - 0xe0704100, 0x705d0100, - 0xe0704200, 0x705d0200, - 0xe0704300, 0x705d0300, - 0x807c847c, 0x8070ff70, - 0x00000400, 0xbf0a7b7c, - 0xbf85ffef, 0xb97b1e06, - 0x877bc17b, 0xbf84000c, - 0x8f7b837b, 0x807b7c7b, - 0xbefe03c1, 0xbeff0380, - 0x7e008700, 0xe0704000, - 0x705d0000, 0x807c817c, - 0x8070ff70, 0x00000080, - 0xbf0a7b7c, 0xbf85fff8, - 0xbf82013b, 0xbef4037e, - 0x8775ff7f, 0x0000ffff, - 0x8875ff75, 0x00040000, - 0xbef60380, 0xbef703ff, - 0x10807fac, 0xb97202dc, - 0x8f729972, 0x876eff7f, - 0x04000000, 0xbf840034, - 0xbefe03c1, 0x907c9972, - 0x877c817c, 0xbf06817c, - 0xbf850002, 0xbeff0380, - 0xbf820001, 0xbeff03c1, - 0xb96f4306, 0x876fc16f, - 0xbf840029, 0x8f6f866f, - 0x8f6f826f, 0xbef6036f, + 0x807c817c, 0x8070ff70, + 0x00000080, 0xbf0a7b7c, + 0xbf85fff8, 0xbf820136, + 0xbef4037e, 0x8775ff7f, + 0x0000ffff, 0x8875ff75, + 0x00040000, 0xbef60380, + 0xbef703ff, 0x10807fac, + 0xb97202dc, 0x8f729972, + 0x876eff7f, 0x04000000, + 0xbf840033, 0xbefe03c1, + 0x907c9972, 0x877c817c, + 0xbf06817c, 0xbf850002, + 0xbeff0380, 0xbf820001, + 0xbeff03c1, 0xb96f4306, + 0x876fc16f, 0xbf840028, + 0x8f6f886f, 0xbef6036f, 0xb9783a05, 0x80788178, 0xbf0d9972, 0xbf850002, 0x8f788978, 0xbf820001, @@ -2598,7 +2597,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0xe0304080, 0x785d0100, 0xe0304100, 0x785d0200, 0xe0304180, 0x785d0300, - 0xbf8c3f70, 0x7e008500, + 0xbf8c0000, 0x7e008500, 0x7e028501, 0x7e048502, 0x7e068503, 0x807c847c, 
0x8078ff78, 0x00000200, @@ -2607,7 +2606,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0xe0304080, 0x6e5d0100, 0xe0304100, 0x6e5d0200, 0xe0304180, 0x6e5d0300, - 0xbf8c3f70, 0xbf820034, + 0xbf8c0000, 0xbf820034, 0xbef603ff, 0x01000000, 0xbeee0378, 0x8078ff78, 0x00000400, 0xbefc0384, @@ -2616,7 +2615,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0xe0304100, 0x785d0100, 0xe0304200, 0x785d0200, 0xe0304300, 0x785d0300, - 0xbf8c3f70, 0x7e008500, + 0xbf8c0000, 0x7e008500, 0x7e028501, 0x7e048502, 0x7e068503, 0x807c847c, 0x8078ff78, 0x00000400, @@ -2625,7 +2624,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0xbf84000e, 0x8f6f836f, 0x806f7c6f, 0xbefe03c1, 0xbeff0380, 0xe0304000, - 0x785d0000, 0xbf8c3f70, + 0x785d0000, 0xbf8c0000, 0x7e008500, 0x807c817c, 0x8078ff78, 0x00000080, 0xbf0a6f7c, 0xbf85fff7, @@ -2633,7 +2632,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0x6e5d0000, 0xe0304100, 0x6e5d0100, 0xe0304200, 0x6e5d0200, 0xe0304300, - 0x6e5d0300, 0xbf8c3f70, + 0x6e5d0300, 0xbf8c0000, 0xb9783a05, 0x80788178, 0xbf0d9972, 0xbf850002, 0x8f788978, 0xbf820001, @@ -2644,16 +2643,16 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0xbef603ff, 0x01000000, 0xbefc03ff, 0x0000006c, 0x80f89078, 0xf429003a, - 0xf0000000, 0xbf8cc07f, + 0xf0000000, 0xbf8c0000, 0x80fc847c, 0xbf800000, 0xbe803100, 0xbe823102, 0x80f8a078, 0xf42d003a, - 0xf0000000, 0xbf8cc07f, + 0xf0000000, 0xbf8c0000, 0x80fc887c, 0xbf800000, 0xbe803100, 0xbe823102, 0xbe843104, 0xbe863106, 0x80f8c078, 0xf431003a, - 0xf0000000, 0xbf8cc07f, + 0xf0000000, 0xbf8c0000, 0x80fc907c, 0xbf800000, 0xbe803100, 0xbe823102, 0xbe843104, 0xbe863106, @@ -2683,15 +2682,13 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0x80788478, 0xf4211cfa, 0xf0000000, 0x80788478, 0xf4211bba, 0xf0000000, - 0x80788478, 0xbf8cc07f, + 0x80788478, 0xbf8c0000, 0xb9eef814, 0xf4211bba, 0xf0000000, 0x80788478, - 0xbf8cc07f, 0xb9eef815, + 0xbf8c0000, 0xb9eef815, 0xbefc036f, 0xbefe0370, - 0xbeff0371, 0x876f7bff, - 0x000003ff, 0xb9ef4803, - 0x876f7bff, 0xfffff800, - 0x906f8b6f, 0xb9efa2c3, + 0xbeff0371, 0xb9fb4803, + 0x907b8b7b, 0xb9fba2c3, 0xb9f3f801, 0xb96e3a05, 0x806e816e, 0xbf0d9972, 0xbf850002, 0x8f6e896e, @@ -2703,7 +2700,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0x0000ffff, 0xf4091c37, 0xfa000050, 0xf4091d37, 0xfa000060, 0xf4011e77, - 0xfa000074, 0xbf8cc07f, + 0xfa000074, 0xbf8c0000, 0x876dff6d, 0x0000ffff, 0x87fe7e7e, 0x87ea6a6a, 0xb9faf802, 0xbe80226c, @@ -2713,7 +2710,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { }; static const uint32_t cwsr_trap_gfx11_hex[] = { - 0xbfa00001, 0xbfa00225, + 0xbfa00001, 0xbfa00227, 0xb0804006, 0xb8f8f802, 0x9178ff78, 0x00020006, 0xb8fbf803, 0xbf0d9e6d, @@ -2737,187 +2734,188 @@ static const uint32_t cwsr_trap_gfx11_hex[] = { 0x8b6eff6e, 0x00000800, 0xbfa20003, 0x8b6eff7b, 0x00000400, 0xbfa2002a, - 0xbefa4d82, 0xbf89fc07, + 0xbefa4d82, 0xbf890000, 0x84fa887a, 0xbf0d8f7b, 0xbfa10002, 0x8c7bff7b, 0xffff0000, 0xf4005bbd, - 0xf8000010, 0xbf89fc07, + 0xf8000010, 0xbf890000, 0x846e976e, 0x9177ff77, 0x00800000, 0x8c776e77, 0xf4045bbd, 0xf8000000, - 0xbf89fc07, 0xf4045ebd, - 0xf8000008, 0xbf89fc07, + 0xbf890000, 0xf4045ebd, + 0xf8000008, 0xbf890000, 0x8bee6e6e, 0xbfa10001, 0xbe80486e, 0x8b6eff6d, - 0x01ff0000, 0xbfa20005, - 0x8c78ff78, 0x00002000, - 0x80ec886c, 0x82ed806d, - 0xbfa00005, 0x8b6eff6d, - 0x01000000, 0xbfa20002, + 0x00ff0000, 0xbfa20008, + 0x8b6eff6d, 0x01000000, + 0xbfa20007, 0x8c78ff78, + 0x00002000, 0x80ec886c, + 0x82ed806d, 0xbfa00002, 0x806c846c, 0x826d806d, 0x8b6dff6d, 0x0000ffff, 0x8bfe7e7e, 
0x8bea6a6a, 0xb978f802, 0xbe804a6c, - 0x8b6dff6d, 0x0000ffff, - 0xbefa0080, 0xb97a0283, - 0xbeee007e, 0xbeef007f, - 0xbefe0180, 0xbefe4d84, - 0xbf89fc07, 0x8b7aff7f, - 0x04000000, 0x847a857a, - 0x8c6d7a6d, 0xbefa007e, - 0x8b7bff7f, 0x0000ffff, - 0xbefe00c1, 0xbeff00c1, - 0xdca6c000, 0x007a0000, - 0x7e000280, 0xbefe007a, - 0xbeff007b, 0xb8fb02dc, - 0x847b997b, 0xb8fa3b05, - 0x807a817a, 0xbf0d997b, - 0xbfa20002, 0x847a897a, - 0xbfa00001, 0x847a8a7a, - 0xb8fb1e06, 0x847b8a7b, - 0x807a7b7a, 0x8b7bff7f, - 0x0000ffff, 0x807aff7a, - 0x00000200, 0x807a7e7a, - 0x827b807b, 0xd7610000, - 0x00010870, 0xd7610000, - 0x00010a71, 0xd7610000, - 0x00010c72, 0xd7610000, - 0x00010e73, 0xd7610000, - 0x00011074, 0xd7610000, - 0x00011275, 0xd7610000, - 0x00011476, 0xd7610000, - 0x00011677, 0xd7610000, - 0x00011a79, 0xd7610000, - 0x00011c7e, 0xd7610000, - 0x00011e7f, 0xbefe00ff, - 0x00003fff, 0xbeff0080, - 0xdca6c040, 0x007a0000, - 0xd760007a, 0x00011d00, - 0xd760007b, 0x00011f00, + 0xbf0d9878, 0xbfa10001, + 0xbfb00000, 0x8b6dff6d, + 0x0000ffff, 0xbefa0080, + 0xb97a0283, 0xbeee007e, + 0xbeef007f, 0xbefe0180, + 0xbefe4d84, 0xbf890000, + 0x8b7aff7f, 0x04000000, + 0x847a857a, 0x8c6d7a6d, + 0xbefa007e, 0x8b7bff7f, + 0x0000ffff, 0xbefe00c1, + 0xbeff00c1, 0xdca6c000, + 0x007a0000, 0x7e000280, 0xbefe007a, 0xbeff007b, - 0xbef4007e, 0x8b75ff7f, - 0x0000ffff, 0x8c75ff75, - 0x00040000, 0xbef60080, - 0xbef700ff, 0x10807fac, - 0xbef1007d, 0xbef00080, - 0xb8f302dc, 0x84739973, - 0xbefe00c1, 0x857d9973, - 0x8b7d817d, 0xbf06817d, - 0xbfa20002, 0xbeff0080, - 0xbfa00002, 0xbeff00c1, - 0xbfa00009, 0xbef600ff, - 0x01000000, 0xe0685080, - 0x701d0100, 0xe0685100, - 0x701d0200, 0xe0685180, - 0x701d0300, 0xbfa00008, + 0xb8fb02dc, 0x847b997b, + 0xb8fa3b05, 0x807a817a, + 0xbf0d997b, 0xbfa20002, + 0x847a897a, 0xbfa00001, + 0x847a8a7a, 0xb8fb1e06, + 0x847b8a7b, 0x807a7b7a, + 0x8b7bff7f, 0x0000ffff, + 0x807aff7a, 0x00000200, + 0x807a7e7a, 0x827b807b, + 0xd7610000, 0x00010870, + 0xd7610000, 0x00010a71, + 0xd7610000, 0x00010c72, + 0xd7610000, 0x00010e73, + 0xd7610000, 0x00011074, + 0xd7610000, 0x00011275, + 0xd7610000, 0x00011476, + 0xd7610000, 0x00011677, + 0xd7610000, 0x00011a79, + 0xd7610000, 0x00011c7e, + 0xd7610000, 0x00011e7f, + 0xbefe00ff, 0x00003fff, + 0xbeff0080, 0xdca6c040, + 0x007a0000, 0xd760007a, + 0x00011d00, 0xd760007b, + 0x00011f00, 0xbefe007a, + 0xbeff007b, 0xbef4007e, + 0x8b75ff7f, 0x0000ffff, + 0x8c75ff75, 0x00040000, + 0xbef60080, 0xbef700ff, + 0x10807fac, 0xbef1007d, + 0xbef00080, 0xb8f302dc, + 0x84739973, 0xbefe00c1, + 0x857d9973, 0x8b7d817d, + 0xbf06817d, 0xbfa20002, + 0xbeff0080, 0xbfa00002, + 0xbeff00c1, 0xbfa00009, 0xbef600ff, 0x01000000, - 0xe0685100, 0x701d0100, - 0xe0685200, 0x701d0200, - 0xe0685300, 0x701d0300, + 0xe0685080, 0x701d0100, + 0xe0685100, 0x701d0200, + 0xe0685180, 0x701d0300, + 0xbfa00008, 0xbef600ff, + 0x01000000, 0xe0685100, + 0x701d0100, 0xe0685200, + 0x701d0200, 0xe0685300, + 0x701d0300, 0xb8f03b05, + 0x80708170, 0xbf0d9973, + 0xbfa20002, 0x84708970, + 0xbfa00001, 0x84708a70, + 0xb8fa1e06, 0x847a8a7a, + 0x80707a70, 0x8070ff70, + 0x00000200, 0xbef600ff, + 0x01000000, 0x7e000280, + 0x7e020280, 0x7e040280, + 0xbefd0080, 0xd7610002, + 0x0000fa71, 0x807d817d, + 0xd7610002, 0x0000fa6c, + 0x807d817d, 0x917aff6d, + 0x80000000, 0xd7610002, + 0x0000fa7a, 0x807d817d, + 0xd7610002, 0x0000fa6e, + 0x807d817d, 0xd7610002, + 0x0000fa6f, 0x807d817d, + 0xd7610002, 0x0000fa78, + 0x807d817d, 0xb8faf803, + 0xd7610002, 0x0000fa7a, + 0x807d817d, 0xd7610002, + 0x0000fa7b, 0x807d817d, + 0xb8f1f801, 0xd7610002, + 
0x0000fa71, 0x807d817d, + 0xb8f1f814, 0xd7610002, + 0x0000fa71, 0x807d817d, + 0xb8f1f815, 0xd7610002, + 0x0000fa71, 0x807d817d, + 0xbefe00ff, 0x0000ffff, + 0xbeff0080, 0xe0685000, + 0x701d0200, 0xbefe00c1, 0xb8f03b05, 0x80708170, 0xbf0d9973, 0xbfa20002, 0x84708970, 0xbfa00001, 0x84708a70, 0xb8fa1e06, 0x847a8a7a, 0x80707a70, - 0x8070ff70, 0x00000200, 0xbef600ff, 0x01000000, - 0x7e000280, 0x7e020280, - 0x7e040280, 0xbefd0080, - 0xd7610002, 0x0000fa71, - 0x807d817d, 0xd7610002, - 0x0000fa6c, 0x807d817d, - 0x917aff6d, 0x80000000, - 0xd7610002, 0x0000fa7a, - 0x807d817d, 0xd7610002, - 0x0000fa6e, 0x807d817d, - 0xd7610002, 0x0000fa6f, - 0x807d817d, 0xd7610002, - 0x0000fa78, 0x807d817d, - 0xb8faf803, 0xd7610002, - 0x0000fa7a, 0x807d817d, - 0xd7610002, 0x0000fa7b, - 0x807d817d, 0xb8f1f801, - 0xd7610002, 0x0000fa71, - 0x807d817d, 0xb8f1f814, - 0xd7610002, 0x0000fa71, - 0x807d817d, 0xb8f1f815, - 0xd7610002, 0x0000fa71, - 0x807d817d, 0xbefe00ff, - 0x0000ffff, 0xbeff0080, - 0xe0685000, 0x701d0200, - 0xbefe00c1, 0xb8f03b05, - 0x80708170, 0xbf0d9973, - 0xbfa20002, 0x84708970, - 0xbfa00001, 0x84708a70, - 0xb8fa1e06, 0x847a8a7a, - 0x80707a70, 0xbef600ff, - 0x01000000, 0xbef90080, - 0xbefd0080, 0xbf800000, - 0xbe804100, 0xbe824102, - 0xbe844104, 0xbe864106, - 0xbe884108, 0xbe8a410a, - 0xbe8c410c, 0xbe8e410e, - 0xd7610002, 0x0000f200, + 0xbef90080, 0xbefd0080, + 0xbf800000, 0xbe804100, + 0xbe824102, 0xbe844104, + 0xbe864106, 0xbe884108, + 0xbe8a410a, 0xbe8c410c, + 0xbe8e410e, 0xd7610002, + 0x0000f200, 0x80798179, + 0xd7610002, 0x0000f201, 0x80798179, 0xd7610002, - 0x0000f201, 0x80798179, - 0xd7610002, 0x0000f202, + 0x0000f202, 0x80798179, + 0xd7610002, 0x0000f203, 0x80798179, 0xd7610002, - 0x0000f203, 0x80798179, - 0xd7610002, 0x0000f204, + 0x0000f204, 0x80798179, + 0xd7610002, 0x0000f205, 0x80798179, 0xd7610002, - 0x0000f205, 0x80798179, - 0xd7610002, 0x0000f206, + 0x0000f206, 0x80798179, + 0xd7610002, 0x0000f207, 0x80798179, 0xd7610002, - 0x0000f207, 0x80798179, - 0xd7610002, 0x0000f208, + 0x0000f208, 0x80798179, + 0xd7610002, 0x0000f209, 0x80798179, 0xd7610002, - 0x0000f209, 0x80798179, - 0xd7610002, 0x0000f20a, + 0x0000f20a, 0x80798179, + 0xd7610002, 0x0000f20b, 0x80798179, 0xd7610002, - 0x0000f20b, 0x80798179, - 0xd7610002, 0x0000f20c, + 0x0000f20c, 0x80798179, + 0xd7610002, 0x0000f20d, 0x80798179, 0xd7610002, - 0x0000f20d, 0x80798179, - 0xd7610002, 0x0000f20e, + 0x0000f20e, 0x80798179, + 0xd7610002, 0x0000f20f, + 0x80798179, 0xbf06a079, + 0xbfa10006, 0xe0685000, + 0x701d0200, 0x8070ff70, + 0x00000080, 0xbef90080, + 0x7e040280, 0x807d907d, + 0xbf0aff7d, 0x00000060, + 0xbfa2ffbc, 0xbe804100, + 0xbe824102, 0xbe844104, + 0xbe864106, 0xbe884108, + 0xbe8a410a, 0xd7610002, + 0x0000f200, 0x80798179, + 0xd7610002, 0x0000f201, 0x80798179, 0xd7610002, - 0x0000f20f, 0x80798179, - 0xbf06a079, 0xbfa10006, - 0xe0685000, 0x701d0200, - 0x8070ff70, 0x00000080, - 0xbef90080, 0x7e040280, - 0x807d907d, 0xbf0aff7d, - 0x00000060, 0xbfa2ffbc, - 0xbe804100, 0xbe824102, - 0xbe844104, 0xbe864106, - 0xbe884108, 0xbe8a410a, - 0xd7610002, 0x0000f200, + 0x0000f202, 0x80798179, + 0xd7610002, 0x0000f203, 0x80798179, 0xd7610002, - 0x0000f201, 0x80798179, - 0xd7610002, 0x0000f202, + 0x0000f204, 0x80798179, + 0xd7610002, 0x0000f205, 0x80798179, 0xd7610002, - 0x0000f203, 0x80798179, - 0xd7610002, 0x0000f204, + 0x0000f206, 0x80798179, + 0xd7610002, 0x0000f207, 0x80798179, 0xd7610002, - 0x0000f205, 0x80798179, - 0xd7610002, 0x0000f206, + 0x0000f208, 0x80798179, + 0xd7610002, 0x0000f209, 0x80798179, 0xd7610002, - 0x0000f207, 0x80798179, - 
0xd7610002, 0x0000f208, - 0x80798179, 0xd7610002, - 0x0000f209, 0x80798179, - 0xd7610002, 0x0000f20a, - 0x80798179, 0xd7610002, - 0x0000f20b, 0x80798179, - 0xe0685000, 0x701d0200, - 0xbefe00c1, 0x857d9973, - 0x8b7d817d, 0xbf06817d, - 0xbfa20002, 0xbeff0080, - 0xbfa00001, 0xbeff00c1, - 0xb8fb4306, 0x8b7bc17b, - 0xbfa10044, 0xbfbd0000, - 0x8b7aff6d, 0x80000000, - 0xbfa10040, 0x847b867b, - 0x847b827b, 0xbef6007b, + 0x0000f20a, 0x80798179, + 0xd7610002, 0x0000f20b, + 0x80798179, 0xe0685000, + 0x701d0200, 0xbefe00c1, + 0x857d9973, 0x8b7d817d, + 0xbf06817d, 0xbfa20002, + 0xbeff0080, 0xbfa00001, + 0xbeff00c1, 0xb8fb4306, + 0x8b7bc17b, 0xbfa10043, + 0xbfbd0000, 0x8b7aff6d, + 0x80000000, 0xbfa1003f, + 0x847b887b, 0xbef6007b, 0xb8f03b05, 0x80708170, 0xbf0d9973, 0xbfa20002, 0x84708970, 0xbfa00001, @@ -2988,177 +2986,175 @@ static const uint32_t cwsr_trap_gfx11_hex[] = { 0x701d0000, 0x807d817d, 0x8070ff70, 0x00000080, 0xbf0a7b7d, 0xbfa2fff8, - 0xbfa00146, 0xbef4007e, + 0xbfa00143, 0xbef4007e, 0x8b75ff7f, 0x0000ffff, 0x8c75ff75, 0x00040000, 0xbef60080, 0xbef700ff, 0x10807fac, 0xb8f202dc, 0x84729972, 0x8b6eff7f, - 0x04000000, 0xbfa1003a, + 0x04000000, 0xbfa10039, 0xbefe00c1, 0x857d9972, 0x8b7d817d, 0xbf06817d, 0xbfa20002, 0xbeff0080, 0xbfa00001, 0xbeff00c1, 0xb8ef4306, 0x8b6fc16f, - 0xbfa1002f, 0x846f866f, - 0x846f826f, 0xbef6006f, - 0xb8f83b05, 0x80788178, - 0xbf0d9972, 0xbfa20002, - 0x84788978, 0xbfa00001, - 0x84788a78, 0xb8ee1e06, - 0x846e8a6e, 0x80786e78, + 0xbfa1002e, 0x846f886f, + 0xbef6006f, 0xb8f83b05, + 0x80788178, 0xbf0d9972, + 0xbfa20002, 0x84788978, + 0xbfa00001, 0x84788a78, + 0xb8ee1e06, 0x846e8a6e, + 0x80786e78, 0x8078ff78, + 0x00000200, 0x8078ff78, + 0x00000080, 0xbef600ff, + 0x01000000, 0x857d9972, + 0x8b7d817d, 0xbf06817d, + 0xbefd0080, 0xbfa2000c, + 0xe0500000, 0x781d0000, + 0xbf890000, 0xdac00000, + 0x00000000, 0x807dff7d, + 0x00000080, 0x8078ff78, + 0x00000080, 0xbf0a6f7d, + 0xbfa2fff5, 0xbfa0000b, + 0xe0500000, 0x781d0000, + 0xbf890000, 0xdac00000, + 0x00000000, 0x807dff7d, + 0x00000100, 0x8078ff78, + 0x00000100, 0xbf0a6f7d, + 0xbfa2fff5, 0xbef80080, + 0xbefe00c1, 0x857d9972, + 0x8b7d817d, 0xbf06817d, + 0xbfa20002, 0xbeff0080, + 0xbfa00001, 0xbeff00c1, + 0xb8ef3b05, 0x806f816f, + 0x846f826f, 0x857d9972, + 0x8b7d817d, 0xbf06817d, + 0xbfa20024, 0xbef600ff, + 0x01000000, 0xbeee0078, 0x8078ff78, 0x00000200, - 0x8078ff78, 0x00000080, - 0xbef600ff, 0x01000000, - 0x857d9972, 0x8b7d817d, - 0xbf06817d, 0xbefd0080, - 0xbfa2000c, 0xe0500000, - 0x781d0000, 0xbf8903f7, - 0xdac00000, 0x00000000, - 0x807dff7d, 0x00000080, - 0x8078ff78, 0x00000080, - 0xbf0a6f7d, 0xbfa2fff5, - 0xbfa0000b, 0xe0500000, - 0x781d0000, 0xbf8903f7, - 0xdac00000, 0x00000000, - 0x807dff7d, 0x00000100, - 0x8078ff78, 0x00000100, - 0xbf0a6f7d, 0xbfa2fff5, - 0xbef80080, 0xbefe00c1, - 0x857d9972, 0x8b7d817d, - 0xbf06817d, 0xbfa20002, - 0xbeff0080, 0xbfa00001, - 0xbeff00c1, 0xb8ef3b05, - 0x806f816f, 0x846f826f, - 0x857d9972, 0x8b7d817d, - 0xbf06817d, 0xbfa20024, - 0xbef600ff, 0x01000000, - 0xbeee0078, 0x8078ff78, - 0x00000200, 0xbefd0084, - 0xbf0a6f7d, 0xbfa10050, + 0xbefd0084, 0xbf0a6f7d, + 0xbfa10050, 0xe0505000, + 0x781d0000, 0xe0505080, + 0x781d0100, 0xe0505100, + 0x781d0200, 0xe0505180, + 0x781d0300, 0xbf890000, + 0x7e008500, 0x7e028501, + 0x7e048502, 0x7e068503, + 0x807d847d, 0x8078ff78, + 0x00000200, 0xbf0a6f7d, + 0xbfa2ffee, 0xe0505000, + 0x6e1d0000, 0xe0505080, + 0x6e1d0100, 0xe0505100, + 0x6e1d0200, 0xe0505180, + 0x6e1d0300, 0xbf890000, + 0xbfa00034, 0xbef600ff, + 0x01000000, 0xbeee0078, + 0x8078ff78, 0x00000400, + 
0xbefd0084, 0xbf0a6f7d, + 0xbfa10012, 0xe0505000, + 0x781d0000, 0xe0505100, + 0x781d0100, 0xe0505200, + 0x781d0200, 0xe0505300, + 0x781d0300, 0xbf890000, + 0x7e008500, 0x7e028501, + 0x7e048502, 0x7e068503, + 0x807d847d, 0x8078ff78, + 0x00000400, 0xbf0a6f7d, + 0xbfa2ffee, 0xb8ef1e06, + 0x8b6fc16f, 0xbfa1000e, + 0x846f836f, 0x806f7d6f, + 0xbefe00c1, 0xbeff0080, 0xe0505000, 0x781d0000, - 0xe0505080, 0x781d0100, - 0xe0505100, 0x781d0200, - 0xe0505180, 0x781d0300, - 0xbf8903f7, 0x7e008500, - 0x7e028501, 0x7e048502, - 0x7e068503, 0x807d847d, - 0x8078ff78, 0x00000200, - 0xbf0a6f7d, 0xbfa2ffee, + 0xbf890000, 0x7e008500, + 0x807d817d, 0x8078ff78, + 0x00000080, 0xbf0a6f7d, + 0xbfa2fff7, 0xbeff00c1, 0xe0505000, 0x6e1d0000, - 0xe0505080, 0x6e1d0100, - 0xe0505100, 0x6e1d0200, - 0xe0505180, 0x6e1d0300, - 0xbf8903f7, 0xbfa00034, - 0xbef600ff, 0x01000000, - 0xbeee0078, 0x8078ff78, - 0x00000400, 0xbefd0084, - 0xbf0a6f7d, 0xbfa10012, - 0xe0505000, 0x781d0000, - 0xe0505100, 0x781d0100, - 0xe0505200, 0x781d0200, - 0xe0505300, 0x781d0300, - 0xbf8903f7, 0x7e008500, - 0x7e028501, 0x7e048502, - 0x7e068503, 0x807d847d, - 0x8078ff78, 0x00000400, - 0xbf0a6f7d, 0xbfa2ffee, - 0xb8ef1e06, 0x8b6fc16f, - 0xbfa1000e, 0x846f836f, - 0x806f7d6f, 0xbefe00c1, - 0xbeff0080, 0xe0505000, - 0x781d0000, 0xbf8903f7, - 0x7e008500, 0x807d817d, - 0x8078ff78, 0x00000080, - 0xbf0a6f7d, 0xbfa2fff7, - 0xbeff00c1, 0xe0505000, - 0x6e1d0000, 0xe0505100, - 0x6e1d0100, 0xe0505200, - 0x6e1d0200, 0xe0505300, - 0x6e1d0300, 0xbf8903f7, + 0xe0505100, 0x6e1d0100, + 0xe0505200, 0x6e1d0200, + 0xe0505300, 0x6e1d0300, + 0xbf890000, 0xb8f83b05, + 0x80788178, 0xbf0d9972, + 0xbfa20002, 0x84788978, + 0xbfa00001, 0x84788a78, + 0xb8ee1e06, 0x846e8a6e, + 0x80786e78, 0x8078ff78, + 0x00000200, 0x80f8ff78, + 0x00000050, 0xbef600ff, + 0x01000000, 0xbefd00ff, + 0x0000006c, 0x80f89078, + 0xf428403a, 0xf0000000, + 0xbf890000, 0x80fd847d, + 0xbf800000, 0xbe804300, + 0xbe824302, 0x80f8a078, + 0xf42c403a, 0xf0000000, + 0xbf890000, 0x80fd887d, + 0xbf800000, 0xbe804300, + 0xbe824302, 0xbe844304, + 0xbe864306, 0x80f8c078, + 0xf430403a, 0xf0000000, + 0xbf890000, 0x80fd907d, + 0xbf800000, 0xbe804300, + 0xbe824302, 0xbe844304, + 0xbe864306, 0xbe884308, + 0xbe8a430a, 0xbe8c430c, + 0xbe8e430e, 0xbf06807d, + 0xbfa1fff0, 0xb980f801, + 0x00000000, 0xbfbd0000, 0xb8f83b05, 0x80788178, 0xbf0d9972, 0xbfa20002, 0x84788978, 0xbfa00001, 0x84788a78, 0xb8ee1e06, 0x846e8a6e, 0x80786e78, 0x8078ff78, 0x00000200, - 0x80f8ff78, 0x00000050, 0xbef600ff, 0x01000000, - 0xbefd00ff, 0x0000006c, - 0x80f89078, 0xf428403a, - 0xf0000000, 0xbf89fc07, - 0x80fd847d, 0xbf800000, - 0xbe804300, 0xbe824302, - 0x80f8a078, 0xf42c403a, - 0xf0000000, 0xbf89fc07, - 0x80fd887d, 0xbf800000, - 0xbe804300, 0xbe824302, - 0xbe844304, 0xbe864306, - 0x80f8c078, 0xf430403a, - 0xf0000000, 0xbf89fc07, - 0x80fd907d, 0xbf800000, - 0xbe804300, 0xbe824302, - 0xbe844304, 0xbe864306, - 0xbe884308, 0xbe8a430a, - 0xbe8c430c, 0xbe8e430e, - 0xbf06807d, 0xbfa1fff0, - 0xb980f801, 0x00000000, - 0xbfbd0000, 0xb8f83b05, - 0x80788178, 0xbf0d9972, - 0xbfa20002, 0x84788978, - 0xbfa00001, 0x84788a78, - 0xb8ee1e06, 0x846e8a6e, - 0x80786e78, 0x8078ff78, - 0x00000200, 0xbef600ff, - 0x01000000, 0xf4205bfa, + 0xf4205bfa, 0xf0000000, + 0x80788478, 0xf4205b3a, 0xf0000000, 0x80788478, - 0xf4205b3a, 0xf0000000, - 0x80788478, 0xf4205b7a, + 0xf4205b7a, 0xf0000000, + 0x80788478, 0xf4205c3a, 0xf0000000, 0x80788478, - 0xf4205c3a, 0xf0000000, - 0x80788478, 0xf4205c7a, + 0xf4205c7a, 0xf0000000, + 0x80788478, 0xf4205eba, 0xf0000000, 0x80788478, - 0xf4205eba, 
0xf0000000, - 0x80788478, 0xf4205efa, + 0xf4205efa, 0xf0000000, + 0x80788478, 0xf4205e7a, 0xf0000000, 0x80788478, - 0xf4205e7a, 0xf0000000, - 0x80788478, 0xf4205cfa, + 0xf4205cfa, 0xf0000000, + 0x80788478, 0xf4205bba, 0xf0000000, 0x80788478, + 0xbf890000, 0xb96ef814, 0xf4205bba, 0xf0000000, - 0x80788478, 0xbf89fc07, - 0xb96ef814, 0xf4205bba, - 0xf0000000, 0x80788478, - 0xbf89fc07, 0xb96ef815, - 0xbefd006f, 0xbefe0070, - 0xbeff0071, 0x8b6f7bff, - 0x000003ff, 0xb96f4803, - 0x8b6f7bff, 0xfffff800, - 0x856f8b6f, 0xb96fa2c3, - 0xb973f801, 0xb8ee3b05, - 0x806e816e, 0xbf0d9972, - 0xbfa20002, 0x846e896e, - 0xbfa00001, 0x846e8a6e, - 0xb8ef1e06, 0x846f8a6f, - 0x806e6f6e, 0x806eff6e, - 0x00000200, 0x806e746e, - 0x826f8075, 0x8b6fff6f, - 0x0000ffff, 0xf4085c37, - 0xf8000050, 0xf4085d37, - 0xf8000060, 0xf4005e77, - 0xf8000074, 0xbf89fc07, - 0x8b6dff6d, 0x0000ffff, - 0x8bfe7e7e, 0x8bea6a6a, - 0xb8eef802, 0xbf0d866e, - 0xbfa20002, 0xb97af802, - 0xbe80486c, 0xb97af802, - 0xbe804a6c, 0xbfb10000, + 0x80788478, 0xbf890000, + 0xb96ef815, 0xbefd006f, + 0xbefe0070, 0xbeff0071, + 0xb97b4803, 0x857b8b7b, + 0xb97b22c3, 0x857b867b, + 0xb97b7443, 0xb973f801, + 0xb8ee3b05, 0x806e816e, + 0xbf0d9972, 0xbfa20002, + 0x846e896e, 0xbfa00001, + 0x846e8a6e, 0xb8ef1e06, + 0x846f8a6f, 0x806e6f6e, + 0x806eff6e, 0x00000200, + 0x806e746e, 0x826f8075, + 0x8b6fff6f, 0x0000ffff, + 0xf4085c37, 0xf8000050, + 0xf4085d37, 0xf8000060, + 0xf4005e77, 0xf8000074, + 0xbf890000, 0x8b6dff6d, + 0x0000ffff, 0x8bfe7e7e, + 0x8bea6a6a, 0xb8eef802, + 0xbf0d866e, 0xbfa20002, + 0xb97af802, 0xbe80486c, + 0xb97af802, 0xbe804a6c, + 0xbfb10000, 0xbf9f0000, 0xbf9f0000, 0xbf9f0000, 0xbf9f0000, 0xbf9f0000, - 0xbf9f0000, 0x00000000, }; static const uint32_t cwsr_trap_gfx9_4_3_hex[] = { - 0xbf820001, 0xbf8202db, + 0xbf820001, 0xbf8202dc, 0xb8f8f802, 0x8978ff78, 0x00020006, 0xb8fbf803, 0x866eff78, 0x00002000, @@ -3273,99 +3269,143 @@ static const uint32_t cwsr_trap_gfx9_4_3_hex[] = { 0xbefe007c, 0xbefc0070, 0xc0611c7a, 0x0000007c, 0xbf8cc07f, 0x80708470, - 0xbefc007e, 0x867aff7f, - 0x04000000, 0xbeef0080, - 0x876f6f7a, 0xb8f02985, - 0x80708170, 0x8e708a70, - 0x8e708170, 0xb8fb1605, - 0x807b817b, 0x8e7b847b, - 0x8e76827b, 0xbef600ff, - 0x01000000, 0xbef20174, - 0x80747074, 0x82758075, - 0xbefc0080, 0xbf800000, - 0xbe802b00, 0xbe822b02, - 0xbe842b04, 0xbe862b06, - 0xbe882b08, 0xbe8a2b0a, - 0xbe8c2b0c, 0xbe8e2b0e, - 0xc06b003a, 0x00000000, - 0xbf8cc07f, 0xc06b013a, - 0x00000010, 0xbf8cc07f, - 0xc06b023a, 0x00000020, - 0xbf8cc07f, 0xc06b033a, - 0x00000030, 0xbf8cc07f, - 0x8074c074, 0x82758075, - 0x807c907c, 0xbf0a7b7c, - 0xbf85ffe7, 0xbef40172, - 0xbef00080, 0xbefe00c1, - 0xbeff00c1, 0xbee80080, - 0xbee90080, 0xbef600ff, - 0x01000000, 0x867aff78, - 0x00400000, 0xbf850003, - 0xb8faf803, 0x897a7aff, - 0x10000000, 0xbf85004d, - 0xbe840080, 0xd2890000, + 0xbefc007e, 0xbf108080, + 0x867aff7f, 0x04000000, + 0xbeef0080, 0x876f6f7a, + 0xb8f02985, 0x80708170, + 0x8e708a70, 0x8e708170, + 0xb8fb1605, 0x807b817b, + 0x8e7b847b, 0x8e76827b, + 0xbef600ff, 0x01000000, + 0xbef20174, 0x80747074, + 0x82758075, 0xbefc0080, + 0xbf800000, 0xbe802b00, + 0xbe822b02, 0xbe842b04, + 0xbe862b06, 0xbe882b08, + 0xbe8a2b0a, 0xbe8c2b0c, + 0xbe8e2b0e, 0xc06b003a, + 0x00000000, 0xbf8cc07f, + 0xc06b013a, 0x00000010, + 0xbf8cc07f, 0xc06b023a, + 0x00000020, 0xbf8cc07f, + 0xc06b033a, 0x00000030, + 0xbf8cc07f, 0x8074c074, + 0x82758075, 0x807c907c, + 0xbf0a7b7c, 0xbf85ffe7, + 0xbef40172, 0xbef00080, + 0xbefe00c1, 0xbeff00c1, + 0xbee80080, 0xbee90080, + 0xbef600ff, 0x01000000, + 0x867aff78, 0x00400000, + 
0xbf850003, 0xb8faf803, + 0x897a7aff, 0x10000000, + 0xbf85004d, 0xbe840080, + 0xd2890000, 0x00000900, + 0x80048104, 0xd2890001, 0x00000900, 0x80048104, - 0xd2890001, 0x00000900, - 0x80048104, 0xd2890002, + 0xd2890002, 0x00000900, + 0x80048104, 0xd2890003, 0x00000900, 0x80048104, - 0xd2890003, 0x00000900, + 0xc069003a, 0x00000070, + 0xbf8cc07f, 0x80709070, + 0xbf06c004, 0xbf84ffee, + 0xbe840080, 0xd2890000, + 0x00000901, 0x80048104, + 0xd2890001, 0x00000901, + 0x80048104, 0xd2890002, + 0x00000901, 0x80048104, + 0xd2890003, 0x00000901, 0x80048104, 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, 0xbf84ffee, 0xbe840080, - 0xd2890000, 0x00000901, + 0xd2890000, 0x00000902, 0x80048104, 0xd2890001, - 0x00000901, 0x80048104, - 0xd2890002, 0x00000901, + 0x00000902, 0x80048104, + 0xd2890002, 0x00000902, 0x80048104, 0xd2890003, - 0x00000901, 0x80048104, + 0x00000902, 0x80048104, 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, 0xbf84ffee, 0xbe840080, 0xd2890000, - 0x00000902, 0x80048104, - 0xd2890001, 0x00000902, + 0x00000903, 0x80048104, + 0xd2890001, 0x00000903, 0x80048104, 0xd2890002, - 0x00000902, 0x80048104, - 0xd2890003, 0x00000902, + 0x00000903, 0x80048104, + 0xd2890003, 0x00000903, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0xbf820008, + 0xe0724000, 0x701d0000, + 0xe0724100, 0x701d0100, + 0xe0724200, 0x701d0200, + 0xe0724300, 0x701d0300, + 0xbefe00c1, 0xbeff00c1, + 0xb8fb4306, 0x867bc17b, + 0xbf840064, 0xbf8a0000, + 0x867aff6f, 0x04000000, + 0xbf840060, 0x8e7b867b, + 0x8e7b827b, 0xbef6007b, + 0xb8f02985, 0x80708170, + 0x8e708a70, 0x8e708170, + 0xb8fa1605, 0x807a817a, + 0x8e7a867a, 0x80707a70, + 0x8070ff70, 0x00000080, + 0xbef600ff, 0x01000000, + 0xbefc0080, 0xd28c0002, + 0x000100c1, 0xd28d0003, + 0x000204c1, 0x867aff78, + 0x00400000, 0xbf850003, + 0xb8faf803, 0x897a7aff, + 0x10000000, 0xbf850030, + 0x24040682, 0xd86e4000, + 0x00000002, 0xbf8cc07f, + 0xbe840080, 0xd2890000, + 0x00000900, 0x80048104, + 0xd2890001, 0x00000900, + 0x80048104, 0xd2890002, + 0x00000900, 0x80048104, + 0xd2890003, 0x00000900, 0x80048104, 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, 0xbf84ffee, 0xbe840080, - 0xd2890000, 0x00000903, + 0xd2890000, 0x00000901, 0x80048104, 0xd2890001, - 0x00000903, 0x80048104, - 0xd2890002, 0x00000903, + 0x00000901, 0x80048104, + 0xd2890002, 0x00000901, 0x80048104, 0xd2890003, - 0x00000903, 0x80048104, + 0x00000901, 0x80048104, 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, 0xbf84ffee, - 0xbf820008, 0xe0724000, - 0x701d0000, 0xe0724100, - 0x701d0100, 0xe0724200, - 0x701d0200, 0xe0724300, - 0x701d0300, 0xbefe00c1, - 0xbeff00c1, 0xb8fb4306, - 0x867bc17b, 0xbf840064, - 0xbf8a0000, 0x867aff6f, - 0x04000000, 0xbf840060, - 0x8e7b867b, 0x8e7b827b, - 0xbef6007b, 0xb8f02985, - 0x80708170, 0x8e708a70, - 0x8e708170, 0xb8fa1605, - 0x807a817a, 0x8e7a867a, - 0x80707a70, 0x8070ff70, - 0x00000080, 0xbef600ff, - 0x01000000, 0xbefc0080, - 0xd28c0002, 0x000100c1, - 0xd28d0003, 0x000204c1, + 0x680404ff, 0x00000200, + 0xd0c9006a, 0x0000f702, + 0xbf87ffd2, 0xbf820015, + 0xd1060002, 0x00011103, + 0x7e0602ff, 0x00000200, + 0xbefc00ff, 0x00010000, + 0xbe800077, 0x8677ff77, + 0xff7fffff, 0x8777ff77, + 0x00058000, 0xd8ec0000, + 0x00000002, 0xbf8cc07f, + 0xe0765000, 0x701d0002, + 0x68040702, 0xd0c9006a, + 0x0000f702, 0xbf87fff7, + 0xbef70000, 0xbef000ff, + 0x00000400, 0xbefe00c1, + 0xbeff00c1, 0xb8fb2b05, + 0x807b817b, 0x8e7b827b, + 0xbef600ff, 0x01000000, + 0xbefc0084, 0xbf0a7b7c, + 0xbf84006d, 0xbf11017c, + 
0x807bff7b, 0x00001000, 0x867aff78, 0x00400000, 0xbf850003, 0xb8faf803, 0x897a7aff, 0x10000000, - 0xbf850030, 0x24040682, - 0xd86e4000, 0x00000002, - 0xbf8cc07f, 0xbe840080, + 0xbf850051, 0xbe840080, 0xd2890000, 0x00000900, 0x80048104, 0xd2890001, 0x00000900, 0x80048104, @@ -3384,31 +3424,51 @@ static const uint32_t cwsr_trap_gfx9_4_3_hex[] = { 0x80048104, 0xc069003a, 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, - 0xbf84ffee, 0x680404ff, - 0x00000200, 0xd0c9006a, - 0x0000f702, 0xbf87ffd2, - 0xbf820015, 0xd1060002, - 0x00011103, 0x7e0602ff, - 0x00000200, 0xbefc00ff, - 0x00010000, 0xbe800077, - 0x8677ff77, 0xff7fffff, - 0x8777ff77, 0x00058000, - 0xd8ec0000, 0x00000002, - 0xbf8cc07f, 0xe0765000, - 0x701d0002, 0x68040702, - 0xd0c9006a, 0x0000f702, - 0xbf87fff7, 0xbef70000, - 0xbef000ff, 0x00000400, - 0xbefe00c1, 0xbeff00c1, - 0xb8fb2b05, 0x807b817b, - 0x8e7b827b, 0xbef600ff, - 0x01000000, 0xbefc0084, - 0xbf0a7b7c, 0xbf84006d, - 0xbf11017c, 0x807bff7b, - 0x00001000, 0x867aff78, + 0xbf84ffee, 0xbe840080, + 0xd2890000, 0x00000902, + 0x80048104, 0xd2890001, + 0x00000902, 0x80048104, + 0xd2890002, 0x00000902, + 0x80048104, 0xd2890003, + 0x00000902, 0x80048104, + 0xc069003a, 0x00000070, + 0xbf8cc07f, 0x80709070, + 0xbf06c004, 0xbf84ffee, + 0xbe840080, 0xd2890000, + 0x00000903, 0x80048104, + 0xd2890001, 0x00000903, + 0x80048104, 0xd2890002, + 0x00000903, 0x80048104, + 0xd2890003, 0x00000903, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0x807c847c, + 0xbf0a7b7c, 0xbf85ffb1, + 0xbf9c0000, 0xbf820012, + 0x7e000300, 0x7e020301, + 0x7e040302, 0x7e060303, + 0xe0724000, 0x701d0000, + 0xe0724100, 0x701d0100, + 0xe0724200, 0x701d0200, + 0xe0724300, 0x701d0300, + 0x807c847c, 0x8070ff70, + 0x00000400, 0xbf0a7b7c, + 0xbf85ffef, 0xbf9c0000, + 0xb8fb2985, 0x807b817b, + 0x8e7b837b, 0xb8fa2b05, + 0x807a817a, 0x8e7a827a, + 0x80fb7a7b, 0x867b7b7b, + 0xbf84007a, 0x807bff7b, + 0x00001000, 0xbefc0080, + 0xbf11017c, 0x867aff78, 0x00400000, 0xbf850003, 0xb8faf803, 0x897a7aff, - 0x10000000, 0xbf850051, + 0x10000000, 0xbf850059, + 0xd3d84000, 0x18000100, + 0xd3d84001, 0x18000101, + 0xd3d84002, 0x18000102, + 0xd3d84003, 0x18000103, 0xbe840080, 0xd2890000, 0x00000900, 0x80048104, 0xd2890001, 0x00000900, @@ -3448,31 +3508,913 @@ static const uint32_t cwsr_trap_gfx9_4_3_hex[] = { 0xbf8cc07f, 0x80709070, 0xbf06c004, 0xbf84ffee, 0x807c847c, 0xbf0a7b7c, - 0xbf85ffb1, 0xbf9c0000, - 0xbf820012, 0x7e000300, - 0x7e020301, 0x7e040302, - 0x7e060303, 0xe0724000, + 0xbf85ffa9, 0xbf9c0000, + 0xbf820016, 0xd3d84000, + 0x18000100, 0xd3d84001, + 0x18000101, 0xd3d84002, + 0x18000102, 0xd3d84003, + 0x18000103, 0xe0724000, 0x701d0000, 0xe0724100, 0x701d0100, 0xe0724200, 0x701d0200, 0xe0724300, 0x701d0300, 0x807c847c, 0x8070ff70, 0x00000400, - 0xbf0a7b7c, 0xbf85ffef, - 0xbf9c0000, 0xb8fb2985, - 0x807b817b, 0x8e7b837b, - 0xb8fa2b05, 0x807a817a, - 0x8e7a827a, 0x80fb7a7b, - 0x867b7b7b, 0xbf84007a, + 0xbf0a7b7c, 0xbf85ffeb, + 0xbf9c0000, 0xbf8200ee, + 0xbef4007e, 0x8675ff7f, + 0x0000ffff, 0x8775ff75, + 0x00040000, 0xbef60080, + 0xbef700ff, 0x00807fac, + 0x866eff7f, 0x04000000, + 0xbf84001f, 0xbefe00c1, + 0xbeff00c1, 0xb8ef4306, + 0x866fc16f, 0xbf84001a, + 0x8e6f866f, 0x8e6f826f, + 0xbef6006f, 0xb8f82985, + 0x80788178, 0x8e788a78, + 0x8e788178, 0xb8ee1605, + 0x806e816e, 0x8e6e866e, + 0x80786e78, 0x8078ff78, + 0x00000080, 0xbef600ff, + 0x01000000, 0xbefc0080, + 0xe0510000, 0x781d0000, + 0xe0510100, 0x781d0000, + 0x807cff7c, 0x00000200, + 0x8078ff78, 0x00000200, + 0xbf0a6f7c, 0xbf85fff6, + 0xbefe00c1, 
0xbeff00c1, + 0xbef600ff, 0x01000000, + 0xb8ef2b05, 0x806f816f, + 0x8e6f826f, 0x806fff6f, + 0x00008000, 0xbef80080, + 0xbeee0078, 0x8078ff78, + 0x00000400, 0xbefc0084, + 0xbf11087c, 0xe0524000, + 0x781d0000, 0xe0524100, + 0x781d0100, 0xe0524200, + 0x781d0200, 0xe0524300, + 0x781d0300, 0xbf8c0f70, + 0x7e000300, 0x7e020301, + 0x7e040302, 0x7e060303, + 0x807c847c, 0x8078ff78, + 0x00000400, 0xbf0a6f7c, + 0xbf85ffee, 0xb8ef2985, + 0x806f816f, 0x8e6f836f, + 0xb8f92b05, 0x80798179, + 0x8e798279, 0x80ef796f, + 0x866f6f6f, 0xbf84001a, + 0x806fff6f, 0x00008000, + 0xbefc0080, 0xbf11087c, + 0xe0524000, 0x781d0000, + 0xe0524100, 0x781d0100, + 0xe0524200, 0x781d0200, + 0xe0524300, 0x781d0300, + 0xbf8c0f70, 0xd3d94000, + 0x18000100, 0xd3d94001, + 0x18000101, 0xd3d94002, + 0x18000102, 0xd3d94003, + 0x18000103, 0x807c847c, + 0x8078ff78, 0x00000400, + 0xbf0a6f7c, 0xbf85ffea, + 0xbf9c0000, 0xe0524000, + 0x6e1d0000, 0xe0524100, + 0x6e1d0100, 0xe0524200, + 0x6e1d0200, 0xe0524300, + 0x6e1d0300, 0xbf8c0f70, + 0xb8f82985, 0x80788178, + 0x8e788a78, 0x8e788178, + 0xb8ee1605, 0x806e816e, + 0x8e6e866e, 0x80786e78, + 0x80f8c078, 0xb8ef1605, + 0x806f816f, 0x8e6f846f, + 0x8e76826f, 0xbef600ff, + 0x01000000, 0xbefc006f, + 0xc031003a, 0x00000078, + 0x80f8c078, 0xbf8cc07f, + 0x80fc907c, 0xbf800000, + 0xbe802d00, 0xbe822d02, + 0xbe842d04, 0xbe862d06, + 0xbe882d08, 0xbe8a2d0a, + 0xbe8c2d0c, 0xbe8e2d0e, + 0xbf06807c, 0xbf84fff0, + 0xb8f82985, 0x80788178, + 0x8e788a78, 0x8e788178, + 0xb8ee1605, 0x806e816e, + 0x8e6e866e, 0x80786e78, + 0xbef60084, 0xbef600ff, + 0x01000000, 0xc0211bfa, + 0x00000078, 0x80788478, + 0xc0211b3a, 0x00000078, + 0x80788478, 0xc0211b7a, + 0x00000078, 0x80788478, + 0xc0211c3a, 0x00000078, + 0x80788478, 0xc0211c7a, + 0x00000078, 0x80788478, + 0xc0211eba, 0x00000078, + 0x80788478, 0xc0211efa, + 0x00000078, 0x80788478, + 0xc0211a3a, 0x00000078, + 0x80788478, 0xc0211a7a, + 0x00000078, 0x80788478, + 0xc0211cfa, 0x00000078, + 0x80788478, 0xbf8cc07f, + 0xbefc006f, 0xbefe0070, + 0xbeff0071, 0x866f7bff, + 0x000003ff, 0xb96f4803, + 0x866f7bff, 0xfffff800, + 0x8f6f8b6f, 0xb96fa2c3, + 0xb973f801, 0xb8ee2985, + 0x806e816e, 0x8e6e8a6e, + 0x8e6e816e, 0xb8ef1605, + 0x806f816f, 0x8e6f866f, + 0x806e6f6e, 0x806e746e, + 0x826f8075, 0x866fff6f, + 0x0000ffff, 0xc00b1c37, + 0x00000050, 0xc00b1d37, + 0x00000060, 0xc0031e77, + 0x00000074, 0xbf8cc07f, + 0x8f6e8b79, 0x866eff6e, + 0x001f8000, 0xb96ef807, + 0x866dff6d, 0x0000ffff, + 0x86fe7e7e, 0x86ea6a6a, + 0x8f6e837a, 0xb96ee0c2, + 0xbf800002, 0xb97a0002, + 0xbf8a0000, 0xbe801f6c, + 0xbf9b0000, 0x00000000, +}; + +static const uint32_t cwsr_trap_gfx12_hex[] = { + 0xbfa00001, 0xbfa002a2, + 0xb0804009, 0xb8f8f804, + 0x9178ff78, 0x00008c00, + 0xb8fbf811, 0x8b6eff78, + 0x00004000, 0xbfa10008, + 0x8b6eff7b, 0x00000080, + 0xbfa20018, 0x8b6ea07b, + 0xbfa20042, 0xbf830010, + 0xb8fbf811, 0xbfa0fffb, + 0x8b6eff7b, 0x00000bd0, + 0xbfa20010, 0xb8eef812, + 0x8b6f8f7b, 0xbfa10002, + 0x8c6eff6e, 0x00000080, + 0xb8eff813, 0x8b6e6e6f, + 0xbfa20008, 0x8b6eff6d, + 0xf0000000, 0xbfa20005, + 0x8b6fff6f, 0x00000200, + 0xbfa20002, 0x8b6ea07b, + 0xbfa2002c, 0xbefa4d82, + 0xbf8a0000, 0x84fa887a, + 0xbf0d8f7b, 0xbfa10002, + 0x8c7bff7b, 0xffff0000, + 0xf4601bbd, 0xf8000010, + 0xbf8a0000, 0x846e976e, + 0x9177ff77, 0x00800000, + 0x8c776e77, 0xf4603bbd, + 0xf8000000, 0xbf8a0000, + 0xf4603ebd, 0xf8000008, + 0xbf8a0000, 0x8bee6e6e, + 0xbfa10001, 0xbe80486e, + 0x8b6eff6d, 0xf0000000, + 0xbfa20009, 0xb8eef811, + 0x8b6eff6e, 0x00000080, + 0xbfa20007, 0x8c78ff78, + 0x00004000, 0x80ec886c, + 0x82ed806d, 0xbfa00002, + 
0x806c846c, 0x826d806d, + 0x8b6dff6d, 0x0000ffff, + 0x8bfe7e7e, 0x8bea6a6a, + 0x85788978, 0xb9783244, + 0xbe804a6c, 0xb8faf802, + 0xbf0d987a, 0xbfa10001, + 0xbfb00000, 0x8b6dff6d, + 0x0000ffff, 0xbefa0080, + 0xb97a0151, 0xbeee007e, + 0xbeef007f, 0xbefe0180, + 0xbefe4d84, 0xbf8a0000, + 0x8b7aff7f, 0x04000000, + 0x847a857a, 0x8c6d7a6d, + 0xbefa007e, 0x8b7bff7f, + 0x0000ffff, 0xbefe00c1, + 0xbeff00c1, 0xee0a407a, + 0x000c0000, 0x00000000, + 0x7e000280, 0xbefe007a, + 0xbeff007b, 0xb8fb0742, + 0x847b997b, 0xb8fa3b05, + 0x807a817a, 0xbf0d997b, + 0xbfa20002, 0x847a897a, + 0xbfa00001, 0x847a8a7a, + 0xb8fb1e06, 0x847b8a7b, + 0x807a7b7a, 0x8b7bff7f, + 0x0000ffff, 0x807aff7a, + 0x00000200, 0x807a7e7a, + 0x827b807b, 0xd7610000, + 0x00010870, 0xd7610000, + 0x00010a71, 0xd7610000, + 0x00010c72, 0xd7610000, + 0x00010e73, 0xd7610000, + 0x00011074, 0xd7610000, + 0x00011275, 0xd7610000, + 0x00011476, 0xd7610000, + 0x00011677, 0xd7610000, + 0x00011a79, 0xd7610000, + 0x00011c7e, 0xd7610000, + 0x00011e7f, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xbefe00ff, + 0x00003fff, 0xbeff0080, + 0xee0a407a, 0x000c0000, + 0x00004000, 0xd760007a, + 0x00011d00, 0xd760007b, + 0x00011f00, 0xbefe007a, + 0xbeff007b, 0xbef4007e, + 0x8b75ff7f, 0x0000ffff, + 0x8c75ff75, 0x00040000, + 0xbef60080, 0xbef700ff, + 0x10807fac, 0xbef1007d, + 0xbef00080, 0xb8f30742, + 0x84739973, 0xbefe00c1, + 0x857d9973, 0x8b7d817d, + 0xbf06817d, 0xbfa20002, + 0xbeff0080, 0xbfa00002, + 0xbeff00c1, 0xbfa0000c, + 0xbef600ff, 0x01000000, + 0xc4068070, 0x008ce801, + 0x00008000, 0xc4068070, + 0x008ce802, 0x00010000, + 0xc4068070, 0x008ce803, + 0x00018000, 0xbfa0000b, + 0xbef600ff, 0x01000000, + 0xc4068070, 0x008ce801, + 0x00010000, 0xc4068070, + 0x008ce802, 0x00020000, + 0xc4068070, 0x008ce803, + 0x00030000, 0xb8f03b05, + 0x80708170, 0xbf0d9973, + 0xbfa20002, 0x84708970, + 0xbfa00001, 0x84708a70, + 0xb8fa1e06, 0x847a8a7a, + 0x80707a70, 0x8070ff70, + 0x00000200, 0xbef600ff, + 0x01000000, 0x7e000280, + 0x7e020280, 0x7e040280, + 0xbe804ec2, 0xbf94fffe, + 0xb8faf804, 0x8b7a847a, + 0x91788478, 0x8c787a78, + 0x917aff6d, 0x80000000, + 0xd7610002, 0x00010071, + 0xd7610002, 0x0001026c, + 0xd7610002, 0x0001047a, + 0xd7610002, 0x0001066e, + 0xd7610002, 0x0001086f, + 0xd7610002, 0x00010a78, + 0xd7610002, 0x00010e7b, + 0xd8500000, 0x00000000, + 0xd8500000, 0x00000000, + 0xd8500000, 0x00000000, + 0xd8500000, 0x00000000, + 0xd8500000, 0x00000000, + 0xd8500000, 0x00000000, + 0xd8500000, 0x00000000, + 0xd8500000, 0x00000000, + 0xb8faf811, 0xd7610002, + 0x00010c7a, 0xb8faf801, + 0xd7610002, 0x0001107a, + 0xb8faf814, 0xd7610002, + 0x0001127a, 0xb8faf815, + 0xd7610002, 0x0001147a, + 0xb8faf812, 0xd7610002, + 0x0001167a, 0xb8faf813, + 0xd7610002, 0x0001187a, + 0xb8faf802, 0xd7610002, + 0x00011a7a, 0xbefa50c1, + 0xbfc70000, 0xd7610002, + 0x00011c7a, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xbefe00ff, + 0x0000ffff, 0xbeff0080, + 0xc4068070, 0x008ce802, + 0x00000000, 0xbefe00c1, + 0xb8f03b05, 0x80708170, + 0xbf0d9973, 0xbfa20002, + 0x84708970, 0xbfa00001, + 0x84708a70, 0xb8fa1e06, + 0x847a8a7a, 0x80707a70, + 0xbef600ff, 0x01000000, + 0xbef90080, 0xbefd0080, + 0xbf800000, 0xbe804100, + 0xbe824102, 0xbe844104, + 0xbe864106, 0xbe884108, + 0xbe8a410a, 
0xbe8c410c, + 0xbe8e410e, 0xbf068079, + 0xbfa10032, 0xd7610002, + 0x00010000, 0xd7610002, + 0x00010201, 0xd7610002, + 0x00010402, 0xd7610002, + 0x00010603, 0xd7610002, + 0x00010804, 0xd7610002, + 0x00010a05, 0xd7610002, + 0x00010c06, 0xd7610002, + 0x00010e07, 0xd7610002, + 0x00011008, 0xd7610002, + 0x00011209, 0xd7610002, + 0x0001140a, 0xd7610002, + 0x0001160b, 0xd7610002, + 0x0001180c, 0xd7610002, + 0x00011a0d, 0xd7610002, + 0x00011c0e, 0xd7610002, + 0x00011e0f, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0x80799079, + 0xbfa00038, 0xd7610002, + 0x00012000, 0xd7610002, + 0x00012201, 0xd7610002, + 0x00012402, 0xd7610002, + 0x00012603, 0xd7610002, + 0x00012804, 0xd7610002, + 0x00012a05, 0xd7610002, + 0x00012c06, 0xd7610002, + 0x00012e07, 0xd7610002, + 0x00013008, 0xd7610002, + 0x00013209, 0xd7610002, + 0x0001340a, 0xd7610002, + 0x0001360b, 0xd7610002, + 0x0001380c, 0xd7610002, + 0x00013a0d, 0xd7610002, + 0x00013c0e, 0xd7610002, + 0x00013e0f, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0x80799079, + 0xc4068070, 0x008ce802, + 0x00000000, 0x8070ff70, + 0x00000080, 0xbef90080, + 0x7e040280, 0x807d907d, + 0xbf0aff7d, 0x00000060, + 0xbfa2ff88, 0xbe804100, + 0xbe824102, 0xbe844104, + 0xbe864106, 0xbe884108, + 0xbe8a410a, 0xd7610002, + 0x00010000, 0xd7610002, + 0x00010201, 0xd7610002, + 0x00010402, 0xd7610002, + 0x00010603, 0xd7610002, + 0x00010804, 0xd7610002, + 0x00010a05, 0xd7610002, + 0x00010c06, 0xd7610002, + 0x00010e07, 0xd7610002, + 0x00011008, 0xd7610002, + 0x00011209, 0xd7610002, + 0x0001140a, 0xd7610002, + 0x0001160b, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xc4068070, + 0x008ce802, 0x00000000, + 0xbefe00c1, 0x857d9973, + 0x8b7d817d, 0xbf06817d, + 0xbfa20002, 0xbeff0080, + 0xbfa00001, 0xbeff00c1, + 0xb8fb4306, 0x8b7bc17b, + 0xbfa10044, 0x8b7aff6d, + 0x80000000, 0xbfa10041, + 0x847b897b, 0xbef6007b, + 0xb8f03b05, 0x80708170, + 0xbf0d9973, 0xbfa20002, + 0x84708970, 0xbfa00001, + 0x84708a70, 0xb8fa1e06, + 0x847a8a7a, 0x80707a70, + 0x8070ff70, 0x00000200, + 0x8070ff70, 0x00000080, + 0xbef600ff, 0x01000000, + 0xd71f0000, 0x000100c1, + 0xd7200000, 0x000200c1, + 0x16000084, 0x857d9973, + 0x8b7d817d, 0xbf06817d, + 0xbefd0080, 0xbfa20013, + 0xbe8300ff, 0x00000080, + 0xbf800000, 0xbf800000, + 0xbf800000, 0xd8d80000, + 0x01000000, 0xbf8a0000, + 0xc4068070, 0x008ce801, + 0x00000000, 0x807d037d, + 0x80700370, 0xd5250000, + 0x0001ff00, 0x00000080, + 0xbf0a7b7d, 0xbfa2fff3, + 0xbfa00012, 0xbe8300ff, + 0x00000100, 0xbf800000, + 0xbf800000, 0xbf800000, + 0xd8d80000, 0x01000000, + 0xbf8a0000, 0xc4068070, + 0x008ce801, 0x00000000, + 0x807d037d, 0x80700370, + 0xd5250000, 0x0001ff00, + 0x00000100, 0xbf0a7b7d, + 0xbfa2fff3, 0xbefe00c1, + 0x857d9973, 0x8b7d817d, + 0xbf06817d, 0xbfa20004, + 0xbef000ff, 0x00000200, + 0xbeff0080, 0xbfa00003, + 0xbef000ff, 0x00000400, + 0xbeff00c1, 0xb8fb3b05, + 0x807b817b, 0x847b827b, + 0x857d9973, 0x8b7d817d, + 0xbf06817d, 0xbfa2001b, + 0xbef600ff, 0x01000000, + 0xbefd0084, 0xbf0a7b7d, + 0xbfa10040, 0x7e008700, + 0x7e028701, 0x7e048702, + 0x7e068703, 0xc4068070, + 0x008ce800, 0x00000000, + 
0xc4068070, 0x008ce801, + 0x00008000, 0xc4068070, + 0x008ce802, 0x00010000, + 0xc4068070, 0x008ce803, + 0x00018000, 0x807d847d, + 0x8070ff70, 0x00000200, + 0xbf0a7b7d, 0xbfa2ffeb, + 0xbfa0002a, 0xbef600ff, + 0x01000000, 0xbefd0084, + 0xbf0a7b7d, 0xbfa10015, + 0x7e008700, 0x7e028701, + 0x7e048702, 0x7e068703, + 0xc4068070, 0x008ce800, + 0x00000000, 0xc4068070, + 0x008ce801, 0x00010000, + 0xc4068070, 0x008ce802, + 0x00020000, 0xc4068070, + 0x008ce803, 0x00030000, + 0x807d847d, 0x8070ff70, + 0x00000400, 0xbf0a7b7d, + 0xbfa2ffeb, 0xb8fb1e06, + 0x8b7bc17b, 0xbfa1000d, + 0x847b837b, 0x807b7d7b, + 0xbefe00c1, 0xbeff0080, + 0x7e008700, 0xc4068070, + 0x008ce800, 0x00000000, + 0x807d817d, 0x8070ff70, + 0x00000080, 0xbf0a7b7d, + 0xbfa2fff7, 0xbfa0016e, + 0xbef4007e, 0x8b75ff7f, + 0x0000ffff, 0x8c75ff75, + 0x00040000, 0xbef60080, + 0xbef700ff, 0x10807fac, + 0xbef1007f, 0xb8f20742, + 0x84729972, 0x8b6eff7f, + 0x04000000, 0xbfa1003b, + 0xbefe00c1, 0x857d9972, + 0x8b7d817d, 0xbf06817d, + 0xbfa20002, 0xbeff0080, + 0xbfa00001, 0xbeff00c1, + 0xb8ef4306, 0x8b6fc16f, + 0xbfa10030, 0x846f896f, + 0xbef6006f, 0xb8f83b05, + 0x80788178, 0xbf0d9972, + 0xbfa20002, 0x84788978, + 0xbfa00001, 0x84788a78, + 0xb8ee1e06, 0x846e8a6e, + 0x80786e78, 0x8078ff78, + 0x00000200, 0x8078ff78, + 0x00000080, 0xbef600ff, + 0x01000000, 0x857d9972, + 0x8b7d817d, 0xbf06817d, + 0xbefd0080, 0xbfa2000d, + 0xc4050078, 0x0080e800, + 0x00000000, 0xbf8a0000, + 0xdac00000, 0x00000000, + 0x807dff7d, 0x00000080, + 0x8078ff78, 0x00000080, + 0xbf0a6f7d, 0xbfa2fff4, + 0xbfa0000c, 0xc4050078, + 0x0080e800, 0x00000000, + 0xbf8a0000, 0xdac00000, + 0x00000000, 0x807dff7d, + 0x00000100, 0x8078ff78, + 0x00000100, 0xbf0a6f7d, + 0xbfa2fff4, 0xbef80080, + 0xbefe00c1, 0x857d9972, + 0x8b7d817d, 0xbf06817d, + 0xbfa20002, 0xbeff0080, + 0xbfa00001, 0xbeff00c1, + 0xb8ef3b05, 0x806f816f, + 0x846f826f, 0x857d9972, + 0x8b7d817d, 0xbf06817d, + 0xbfa2002c, 0xbef600ff, + 0x01000000, 0xbeee0078, + 0x8078ff78, 0x00000200, + 0xbefd0084, 0xbf0a6f7d, + 0xbfa10061, 0xc4050078, + 0x008ce800, 0x00000000, + 0xc4050078, 0x008ce801, + 0x00008000, 0xc4050078, + 0x008ce802, 0x00010000, + 0xc4050078, 0x008ce803, + 0x00018000, 0xbf8a0000, + 0x7e008500, 0x7e028501, + 0x7e048502, 0x7e068503, + 0x807d847d, 0x8078ff78, + 0x00000200, 0xbf0a6f7d, + 0xbfa2ffea, 0xc405006e, + 0x008ce800, 0x00000000, + 0xc405006e, 0x008ce801, + 0x00008000, 0xc405006e, + 0x008ce802, 0x00010000, + 0xc405006e, 0x008ce803, + 0x00018000, 0xbf8a0000, + 0xbfa0003d, 0xbef600ff, + 0x01000000, 0xbeee0078, + 0x8078ff78, 0x00000400, + 0xbefd0084, 0xbf0a6f7d, + 0xbfa10016, 0xc4050078, + 0x008ce800, 0x00000000, + 0xc4050078, 0x008ce801, + 0x00010000, 0xc4050078, + 0x008ce802, 0x00020000, + 0xc4050078, 0x008ce803, + 0x00030000, 0xbf8a0000, + 0x7e008500, 0x7e028501, + 0x7e048502, 0x7e068503, + 0x807d847d, 0x8078ff78, + 0x00000400, 0xbf0a6f7d, + 0xbfa2ffea, 0xb8ef1e06, + 0x8b6fc16f, 0xbfa1000f, + 0x846f836f, 0x806f7d6f, + 0xbefe00c1, 0xbeff0080, + 0xc4050078, 0x008ce800, + 0x00000000, 0xbf8a0000, + 0x7e008500, 0x807d817d, + 0x8078ff78, 0x00000080, + 0xbf0a6f7d, 0xbfa2fff6, + 0xbeff00c1, 0xc405006e, + 0x008ce800, 0x00000000, + 0xc405006e, 0x008ce801, + 0x00010000, 0xc405006e, + 0x008ce802, 0x00020000, + 0xc405006e, 0x008ce803, + 0x00030000, 0xbf8a0000, + 0xb8f83b05, 0x80788178, + 0xbf0d9972, 0xbfa20002, + 0x84788978, 0xbfa00001, + 0x84788a78, 0xb8ee1e06, + 0x846e8a6e, 0x80786e78, + 0x8078ff78, 0x00000200, + 0x80f8ff78, 0x00000050, + 0xbef600ff, 0x01000000, + 0xbefd00ff, 0x0000006c, + 0x80f89078, 0xf462403a, + 0xf0000000, 
0xbf8a0000, + 0x80fd847d, 0xbf800000, + 0xbe804300, 0xbe824302, + 0x80f8a078, 0xf462603a, + 0xf0000000, 0xbf8a0000, + 0x80fd887d, 0xbf800000, + 0xbe804300, 0xbe824302, + 0xbe844304, 0xbe864306, + 0x80f8c078, 0xf462803a, + 0xf0000000, 0xbf8a0000, + 0x80fd907d, 0xbf800000, + 0xbe804300, 0xbe824302, + 0xbe844304, 0xbe864306, + 0xbe884308, 0xbe8a430a, + 0xbe8c430c, 0xbe8e430e, + 0xbf06807d, 0xbfa1fff0, + 0xb980f801, 0x00000000, + 0xb8f83b05, 0x80788178, + 0xbf0d9972, 0xbfa20002, + 0x84788978, 0xbfa00001, + 0x84788a78, 0xb8ee1e06, + 0x846e8a6e, 0x80786e78, + 0x8078ff78, 0x00000200, + 0xbef600ff, 0x01000000, + 0xbeff0071, 0xf4621bfa, + 0xf0000000, 0x80788478, + 0xf4621b3a, 0xf0000000, + 0x80788478, 0xf4621b7a, + 0xf0000000, 0x80788478, + 0xf4621c3a, 0xf0000000, + 0x80788478, 0xf4621c7a, + 0xf0000000, 0x80788478, + 0xf4621eba, 0xf0000000, + 0x80788478, 0xf4621efa, + 0xf0000000, 0x80788478, + 0xf4621e7a, 0xf0000000, + 0x80788478, 0xf4621cfa, + 0xf0000000, 0x80788478, + 0xf4621bba, 0xf0000000, + 0x80788478, 0xbf8a0000, + 0xb96ef814, 0xf4621bba, + 0xf0000000, 0x80788478, + 0xbf8a0000, 0xb96ef815, + 0xf4621bba, 0xf0000000, + 0x80788478, 0xbf8a0000, + 0xb96ef812, 0xf4621bba, + 0xf0000000, 0x80788478, + 0xbf8a0000, 0xb96ef813, + 0x8b6eff7f, 0x04000000, + 0xbfa1000d, 0x80788478, + 0xf4621bba, 0xf0000000, + 0x80788478, 0xbf8a0000, + 0xbf0d806e, 0xbfa10006, + 0x856e906e, 0x8b6e6e6e, + 0xbfa10003, 0xbe804ec1, + 0x816ec16e, 0xbfa0fffb, + 0xbefd006f, 0xbefe0070, + 0xbeff0071, 0xb97b2011, + 0x857b867b, 0xb97b0191, + 0x857b827b, 0xb97bba11, + 0xb973f801, 0xb8ee3b05, + 0x806e816e, 0xbf0d9972, + 0xbfa20002, 0x846e896e, + 0xbfa00001, 0x846e8a6e, + 0xb8ef1e06, 0x846f8a6f, + 0x806e6f6e, 0x806eff6e, + 0x00000200, 0x806e746e, + 0x826f8075, 0x8b6fff6f, + 0x0000ffff, 0xf4605c37, + 0xf8000050, 0xf4605d37, + 0xf8000060, 0xf4601e77, + 0xf8000074, 0xbf8a0000, + 0x8b6dff6d, 0x0000ffff, + 0x8bfe7e7e, 0x8bea6a6a, + 0xb97af804, 0xbe804ec2, + 0xbf94fffe, 0xbe804a6c, + 0xbe804ec2, 0xbf94fffe, + 0xbfb10000, 0xbf9f0000, + 0xbf9f0000, 0xbf9f0000, + 0xbf9f0000, 0xbf9f0000, +}; + +static const uint32_t cwsr_trap_gfx9_5_0_hex[] = { + 0xbf820001, 0xbf8202ca, + 0xb8f8f802, 0x8978ff78, + 0x00020006, 0xb8fbf803, + 0x866eff78, 0x00002000, + 0xbf840009, 0x866eff6d, + 0x00ff0000, 0xbf85001a, + 0x866eff7b, 0x00000400, + 0xbf850051, 0xbf8e0010, + 0xb8fbf803, 0xbf82fffa, + 0x866eff7b, 0x03c00900, + 0xbf850011, 0x866eff7b, + 0x000071ff, 0xbf840008, + 0x866fff7b, 0x00007080, + 0xbf840001, 0xbeee1a87, + 0xb8eff801, 0x8e6e8c6e, + 0x866e6f6e, 0xbf850006, + 0x866eff6d, 0x00ff0000, + 0xbf850003, 0x866eff7b, + 0x00000400, 0xbf85003a, + 0xb8faf807, 0x867aff7a, + 0x001f8000, 0x8e7a8b7a, + 0x8979ff79, 0xfc000000, + 0x87797a79, 0xba7ff807, + 0x00000000, 0xb8faf812, + 0xb8fbf813, 0x8efa887a, + 0xbf0d8f7b, 0xbf840002, + 0x877bff7b, 0xffff0000, + 0xc0031bbd, 0x00000010, + 0xbf8cc07f, 0x8e6e976e, + 0x8979ff79, 0x00800000, + 0x87796e79, 0xc0071bbd, + 0x00000000, 0xbf8cc07f, + 0xc0071ebd, 0x00000008, + 0xbf8cc07f, 0x86ee6e6e, + 0xbf840001, 0xbe801d6e, + 0x866eff6d, 0x01ff0000, + 0xbf850005, 0x8778ff78, + 0x00002000, 0x80ec886c, + 0x82ed806d, 0xbf820005, + 0x866eff6d, 0x01000000, + 0xbf850002, 0x806c846c, + 0x826d806d, 0x866dff6d, + 0x0000ffff, 0x8f7a8b79, + 0x867aff7a, 0x001f8000, + 0xb97af807, 0x86fe7e7e, + 0x86ea6a6a, 0x8f6e8378, + 0xb96ee0c2, 0xbf800002, + 0xb9780002, 0xbe801f6c, + 0x866dff6d, 0x0000ffff, + 0xbefa0080, 0xb97a0283, + 0xb8faf807, 0x867aff7a, + 0x001f8000, 0x8e7a8b7a, + 0x8979ff79, 0xfc000000, + 0x87797a79, 0xba7ff807, + 0x00000000, 0xbeee007e, 
+ 0xbeef007f, 0xbefe0180, + 0xbf900004, 0x877a8478, + 0xb97af802, 0xbf8e0002, + 0xbf88fffe, 0xb8fa2985, + 0x807a817a, 0x8e7a8a7a, + 0x8e7a817a, 0xb8fb1605, + 0x807b817b, 0x8e7b867b, + 0x807a7b7a, 0x807a7e7a, + 0x827b807f, 0x867bff7b, + 0x0000ffff, 0xc04b1c3d, + 0x00000050, 0xbf8cc07f, + 0xc04b1d3d, 0x00000060, + 0xbf8cc07f, 0xc0431e7d, + 0x00000074, 0xbf8cc07f, + 0xbef4007e, 0x8675ff7f, + 0x0000ffff, 0x8775ff75, + 0x00040000, 0xbef60080, + 0xbef700ff, 0x00807fac, + 0xbef1007c, 0xbef00080, + 0xb8f02985, 0x80708170, + 0x8e708a70, 0x8e708170, + 0xb8fa1605, 0x807a817a, + 0x8e7a867a, 0x80707a70, + 0xbef60084, 0xbef600ff, + 0x01000000, 0xbefe007c, + 0xbefc0070, 0xc0611c7a, + 0x0000007c, 0xbf8cc07f, + 0x80708470, 0xbefc007e, + 0xbefe007c, 0xbefc0070, + 0xc0611b3a, 0x0000007c, + 0xbf8cc07f, 0x80708470, + 0xbefc007e, 0xbefe007c, + 0xbefc0070, 0xc0611b7a, + 0x0000007c, 0xbf8cc07f, + 0x80708470, 0xbefc007e, + 0xbefe007c, 0xbefc0070, + 0xc0611bba, 0x0000007c, + 0xbf8cc07f, 0x80708470, + 0xbefc007e, 0xbefe007c, + 0xbefc0070, 0xc0611bfa, + 0x0000007c, 0xbf8cc07f, + 0x80708470, 0xbefc007e, + 0xbefe007c, 0xbefc0070, + 0xc0611e3a, 0x0000007c, + 0xbf8cc07f, 0x80708470, + 0xbefc007e, 0xb8fbf803, + 0xbefe007c, 0xbefc0070, + 0xc0611efa, 0x0000007c, + 0xbf8cc07f, 0x80708470, + 0xbefc007e, 0xbefe007c, + 0xbefc0070, 0xc0611a3a, + 0x0000007c, 0xbf8cc07f, + 0x80708470, 0xbefc007e, + 0xbefe007c, 0xbefc0070, + 0xc0611a7a, 0x0000007c, + 0xbf8cc07f, 0x80708470, + 0xbefc007e, 0xb8f1f801, + 0xbefe007c, 0xbefc0070, + 0xc0611c7a, 0x0000007c, + 0xbf8cc07f, 0x80708470, + 0xbefc007e, 0xbf108080, + 0x867aff7f, 0x04000000, + 0xbeef0080, 0x876f6f7a, + 0xb8f02985, 0x80708170, + 0x8e708a70, 0x8e708170, + 0xb8fb1605, 0x807b817b, + 0x8e7b847b, 0x8e76827b, + 0xbef600ff, 0x01000000, + 0xbef20174, 0x80747074, + 0x82758075, 0xbefc0080, + 0xbf800000, 0xbe802b00, + 0xbe822b02, 0xbe842b04, + 0xbe862b06, 0xbe882b08, + 0xbe8a2b0a, 0xbe8c2b0c, + 0xbe8e2b0e, 0xc06b003a, + 0x00000000, 0xbf8cc07f, + 0xc06b013a, 0x00000010, + 0xbf8cc07f, 0xc06b023a, + 0x00000020, 0xbf8cc07f, + 0xc06b033a, 0x00000030, + 0xbf8cc07f, 0x8074c074, + 0x82758075, 0x807c907c, + 0xbf0a7b7c, 0xbf85ffe7, + 0xbef40172, 0xbef00080, + 0xbefe00c1, 0xbeff00c1, + 0xbee80080, 0xbee90080, + 0xbef600ff, 0x01000000, + 0x867aff78, 0x00400000, + 0xbf850003, 0xb8faf803, + 0x897a7aff, 0x10000000, + 0xbf85004d, 0xbe840080, + 0xd2890000, 0x00000900, + 0x80048104, 0xd2890001, + 0x00000900, 0x80048104, + 0xd2890002, 0x00000900, + 0x80048104, 0xd2890003, + 0x00000900, 0x80048104, + 0xc069003a, 0x00000070, + 0xbf8cc07f, 0x80709070, + 0xbf06c004, 0xbf84ffee, + 0xbe840080, 0xd2890000, + 0x00000901, 0x80048104, + 0xd2890001, 0x00000901, + 0x80048104, 0xd2890002, + 0x00000901, 0x80048104, + 0xd2890003, 0x00000901, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0xbe840080, + 0xd2890000, 0x00000902, + 0x80048104, 0xd2890001, + 0x00000902, 0x80048104, + 0xd2890002, 0x00000902, + 0x80048104, 0xd2890003, + 0x00000902, 0x80048104, + 0xc069003a, 0x00000070, + 0xbf8cc07f, 0x80709070, + 0xbf06c004, 0xbf84ffee, + 0xbe840080, 0xd2890000, + 0x00000903, 0x80048104, + 0xd2890001, 0x00000903, + 0x80048104, 0xd2890002, + 0x00000903, 0x80048104, + 0xd2890003, 0x00000903, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0xbf820008, + 0xe0724000, 0x701d0000, + 0xe0724100, 0x701d0100, + 0xe0724200, 0x701d0200, + 0xe0724300, 0x701d0300, + 0xbefe00c1, 0xbeff00c1, + 0xb8fb5306, 0x867bc17b, + 0xbf840052, 0xbf8a0000, + 0x867aff6f, 
0x04000000, + 0xbf84004e, 0x8e7b867b, + 0x8e7b827b, 0xbef6007b, + 0xb8f02985, 0x80708170, + 0x8e708a70, 0x8e708170, + 0xb8fa1605, 0x807a817a, + 0x8e7a867a, 0x80707a70, + 0x8070ff70, 0x00000080, + 0xbef600ff, 0x01000000, + 0xbefc0080, 0xd28c0002, + 0x000100c1, 0xd28d0003, + 0x000204c1, 0x867aff78, + 0x00400000, 0xbf850003, + 0xb8faf803, 0x897a7aff, + 0x10000000, 0xbf85001d, + 0x24040682, 0xd86c0000, + 0x00000002, 0xbf8cc07f, + 0xbe840080, 0xd2890000, + 0x00000900, 0x80048104, + 0xd2890001, 0x00000900, + 0x80048104, 0xd2890002, + 0x00000900, 0x80048104, + 0xd2890003, 0x00000900, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0x680404ff, + 0x00000100, 0xd0c9006a, + 0x0000f702, 0xbf87ffe5, + 0xbf820016, 0xd1060002, + 0x00011103, 0x7e0602ff, + 0x00000200, 0xbefc00ff, + 0x00010000, 0xbe800077, + 0x8677ff77, 0xff7fffff, + 0x8777ff77, 0x00058000, + 0xd8ec0000, 0x00000002, + 0xbf8cc07f, 0xe0765000, + 0x701d0002, 0x68040702, + 0xd0c9006a, 0x0000f702, + 0xbefe016a, 0xbf87fff6, + 0xbef70000, 0xbef000ff, + 0x00000400, 0xbefe00c1, + 0xbeff00c1, 0xb8fb2b05, + 0x807b817b, 0x8e7b827b, + 0xbef600ff, 0x01000000, + 0xbefc0084, 0xbf0a7b7c, + 0xbf84006d, 0xbf11017c, 0x807bff7b, 0x00001000, - 0xbefc0080, 0xbf11017c, 0x867aff78, 0x00400000, 0xbf850003, 0xb8faf803, 0x897a7aff, 0x10000000, - 0xbf850059, 0xd3d84000, - 0x18000100, 0xd3d84001, - 0x18000101, 0xd3d84002, - 0x18000102, 0xd3d84003, - 0x18000103, 0xbe840080, + 0xbf850051, 0xbe840080, 0xd2890000, 0x00000900, 0x80048104, 0xd2890001, 0x00000900, 0x80048104, @@ -3511,137 +4453,204 @@ static const uint32_t cwsr_trap_gfx9_4_3_hex[] = { 0x00000070, 0xbf8cc07f, 0x80709070, 0xbf06c004, 0xbf84ffee, 0x807c847c, - 0xbf0a7b7c, 0xbf85ffa9, - 0xbf9c0000, 0xbf820016, - 0xd3d84000, 0x18000100, - 0xd3d84001, 0x18000101, - 0xd3d84002, 0x18000102, - 0xd3d84003, 0x18000103, + 0xbf0a7b7c, 0xbf85ffb1, + 0xbf9c0000, 0xbf820012, + 0x7e000300, 0x7e020301, + 0x7e040302, 0x7e060303, 0xe0724000, 0x701d0000, 0xe0724100, 0x701d0100, 0xe0724200, 0x701d0200, 0xe0724300, 0x701d0300, 0x807c847c, 0x8070ff70, 0x00000400, 0xbf0a7b7c, - 0xbf85ffeb, 0xbf9c0000, - 0xbf8200ee, 0xbef4007e, - 0x8675ff7f, 0x0000ffff, - 0x8775ff75, 0x00040000, - 0xbef60080, 0xbef700ff, - 0x00807fac, 0x866eff7f, - 0x04000000, 0xbf84001f, + 0xbf85ffef, 0xbf9c0000, + 0xb8fb2985, 0x807b817b, + 0x8e7b837b, 0xb8fa2b05, + 0x807a817a, 0x8e7a827a, + 0x80fb7a7b, 0x867b7b7b, + 0xbf84007a, 0x807bff7b, + 0x00001000, 0xbefc0080, + 0xbf11017c, 0x867aff78, + 0x00400000, 0xbf850003, + 0xb8faf803, 0x897a7aff, + 0x10000000, 0xbf850059, + 0xd3d84000, 0x18000100, + 0xd3d84001, 0x18000101, + 0xd3d84002, 0x18000102, + 0xd3d84003, 0x18000103, + 0xbe840080, 0xd2890000, + 0x00000900, 0x80048104, + 0xd2890001, 0x00000900, + 0x80048104, 0xd2890002, + 0x00000900, 0x80048104, + 0xd2890003, 0x00000900, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0xbe840080, + 0xd2890000, 0x00000901, + 0x80048104, 0xd2890001, + 0x00000901, 0x80048104, + 0xd2890002, 0x00000901, + 0x80048104, 0xd2890003, + 0x00000901, 0x80048104, + 0xc069003a, 0x00000070, + 0xbf8cc07f, 0x80709070, + 0xbf06c004, 0xbf84ffee, + 0xbe840080, 0xd2890000, + 0x00000902, 0x80048104, + 0xd2890001, 0x00000902, + 0x80048104, 0xd2890002, + 0x00000902, 0x80048104, + 0xd2890003, 0x00000902, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0xbe840080, + 0xd2890000, 0x00000903, + 0x80048104, 0xd2890001, + 0x00000903, 0x80048104, + 0xd2890002, 0x00000903, + 0x80048104, 
0xd2890003, + 0x00000903, 0x80048104, + 0xc069003a, 0x00000070, + 0xbf8cc07f, 0x80709070, + 0xbf06c004, 0xbf84ffee, + 0x807c847c, 0xbf0a7b7c, + 0xbf85ffa9, 0xbf9c0000, + 0xbf820016, 0xd3d84000, + 0x18000100, 0xd3d84001, + 0x18000101, 0xd3d84002, + 0x18000102, 0xd3d84003, + 0x18000103, 0xe0724000, + 0x701d0000, 0xe0724100, + 0x701d0100, 0xe0724200, + 0x701d0200, 0xe0724300, + 0x701d0300, 0x807c847c, + 0x8070ff70, 0x00000400, + 0xbf0a7b7c, 0xbf85ffeb, + 0xbf9c0000, 0xbf8200f4, + 0xbef4007e, 0x8675ff7f, + 0x0000ffff, 0x8775ff75, + 0x00040000, 0xbef60080, + 0xbef700ff, 0x00807fac, + 0x866eff7f, 0x04000000, + 0xbf840025, 0xbefe00c1, + 0xbeff00c1, 0xb8ef5306, + 0x866fc16f, 0xbf840020, + 0x8e6f866f, 0x8e6f826f, + 0xbef6006f, 0xb8f82985, + 0x80788178, 0x8e788a78, + 0x8e788178, 0xb8ee1605, + 0x806e816e, 0x8e6e866e, + 0x80786e78, 0x8078ff78, + 0x00000080, 0xbef600ff, + 0x01000000, 0xbefc0080, + 0xe0510000, 0x781d0000, + 0xe0510100, 0x781d0000, + 0xe0510200, 0x781d0000, + 0xe0510300, 0x781d0000, + 0xe0510400, 0x781d0000, + 0x807cff7c, 0x00000500, + 0x8078ff78, 0x00000500, + 0xbf0a6f7c, 0xbf85fff0, 0xbefe00c1, 0xbeff00c1, - 0xb8ef4306, 0x866fc16f, - 0xbf84001a, 0x8e6f866f, - 0x8e6f826f, 0xbef6006f, - 0xb8f82985, 0x80788178, - 0x8e788a78, 0x8e788178, - 0xb8ee1605, 0x806e816e, - 0x8e6e866e, 0x80786e78, - 0x8078ff78, 0x00000080, 0xbef600ff, 0x01000000, - 0xbefc0080, 0xe0510000, - 0x781d0000, 0xe0510100, - 0x781d0000, 0x807cff7c, - 0x00000200, 0x8078ff78, - 0x00000200, 0xbf0a6f7c, - 0xbf85fff6, 0xbefe00c1, - 0xbeff00c1, 0xbef600ff, - 0x01000000, 0xb8ef2b05, - 0x806f816f, 0x8e6f826f, - 0x806fff6f, 0x00008000, - 0xbef80080, 0xbeee0078, - 0x8078ff78, 0x00000400, - 0xbefc0084, 0xbf11087c, - 0xe0524000, 0x781d0000, - 0xe0524100, 0x781d0100, - 0xe0524200, 0x781d0200, - 0xe0524300, 0x781d0300, - 0xbf8c0f70, 0x7e000300, - 0x7e020301, 0x7e040302, - 0x7e060303, 0x807c847c, - 0x8078ff78, 0x00000400, - 0xbf0a6f7c, 0xbf85ffee, - 0xb8ef2985, 0x806f816f, - 0x8e6f836f, 0xb8f92b05, - 0x80798179, 0x8e798279, - 0x80ef796f, 0x866f6f6f, - 0xbf84001a, 0x806fff6f, - 0x00008000, 0xbefc0080, + 0xb8ef2b05, 0x806f816f, + 0x8e6f826f, 0x806fff6f, + 0x00008000, 0xbef80080, + 0xbeee0078, 0x8078ff78, + 0x00000400, 0xbefc0084, 0xbf11087c, 0xe0524000, 0x781d0000, 0xe0524100, 0x781d0100, 0xe0524200, 0x781d0200, 0xe0524300, 0x781d0300, 0xbf8c0f70, - 0xd3d94000, 0x18000100, - 0xd3d94001, 0x18000101, - 0xd3d94002, 0x18000102, - 0xd3d94003, 0x18000103, + 0x7e000300, 0x7e020301, + 0x7e040302, 0x7e060303, 0x807c847c, 0x8078ff78, 0x00000400, 0xbf0a6f7c, - 0xbf85ffea, 0xbf9c0000, - 0xe0524000, 0x6e1d0000, - 0xe0524100, 0x6e1d0100, - 0xe0524200, 0x6e1d0200, - 0xe0524300, 0x6e1d0300, - 0xbf8c0f70, 0xb8f82985, - 0x80788178, 0x8e788a78, - 0x8e788178, 0xb8ee1605, - 0x806e816e, 0x8e6e866e, - 0x80786e78, 0x80f8c078, - 0xb8ef1605, 0x806f816f, - 0x8e6f846f, 0x8e76826f, - 0xbef600ff, 0x01000000, - 0xbefc006f, 0xc031003a, - 0x00000078, 0x80f8c078, - 0xbf8cc07f, 0x80fc907c, - 0xbf800000, 0xbe802d00, - 0xbe822d02, 0xbe842d04, - 0xbe862d06, 0xbe882d08, - 0xbe8a2d0a, 0xbe8c2d0c, - 0xbe8e2d0e, 0xbf06807c, - 0xbf84fff0, 0xb8f82985, - 0x80788178, 0x8e788a78, - 0x8e788178, 0xb8ee1605, - 0x806e816e, 0x8e6e866e, - 0x80786e78, 0xbef60084, - 0xbef600ff, 0x01000000, - 0xc0211bfa, 0x00000078, - 0x80788478, 0xc0211b3a, + 0xbf85ffee, 0xb8ef2985, + 0x806f816f, 0x8e6f836f, + 0xb8f92b05, 0x80798179, + 0x8e798279, 0x80ef796f, + 0x866f6f6f, 0xbf84001a, + 0x806fff6f, 0x00008000, + 0xbefc0080, 0xbf11087c, + 0xe0524000, 0x781d0000, + 0xe0524100, 0x781d0100, + 0xe0524200, 
0x781d0200, + 0xe0524300, 0x781d0300, + 0xbf8c0f70, 0xd3d94000, + 0x18000100, 0xd3d94001, + 0x18000101, 0xd3d94002, + 0x18000102, 0xd3d94003, + 0x18000103, 0x807c847c, + 0x8078ff78, 0x00000400, + 0xbf0a6f7c, 0xbf85ffea, + 0xbf9c0000, 0xe0524000, + 0x6e1d0000, 0xe0524100, + 0x6e1d0100, 0xe0524200, + 0x6e1d0200, 0xe0524300, + 0x6e1d0300, 0xbf8c0f70, + 0xb8f82985, 0x80788178, + 0x8e788a78, 0x8e788178, + 0xb8ee1605, 0x806e816e, + 0x8e6e866e, 0x80786e78, + 0x80f8c078, 0xb8ef1605, + 0x806f816f, 0x8e6f846f, + 0x8e76826f, 0xbef600ff, + 0x01000000, 0xbefc006f, + 0xc031003a, 0x00000078, + 0x80f8c078, 0xbf8cc07f, + 0x80fc907c, 0xbf800000, + 0xbe802d00, 0xbe822d02, + 0xbe842d04, 0xbe862d06, + 0xbe882d08, 0xbe8a2d0a, + 0xbe8c2d0c, 0xbe8e2d0e, + 0xbf06807c, 0xbf84fff0, + 0xb8f82985, 0x80788178, + 0x8e788a78, 0x8e788178, + 0xb8ee1605, 0x806e816e, + 0x8e6e866e, 0x80786e78, + 0xbef60084, 0xbef600ff, + 0x01000000, 0xc0211bfa, 0x00000078, 0x80788478, - 0xc0211b7a, 0x00000078, - 0x80788478, 0xc0211c3a, + 0xc0211b3a, 0x00000078, + 0x80788478, 0xc0211b7a, 0x00000078, 0x80788478, - 0xc0211c7a, 0x00000078, - 0x80788478, 0xc0211eba, + 0xc0211c3a, 0x00000078, + 0x80788478, 0xc0211c7a, 0x00000078, 0x80788478, - 0xc0211efa, 0x00000078, - 0x80788478, 0xc0211a3a, + 0xc0211eba, 0x00000078, + 0x80788478, 0xc0211efa, 0x00000078, 0x80788478, - 0xc0211a7a, 0x00000078, - 0x80788478, 0xc0211cfa, + 0xc0211a3a, 0x00000078, + 0x80788478, 0xc0211a7a, 0x00000078, 0x80788478, - 0xbf8cc07f, 0xbefc006f, - 0xbefe0070, 0xbeff0071, - 0x866f7bff, 0x000003ff, - 0xb96f4803, 0x866f7bff, - 0xfffff800, 0x8f6f8b6f, - 0xb96fa2c3, 0xb973f801, - 0xb8ee2985, 0x806e816e, - 0x8e6e8a6e, 0x8e6e816e, - 0xb8ef1605, 0x806f816f, - 0x8e6f866f, 0x806e6f6e, - 0x806e746e, 0x826f8075, - 0x866fff6f, 0x0000ffff, - 0xc00b1c37, 0x00000050, - 0xc00b1d37, 0x00000060, - 0xc0031e77, 0x00000074, - 0xbf8cc07f, 0x8f6e8b79, - 0x866eff6e, 0x001f8000, - 0xb96ef807, 0x866dff6d, - 0x0000ffff, 0x86fe7e7e, - 0x86ea6a6a, 0x8f6e837a, - 0xb96ee0c2, 0xbf800002, - 0xb97a0002, 0xbf8a0000, - 0xbe801f6c, 0xbf9b0000, + 0xc0211cfa, 0x00000078, + 0x80788478, 0xbf8cc07f, + 0xbefc006f, 0xbefe0070, + 0xbeff0071, 0x866f7bff, + 0x000003ff, 0xb96f4803, + 0x866f7bff, 0xfffff800, + 0x8f6f8b6f, 0xb96fa2c3, + 0xb973f801, 0xb8ee2985, + 0x806e816e, 0x8e6e8a6e, + 0x8e6e816e, 0xb8ef1605, + 0x806f816f, 0x8e6f866f, + 0x806e6f6e, 0x806e746e, + 0x826f8075, 0x866fff6f, + 0x0000ffff, 0xc00b1c37, + 0x00000050, 0xc00b1d37, + 0x00000060, 0xc0031e77, + 0x00000074, 0xbf8cc07f, + 0x8f6e8b79, 0x866eff6e, + 0x001f8000, 0xb96ef807, + 0x866dff6d, 0x0000ffff, + 0x86fe7e7e, 0x86ea6a6a, + 0x8f6e837a, 0xb96ee0c2, + 0xbf800002, 0xb97a0002, + 0xbf8a0000, 0xbe801f6c, + 0xbf9b0000, 0x00000000, }; diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm index e1aaa5ce0784..96fbb16ceb21 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm @@ -33,6 +33,7 @@ * gfx11: * cpp -DASIC_FAMILY=CHIP_PLUM_BONITO cwsr_trap_handler_gfx10.asm -P -o gfx11.sp3 * sp3 gfx11.sp3 -hex gfx11.hex + * */ #define CHIP_NAVI10 26 @@ -43,23 +44,33 @@ #define HAVE_XNACK (ASIC_FAMILY < CHIP_SIENNA_CICHLID) #define HAVE_SENDMSG_RTN (ASIC_FAMILY >= CHIP_PLUM_BONITO) #define HAVE_BUFFER_LDS_LOAD (ASIC_FAMILY < CHIP_PLUM_BONITO) -#define SW_SA_TRAP (ASIC_FAMILY >= CHIP_PLUM_BONITO) +#define SW_SA_TRAP (ASIC_FAMILY == CHIP_PLUM_BONITO) #define SAVE_AFTER_XNACK_ERROR (HAVE_XNACK && !NO_SQC_STORE) // workaround for TCP 
store failure after XNACK error when ALLOW_REPLAY=0, for debugger +#define SINGLE_STEP_MISSED_WORKAROUND 1 //workaround for lost MODE.DEBUG_EN exception when SAVECTX raised -var SINGLE_STEP_MISSED_WORKAROUND = 1 //workaround for lost MODE.DEBUG_EN exception when SAVECTX raised +#define S_COHERENCE glc:1 +#define V_COHERENCE slc:1 glc:1 +#define S_WAITCNT_0 s_waitcnt 0 var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006 var SQ_WAVE_STATUS_HALT_MASK = 0x2000 var SQ_WAVE_STATUS_ECC_ERR_MASK = 0x20000 var SQ_WAVE_STATUS_TRAP_EN_SHIFT = 6 +var SQ_WAVE_IB_STS2_WAVE64_SHIFT = 11 +var SQ_WAVE_IB_STS2_WAVE64_SIZE = 1 +var SQ_WAVE_LDS_ALLOC_GRANULARITY = 8 +var S_STATUS_HWREG = HW_REG_STATUS +var S_STATUS_ALWAYS_CLEAR_MASK = SQ_WAVE_STATUS_SPI_PRIO_MASK|SQ_WAVE_STATUS_ECC_ERR_MASK +var S_STATUS_HALT_MASK = SQ_WAVE_STATUS_HALT_MASK +var S_SAVE_PC_HI_TRAP_ID_MASK = 0x00FF0000 +var S_SAVE_PC_HI_HT_MASK = 0x01000000 +var SQ_WAVE_STATUS_NO_VGPRS_SHIFT = 24 var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9 var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 8 var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT = 24 var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE = 4 -var SQ_WAVE_IB_STS2_WAVE64_SHIFT = 11 -var SQ_WAVE_IB_STS2_WAVE64_SIZE = 1 #if ASIC_FAMILY < CHIP_PLUM_BONITO var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8 @@ -74,16 +85,13 @@ var SQ_WAVE_TRAPSTS_ADDR_WATCH_MASK = 0x80 var SQ_WAVE_TRAPSTS_ADDR_WATCH_SHIFT = 7 var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100 var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8 -var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF -var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0 -var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10 -var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800 -var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11 -var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21 var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK = 0x800 +var SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT = 11 var SQ_WAVE_TRAPSTS_EXCP_HI_MASK = 0x7000 #if ASIC_FAMILY >= CHIP_PLUM_BONITO +var SQ_WAVE_TRAPSTS_HOST_TRAP_SHIFT = 16 var SQ_WAVE_TRAPSTS_WAVE_START_MASK = 0x20000 +var SQ_WAVE_TRAPSTS_WAVE_START_SHIFT = 17 var SQ_WAVE_TRAPSTS_WAVE_END_MASK = 0x40000 var SQ_WAVE_TRAPSTS_TRAP_AFTER_INST_MASK = 0x100000 #endif @@ -99,15 +107,27 @@ var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK = 0x003F8000 var SQ_WAVE_MODE_DEBUG_EN_MASK = 0x800 +var S_TRAPSTS_RESTORE_PART_1_SIZE = SQ_WAVE_TRAPSTS_SAVECTX_SHIFT +var S_TRAPSTS_RESTORE_PART_2_SHIFT = SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT + #if ASIC_FAMILY < CHIP_PLUM_BONITO var S_TRAPSTS_NON_MASKABLE_EXCP_MASK = SQ_WAVE_TRAPSTS_MEM_VIOL_MASK|SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK +var S_TRAPSTS_RESTORE_PART_2_SIZE = 32 - S_TRAPSTS_RESTORE_PART_2_SHIFT +var S_TRAPSTS_RESTORE_PART_3_SHIFT = 0 +var S_TRAPSTS_RESTORE_PART_3_SIZE = 0 #else var S_TRAPSTS_NON_MASKABLE_EXCP_MASK = SQ_WAVE_TRAPSTS_MEM_VIOL_MASK |\ SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK |\ SQ_WAVE_TRAPSTS_WAVE_START_MASK |\ SQ_WAVE_TRAPSTS_WAVE_END_MASK |\ SQ_WAVE_TRAPSTS_TRAP_AFTER_INST_MASK +var S_TRAPSTS_RESTORE_PART_2_SIZE = SQ_WAVE_TRAPSTS_HOST_TRAP_SHIFT - SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT +var S_TRAPSTS_RESTORE_PART_3_SHIFT = SQ_WAVE_TRAPSTS_WAVE_START_SHIFT +var S_TRAPSTS_RESTORE_PART_3_SIZE = 32 - S_TRAPSTS_RESTORE_PART_3_SHIFT #endif +var S_TRAPSTS_HWREG = HW_REG_TRAPSTS +var S_TRAPSTS_SAVE_CONTEXT_MASK = SQ_WAVE_TRAPSTS_SAVECTX_MASK +var S_TRAPSTS_SAVE_CONTEXT_SHIFT = SQ_WAVE_TRAPSTS_SAVECTX_SHIFT // bits [31:24] unused by SPI debug data var TTMP11_SAVE_REPLAY_W64H_SHIFT = 31 @@ -121,8 +141,6 @@ var TTMP11_DEBUG_TRAP_ENABLED_MASK = 0x800000 // when ADD_TID_ENABLE and 
BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 var S_SAVE_BUF_RSRC_WORD3_MISC = 0x10807FAC -var S_SAVE_PC_HI_TRAP_ID_MASK = 0x00FF0000 -var S_SAVE_PC_HI_HT_MASK = 0x01000000 var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26 @@ -182,6 +200,7 @@ var s_restore_buf_rsrc3 = ttmp11 var s_restore_size = ttmp6 var s_restore_ttmps_lo = s_restore_tmp var s_restore_ttmps_hi = s_restore_alloc_size +var s_restore_spi_init_hi_save = s_restore_exec_hi shader main asic(DEFAULT) @@ -194,13 +213,13 @@ L_JUMP_TO_RESTORE: s_branch L_RESTORE L_SKIP_RESTORE: - s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC + s_getreg_b32 s_save_status, hwreg(S_STATUS_HWREG) //save STATUS since we will change SCC // Clear SPI_PRIO: do not save with elevated priority. // Clear ECC_ERR: prevents SQC store and triggers FATAL_HALT if setreg'd. - s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK|SQ_WAVE_STATUS_ECC_ERR_MASK + s_andn2_b32 s_save_status, s_save_status, S_STATUS_ALWAYS_CLEAR_MASK - s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + s_getreg_b32 s_save_trapsts, hwreg(S_TRAPSTS_HWREG) #if SW_SA_TRAP // If ttmp1[30] is set then issue s_barrier to unblock dependent waves. @@ -215,7 +234,7 @@ L_TRAP_NO_BARRIER: s_cbranch_scc1 L_CHECK_SAVE #endif - s_and_b32 ttmp2, s_save_status, SQ_WAVE_STATUS_HALT_MASK + s_and_b32 ttmp2, s_save_status, S_STATUS_HALT_MASK s_cbranch_scc0 L_NOT_HALTED L_HALTED: @@ -224,14 +243,14 @@ L_HALTED: s_cbranch_scc1 L_FETCH_2ND_TRAP L_CHECK_SAVE: - s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK + s_and_b32 ttmp2, s_save_trapsts, S_TRAPSTS_SAVE_CONTEXT_MASK s_cbranch_scc1 L_SAVE // Wave is halted but neither host trap nor SAVECTX is raised. // Caused by instruction fetch memory violation. // Spin wait until context saved to prevent interrupt storm. s_sleep 0x10 - s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + s_getreg_b32 s_save_trapsts, hwreg(S_TRAPSTS_HWREG) s_branch L_CHECK_SAVE L_NOT_HALTED: @@ -265,15 +284,15 @@ L_CHECK_TRAP_ID: s_and_b32 ttmp2, s_save_pc_hi, S_SAVE_PC_HI_TRAP_ID_MASK s_cbranch_scc1 L_FETCH_2ND_TRAP -if SINGLE_STEP_MISSED_WORKAROUND +#if SINGLE_STEP_MISSED_WORKAROUND // Prioritize single step exception over context save. // Second-level trap will halt wave and RFE, re-entering for SAVECTX. 
s_getreg_b32 ttmp2, hwreg(HW_REG_MODE) s_and_b32 ttmp2, ttmp2, SQ_WAVE_MODE_DEBUG_EN_MASK s_cbranch_scc1 L_FETCH_2ND_TRAP -end +#endif - s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK + s_and_b32 ttmp2, s_save_trapsts, S_TRAPSTS_SAVE_CONTEXT_MASK s_cbranch_scc1 L_SAVE L_FETCH_2ND_TRAP: @@ -286,7 +305,7 @@ L_FETCH_2ND_TRAP: // ttmp12 holds SQ_WAVE_STATUS #if HAVE_SENDMSG_RTN s_sendmsg_rtn_b64 [ttmp14, ttmp15], sendmsg(MSG_RTN_GET_TMA) - s_waitcnt lgkmcnt(0) + S_WAITCNT_0 #else s_getreg_b32 ttmp14, hwreg(HW_REG_SHADER_TMA_LO) s_getreg_b32 ttmp15, hwreg(HW_REG_SHADER_TMA_HI) @@ -298,16 +317,16 @@ L_FETCH_2ND_TRAP: s_or_b32 ttmp15, ttmp15, 0xFFFF0000 L_NO_SIGN_EXTEND_TMA: - s_load_dword ttmp2, [ttmp14, ttmp15], 0x10 glc:1 // debug trap enabled flag - s_waitcnt lgkmcnt(0) + s_load_dword ttmp2, [ttmp14, ttmp15], 0x10 S_COHERENCE // debug trap enabled flag + S_WAITCNT_0 s_lshl_b32 ttmp2, ttmp2, TTMP11_DEBUG_TRAP_ENABLED_SHIFT s_andn2_b32 ttmp11, ttmp11, TTMP11_DEBUG_TRAP_ENABLED_MASK s_or_b32 ttmp11, ttmp11, ttmp2 - s_load_dwordx2 [ttmp2, ttmp3], [ttmp14, ttmp15], 0x0 glc:1 // second-level TBA - s_waitcnt lgkmcnt(0) - s_load_dwordx2 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 glc:1 // second-level TMA - s_waitcnt lgkmcnt(0) + s_load_dwordx2 [ttmp2, ttmp3], [ttmp14, ttmp15], 0x0 S_COHERENCE // second-level TBA + S_WAITCNT_0 + s_load_dwordx2 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 S_COHERENCE // second-level TMA + S_WAITCNT_0 s_and_b64 [ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3] s_cbranch_scc0 L_NO_NEXT_TRAP // second-level trap handler not been set @@ -315,9 +334,13 @@ L_NO_SIGN_EXTEND_TMA: L_NO_NEXT_TRAP: // If not caused by trap then halt wave to prevent re-entry. - s_and_b32 ttmp2, s_save_pc_hi, (S_SAVE_PC_HI_TRAP_ID_MASK|S_SAVE_PC_HI_HT_MASK) + s_and_b32 ttmp2, s_save_pc_hi, S_SAVE_PC_HI_TRAP_ID_MASK s_cbranch_scc1 L_TRAP_CASE - s_or_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_HALT_MASK + + // Host trap will not cause trap re-entry. + s_and_b32 ttmp2, s_save_pc_hi, S_SAVE_PC_HI_HT_MASK + s_cbranch_scc1 L_EXIT_TRAP + s_or_b32 s_save_status, s_save_status, S_STATUS_HALT_MASK // If the PC points to S_ENDPGM then context save will fail if STATUS.HALT is set. // Rewind the PC to prevent this from occurring. @@ -327,10 +350,6 @@ L_NO_NEXT_TRAP: s_branch L_EXIT_TRAP L_TRAP_CASE: - // Host trap will not cause trap re-entry. - s_and_b32 ttmp2, s_save_pc_hi, S_SAVE_PC_HI_HT_MASK - s_cbranch_scc1 L_EXIT_TRAP - // Advance past trap instruction to prevent re-entry. s_add_u32 ttmp0, ttmp0, 0x4 s_addc_u32 ttmp1, ttmp1, 0x0 @@ -345,14 +364,22 @@ L_EXIT_TRAP: // Restore SQ_WAVE_STATUS. s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 - s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status + s_setreg_b32 hwreg(S_STATUS_HWREG), s_save_status s_rfe_b64 [ttmp0, ttmp1] L_SAVE: + // If VGPRs have been deallocated then terminate the wavefront. + // It has no remaining program to run and cannot save without VGPRs. 
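// Note: SQ_WAVE_STATUS_NO_VGPRS_SHIFT (bit 24, defined above) is set once a
// wave has released its VGPR allocation; a trap delivered after that point
// has nothing to stage the save through, so the check below simply ends the
// wavefront rather than attempting a save that would fail.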
+#if ASIC_FAMILY == CHIP_PLUM_BONITO + s_bitcmp1_b32 s_save_status, SQ_WAVE_STATUS_NO_VGPRS_SHIFT + s_cbranch_scc0 L_HAVE_VGPRS + s_endpgm +L_HAVE_VGPRS: +#endif s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] s_mov_b32 s_save_tmp, 0 - s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit + s_setreg_b32 hwreg(S_TRAPSTS_HWREG, S_TRAPSTS_SAVE_CONTEXT_SHIFT, 1), s_save_tmp //clear saveCtx bit #if HAVE_XNACK save_and_clear_ib_sts(s_save_tmp, s_save_trapsts) @@ -377,7 +404,7 @@ L_SLEEP: s_sleep 0x2 s_cbranch_execz L_SLEEP #else - s_waitcnt lgkmcnt(0) + S_WAITCNT_0 #endif // Save first_wave flag so we can clear high bits of save address. @@ -399,7 +426,7 @@ L_SLEEP: s_and_b32 s_save_ttmps_hi, exec_hi, 0xFFFF s_mov_b32 exec_lo, 0xFFFFFFFF s_mov_b32 exec_hi, 0xFFFFFFFF - global_store_dword_addtid v0, [s_save_ttmps_lo, s_save_ttmps_hi] slc:1 glc:1 + global_store_dword_addtid v0, [s_save_ttmps_lo, s_save_ttmps_hi] V_COHERENCE v_mov_b32 v0, 0x0 s_mov_b32 exec_lo, s_save_ttmps_lo s_mov_b32 exec_hi, s_save_ttmps_hi @@ -407,7 +434,7 @@ L_SLEEP: // Save trap temporaries 4-11, 13 initialized by SPI debug dispatch logic // ttmp SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)+0x40 - get_wave_size(s_save_ttmps_hi) + get_wave_size2(s_save_ttmps_hi) get_vgpr_size_bytes(s_save_ttmps_lo, s_save_ttmps_hi) get_svgpr_size_bytes(s_save_ttmps_hi) s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_ttmps_hi @@ -431,15 +458,15 @@ L_SLEEP: s_mov_b32 exec_lo, 0x3FFF s_mov_b32 exec_hi, 0x0 - global_store_dword_addtid v0, [s_save_ttmps_lo, s_save_ttmps_hi] inst_offset:0x40 slc:1 glc:1 + global_store_dword_addtid v0, [s_save_ttmps_lo, s_save_ttmps_hi] inst_offset:0x40 V_COHERENCE v_readlane_b32 ttmp14, v0, 0xE v_readlane_b32 ttmp15, v0, 0xF s_mov_b32 exec_lo, ttmp14 s_mov_b32 exec_hi, ttmp15 #else - s_store_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x50 glc:1 - s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x60 glc:1 - s_store_dword ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x74 glc:1 + s_store_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x50 S_COHERENCE + s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x60 S_COHERENCE + s_store_dword ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x74 S_COHERENCE #endif /* setup Resource Constants */ @@ -453,7 +480,7 @@ L_SLEEP: /* global mem offset */ s_mov_b32 s_save_mem_offset, 0x0 - get_wave_size(s_wave_size) + get_wave_size2(s_wave_size) #if HAVE_XNACK // Save and clear vector XNACK state late to free up SGPRs.
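// Note: the "vector XNACK state" referred to above appears to be
// HW_REG_SHADER_XNACK_MASK (its restore via s_setreg shows up later in
// L_RESTORE_HWREG); deferring its capture leaves the SGPR that will hold it
// free for the preceding save steps.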
@@ -488,11 +515,11 @@ L_SAVE_FIRST_VGPRS32_WITH_TCP: #endif #if !NO_SQC_STORE - buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE #endif - buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128 - buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*2 - buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*3 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128*3 s_branch L_SAVE_HWREG L_SAVE_4VGPR_WAVE64: @@ -511,11 +538,11 @@ L_SAVE_FIRST_VGPRS64_WITH_TCP: #endif #if !NO_SQC_STORE - buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE #endif - buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 - buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 - buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256*3 /* save HW registers */ @@ -543,7 +570,7 @@ L_SAVE_HWREG: write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset) write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset) - s_getreg_b32 s_save_tmp, hwreg(HW_REG_TRAPSTS) + s_getreg_b32 s_save_tmp, hwreg(S_TRAPSTS_HWREG) write_hwreg_to_mem(s_save_tmp, s_save_buf_rsrc0, s_save_mem_offset) // Not used on Sienna_Cichlid but keep layout same for debugger. @@ -562,7 +589,7 @@ L_SAVE_HWREG: // Write HWREGs with 16 VGPR lanes. TTMPs occupy space after this. s_mov_b32 exec_lo, 0xFFFF s_mov_b32 exec_hi, 0x0 - buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE // Write SGPRs with 32 VGPR lanes. This works in wave32 and wave64 mode. s_mov_b32 exec_lo, 0xFFFFFFFF @@ -605,7 +632,7 @@ L_SAVE_SGPR_LOOP: s_cmp_eq_u32 ttmp13, 0x20 //have 32 VGPR lanes filled? 
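// Note: ttmp13 counts how many lanes of v2 already hold staged SGPRs; each
// loop pass banks 16, so after two passes (0x20 lanes) the store below
// flushes 32 lanes * 4 bytes = 0x80 bytes and the counter resets to zero.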
s_cbranch_scc0 L_SAVE_SGPR_SKIP_TCP_STORE - buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE s_add_u32 s_save_mem_offset, s_save_mem_offset, 0x80 s_mov_b32 ttmp13, 0x0 v_mov_b32 v2, 0x0 @@ -626,7 +653,7 @@ L_SAVE_SGPR_SKIP_TCP_STORE: write_12sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) #if NO_SQC_STORE - buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE #else // restore s_save_buf_rsrc0,1 s_mov_b32 s_save_buf_rsrc0, s_save_xnack_mask @@ -656,8 +683,7 @@ L_SAVE_LDS_NORMAL: // first wave do LDS save; - s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw - s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, SQ_WAVE_LDS_ALLOC_GRANULARITY s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes // LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG) @@ -688,7 +714,7 @@ L_SAVE_LDS_W32: L_SAVE_LDS_LOOP_SQC_W32: ds_read_b32 v1, v0 - s_waitcnt 0 + S_WAITCNT_0 write_vgprs_to_mem_with_sqc_w32(v1, 1, s_save_buf_rsrc0, s_save_mem_offset) @@ -708,8 +734,8 @@ L_SAVE_LDS_WITH_TCP_W32: s_nop 0 L_SAVE_LDS_LOOP_W32: ds_read_b32 v1, v0 - s_waitcnt 0 - buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + S_WAITCNT_0 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE s_add_u32 m0, m0, s3 //every buffer_store_lds does 128 bytes s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 @@ -726,7 +752,7 @@ L_SAVE_LDS_W64: L_SAVE_LDS_LOOP_SQC_W64: ds_read_b32 v1, v0 - s_waitcnt 0 + S_WAITCNT_0 write_vgprs_to_mem_with_sqc_w64(v1, 1, s_save_buf_rsrc0, s_save_mem_offset) @@ -746,8 +772,8 @@ L_SAVE_LDS_WITH_TCP_W64: s_nop 0 L_SAVE_LDS_LOOP_W64: ds_read_b32 v1, v0 - s_waitcnt 0 - buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + S_WAITCNT_0 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 @@ -814,10 +840,10 @@ L_SAVE_VGPR_W32_LOOP: v_movrels_b32 v2, v2 //v2 = v[2+m0] v_movrels_b32 v3, v3 //v3 = v[3+m0] - buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 - buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128 - buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*2 - buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*3 + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128*3 s_add_u32 m0, m0, 4 //next vgpr index s_add_u32 s_save_mem_offset, s_save_mem_offset, 128*4 //every buffer_store_dword does 128 bytes @@ -859,10 +885,10 @@ L_SAVE_VGPR_W64_LOOP: v_movrels_b32 v2, v2 //v2 = v[2+m0] v_movrels_b32 v3, v3 //v3 = v[3+m0] - buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 - buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 - buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 - 
buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256*3 s_add_u32 m0, m0, 4 //next vgpr index s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes @@ -899,7 +925,7 @@ L_SAVE_SHARED_VGPR_WAVE64_LOOP_SQC: L_SAVE_SHARED_VGPR_WAVE64_LOOP: v_movrels_b32 v0, v0 //v0 = v[0+m0] - buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE s_add_u32 m0, m0, 1 //next vgpr index s_add_u32 s_save_mem_offset, s_save_mem_offset, 128 s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 @@ -917,7 +943,7 @@ L_RESTORE: s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC //determine it is wave32 or wave64 - get_wave_size(s_restore_size) + get_wave_size2(s_restore_size) s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK s_cbranch_scc0 L_RESTORE_VGPR @@ -937,8 +963,7 @@ L_RESTORE_LDS_NORMAL: s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero? s_cbranch_scc0 L_RESTORE_VGPR //no lds used? jump to L_RESTORE_VGPR - s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw - s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, SQ_WAVE_LDS_ALLOC_GRANULARITY s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes // LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG) @@ -962,7 +987,7 @@ L_RESTORE_LDS_LOOP_W32: buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW #else buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset - s_waitcnt vmcnt(0) + S_WAITCNT_0 ds_store_addtid_b32 v0 #endif s_add_u32 m0, m0, 128 // 128 DW @@ -976,7 +1001,7 @@ L_RESTORE_LDS_LOOP_W64: buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW #else buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset - s_waitcnt vmcnt(0) + S_WAITCNT_0 ds_store_addtid_b32 v0 #endif s_add_u32 m0, m0, 256 // 256 DW @@ -1017,11 +1042,11 @@ L_RESTORE_VGPR_NORMAL: s_cbranch_scc0 L_RESTORE_SGPR L_RESTORE_VGPR_WAVE32_LOOP: - buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 - buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128 - buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128*2 - buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128*3 - s_waitcnt vmcnt(0) + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:128 + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:128*2 + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:128*3 + S_WAITCNT_0 v_movreld_b32 v0, v0 //v[0+m0] = v0 
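// Note: v_movreld_b32 is the write-indexed twin of the v_movrels_b32 reads
// used on the save side: with m0 as the moving base, the four staging VGPRs
// just loaded are scattered back into v[m0+0..3].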
v_movreld_b32 v1, v1 v_movreld_b32 v2, v2 @@ -1032,11 +1057,11 @@ L_RESTORE_VGPR_WAVE32_LOOP: s_cbranch_scc1 L_RESTORE_VGPR_WAVE32_LOOP //VGPR restore (except v0) is complete? /* VGPR restore on v0 */ - buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 - buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128 - buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128*2 - buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128*3 - s_waitcnt vmcnt(0) + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:128 + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:128*2 + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:128*3 + S_WAITCNT_0 s_branch L_RESTORE_SGPR @@ -1051,11 +1076,11 @@ L_RESTORE_VGPR_WAVE64: s_cbranch_scc0 L_RESTORE_SHARED_VGPR L_RESTORE_VGPR_WAVE64_LOOP: - buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 - buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256 - buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2 - buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3 - s_waitcnt vmcnt(0) + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:256 + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:256*2 + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:256*3 + S_WAITCNT_0 v_movreld_b32 v0, v0 //v[0+m0] = v0 v_movreld_b32 v1, v1 v_movreld_b32 v2, v2 @@ -1077,8 +1102,8 @@ L_RESTORE_SHARED_VGPR: s_mov_b32 exec_lo, 0xFFFFFFFF s_mov_b32 exec_hi, 0x00000000 L_RESTORE_SHARED_VGPR_WAVE64_LOOP: - buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 - s_waitcnt vmcnt(0) + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE + S_WAITCNT_0 v_movreld_b32 v0, v0 //v[0+m0] = v0 s_add_u32 m0, m0, 1 //next vgpr index s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128 @@ -1089,11 +1114,11 @@ L_RESTORE_SHARED_VGPR_WAVE64_LOOP: /* VGPR restore on v0 */ L_RESTORE_V0: - buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 - buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256 - buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2 - buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3 - s_waitcnt vmcnt(0) + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:256 + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:256*2 + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:256*3 + S_WAITCNT_0 /* restore SGPRs */ //will be 2+8+16*6 @@ -1110,7 +1135,7 @@ L_RESTORE_SGPR: s_mov_b32 m0, s_sgpr_save_num read_4sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) - s_waitcnt 
lgkmcnt(0) + S_WAITCNT_0 s_sub_u32 m0, m0, 4 // Restore from S[0] to S[104] s_nop 0 // hazard SALU M0=> S_MOVREL @@ -1119,7 +1144,7 @@ L_RESTORE_SGPR: s_movreld_b64 s2, s2 read_8sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) - s_waitcnt lgkmcnt(0) + S_WAITCNT_0 s_sub_u32 m0, m0, 8 // Restore from S[0] to S[96] s_nop 0 // hazard SALU M0=> S_MOVREL @@ -1131,7 +1156,7 @@ L_RESTORE_SGPR: L_RESTORE_SGPR_LOOP: read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) - s_waitcnt lgkmcnt(0) + S_WAITCNT_0 s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0] s_nop 0 // hazard SALU M0=> S_MOVREL @@ -1173,12 +1198,12 @@ L_RESTORE_HWREG: read_hwreg_from_mem(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset) read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset) read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset) - s_waitcnt lgkmcnt(0) + S_WAITCNT_0 s_setreg_b32 hwreg(HW_REG_SHADER_FLAT_SCRATCH_LO), s_restore_flat_scratch read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset) - s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS + S_WAITCNT_0 s_setreg_b32 hwreg(HW_REG_SHADER_FLAT_SCRATCH_HI), s_restore_flat_scratch @@ -1186,16 +1211,21 @@ L_RESTORE_HWREG: s_mov_b32 exec_lo, s_restore_exec_lo s_mov_b32 exec_hi, s_restore_exec_hi - s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts - s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0 - #if HAVE_XNACK s_setreg_b32 hwreg(HW_REG_SHADER_XNACK_MASK), s_restore_xnack_mask #endif - s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts - s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT - s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0 + // {TRAPSTS/EXCP_FLAG_PRIV}.SAVE_CONTEXT and HOST_TRAP may have changed. + // Only restore the other fields to avoid clobbering them. 
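// Note: from the part sizes defined earlier, part 1 rewrites only the bits
// below SAVECTX, part 2 starts at bit 11 (ILLEGAL_INST) and, on families
// where a part 3 exists, stops below bit 16 (HOST_TRAP, a width of
// 16 - 11 = 5 bits), and part 3 resumes at bit 17 (WAVE_START); the two
// live bits are therefore never overwritten.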
+ s_setreg_b32 hwreg(S_TRAPSTS_HWREG, 0, S_TRAPSTS_RESTORE_PART_1_SIZE), s_restore_trapsts + s_lshr_b32 s_restore_trapsts, s_restore_trapsts, S_TRAPSTS_RESTORE_PART_2_SHIFT + s_setreg_b32 hwreg(S_TRAPSTS_HWREG, S_TRAPSTS_RESTORE_PART_2_SHIFT, S_TRAPSTS_RESTORE_PART_2_SIZE), s_restore_trapsts + +if S_TRAPSTS_RESTORE_PART_3_SIZE > 0 + s_lshr_b32 s_restore_trapsts, s_restore_trapsts, S_TRAPSTS_RESTORE_PART_3_SHIFT - S_TRAPSTS_RESTORE_PART_2_SHIFT + s_setreg_b32 hwreg(S_TRAPSTS_HWREG, S_TRAPSTS_RESTORE_PART_3_SHIFT, S_TRAPSTS_RESTORE_PART_3_SIZE), s_restore_trapsts +end + s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode // Restore trap temporaries 4-11, 13 initialized by SPI debug dispatch logic @@ -1207,10 +1237,10 @@ L_RESTORE_HWREG: s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_buf_rsrc0 s_addc_u32 s_restore_ttmps_hi, s_restore_buf_rsrc1, 0x0 s_and_b32 s_restore_ttmps_hi, s_restore_ttmps_hi, 0xFFFF - s_load_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x50 glc:1 - s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x60 glc:1 - s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x74 glc:1 - s_waitcnt lgkmcnt(0) + s_load_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x50 S_COHERENCE + s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x60 S_COHERENCE + s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x74 S_COHERENCE + S_WAITCNT_0 #if HAVE_XNACK restore_ib_sts(s_restore_tmp, s_restore_m0) @@ -1232,7 +1262,8 @@ L_RESTORE_HWREG: L_RETURN_WITHOUT_PRIV: #endif - s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu + s_setreg_b32 hwreg(S_STATUS_HWREG), s_restore_status // SCC is included, which is changed by previous salu + s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution L_END_PGM: @@ -1247,7 +1278,7 @@ function write_hwreg_to_mem(s, s_rsrc, s_mem_offset) #else s_mov_b32 exec_lo, m0 s_mov_b32 m0, s_mem_offset - s_buffer_store_dword s, s_rsrc, m0 glc:1 + s_buffer_store_dword s, s_rsrc, m0 S_COHERENCE s_add_u32 s_mem_offset, s_mem_offset, 4 s_mov_b32 m0, exec_lo #endif @@ -1262,10 +1293,10 @@ function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset) s_add_u32 ttmp13, ttmp13, 0x1 end #else - s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1 - s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1 - s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1 - s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1 + s_buffer_store_dwordx4 s[0], s_rsrc, 0 S_COHERENCE + s_buffer_store_dwordx4 s[4], s_rsrc, 16 S_COHERENCE + s_buffer_store_dwordx4 s[8], s_rsrc, 32 S_COHERENCE + s_buffer_store_dwordx4 s[12], s_rsrc, 48 S_COHERENCE s_add_u32 s_rsrc[0], s_rsrc[0], 4*16 s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 #endif @@ -1279,32 +1310,32 @@ function write_12sgpr_to_mem(s, s_rsrc, s_mem_offset) s_add_u32 ttmp13, ttmp13, 0x1 end #else - s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1 - s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1 - s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1 + s_buffer_store_dwordx4 s[0], s_rsrc, 0 S_COHERENCE + s_buffer_store_dwordx4 s[4], s_rsrc, 16 S_COHERENCE + s_buffer_store_dwordx4 s[8], s_rsrc, 32 S_COHERENCE s_add_u32 s_rsrc[0], s_rsrc[0], 4*12 s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 #endif end function read_hwreg_from_mem(s, s_rsrc, s_mem_offset) - s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1 + s_buffer_load_dword s, s_rsrc, s_mem_offset S_COHERENCE s_add_u32 
s_mem_offset, s_mem_offset, 4 end function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset) s_sub_u32 s_mem_offset, s_mem_offset, 4*16 - s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset glc:1 + s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset S_COHERENCE end function read_8sgpr_from_mem(s, s_rsrc, s_mem_offset) s_sub_u32 s_mem_offset, s_mem_offset, 4*8 - s_buffer_load_dwordx8 s, s_rsrc, s_mem_offset glc:1 + s_buffer_load_dwordx8 s, s_rsrc, s_mem_offset S_COHERENCE end function read_4sgpr_from_mem(s, s_rsrc, s_mem_offset) s_sub_u32 s_mem_offset, s_mem_offset, 4*4 - s_buffer_load_dwordx4 s, s_rsrc, s_mem_offset glc:1 + s_buffer_load_dwordx4 s, s_rsrc, s_mem_offset S_COHERENCE end #if SAVE_AFTER_XNACK_ERROR @@ -1345,11 +1376,6 @@ function write_vgprs_to_mem_with_sqc_w64(vgpr0, n_vgprs, s_rsrc, s_mem_offset) end #endif -function get_lds_size_bytes(s_lds_size_byte) - s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) - s_lshl_b32 s_lds_size_byte, s_lds_size_byte, 8 //LDS size in dwords = lds_size * 64 *4Bytes // granularity 64DW -end - function get_vgpr_size_bytes(s_vgpr_size_byte, s_size) s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1 @@ -1375,11 +1401,12 @@ function get_hwreg_size_bytes return 128 end -function get_wave_size(s_reg) +function get_wave_size2(s_reg) s_getreg_b32 s_reg, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE) s_lshl_b32 s_reg, s_reg, S_WAVE_SIZE end +#if HAVE_XNACK function save_and_clear_ib_sts(tmp1, tmp2) // Preserve and clear scalar XNACK state before issuing scalar loads. // Save IB_STS.REPLAY_W64H[25], RCNT[21:16], FIRST_REPLAY[15] into @@ -1404,3 +1431,4 @@ function restore_ib_sts(tmp1, tmp2) s_or_b32 tmp1, tmp1, tmp2 s_setreg_b32 hwreg(HW_REG_IB_STS), tmp1 end +#endif diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm new file mode 100644 index 000000000000..5a1a1b1f897f --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm @@ -0,0 +1,1136 @@ +/* + * Copyright 2018 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +/* To compile this assembly code: + * + * gfx12: + * cpp -DASIC_FAMILY=CHIP_GFX12 cwsr_trap_handler_gfx12.asm -P -o gfx12.sp3 + * sp3 gfx12.sp3 -hex gfx12.hex + */ + +#define CHIP_GFX12 37 + +#define SINGLE_STEP_MISSED_WORKAROUND 1 //workaround for lost TRAP_AFTER_INST exception when SAVECTX raised +#define HAVE_VALU_SGPR_HAZARD (ASIC_FAMILY == CHIP_GFX12) + +var SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK = 0x4 +var SQ_WAVE_STATE_PRIV_SCC_SHIFT = 9 +var SQ_WAVE_STATE_PRIV_SYS_PRIO_MASK = 0xC00 +var SQ_WAVE_STATE_PRIV_HALT_MASK = 0x4000 +var SQ_WAVE_STATE_PRIV_POISON_ERR_MASK = 0x8000 +var SQ_WAVE_STATE_PRIV_POISON_ERR_SHIFT = 15 +var SQ_WAVE_STATUS_WAVE64_SHIFT = 29 +var SQ_WAVE_STATUS_WAVE64_SIZE = 1 +var SQ_WAVE_STATUS_NO_VGPRS_SHIFT = 24 +var SQ_WAVE_STATE_PRIV_ALWAYS_CLEAR_MASK = SQ_WAVE_STATE_PRIV_SYS_PRIO_MASK|SQ_WAVE_STATE_PRIV_POISON_ERR_MASK +var S_SAVE_PC_HI_TRAP_ID_MASK = 0xF0000000 + +var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 +var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9 +var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 8 +var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 12 +var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT = 24 +var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE = 4 +var SQ_WAVE_LDS_ALLOC_GRANULARITY = 9 + +var SQ_WAVE_EXCP_FLAG_PRIV_ADDR_WATCH_MASK = 0xF +var SQ_WAVE_EXCP_FLAG_PRIV_MEM_VIOL_MASK = 0x10 +var SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_SHIFT = 5 +var SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_MASK = 0x20 +var SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_MASK = 0x40 +var SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT = 6 +var SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK = 0x80 +var SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_SHIFT = 7 +var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_MASK = 0x100 +var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT = 8 +var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_END_MASK = 0x200 +var SQ_WAVE_EXCP_FLAG_PRIV_TRAP_AFTER_INST_MASK = 0x800 +var SQ_WAVE_TRAP_CTRL_ADDR_WATCH_MASK = 0x80 +var SQ_WAVE_TRAP_CTRL_TRAP_AFTER_INST_MASK = 0x200 + +var SQ_WAVE_EXCP_FLAG_PRIV_NON_MASKABLE_EXCP_MASK= SQ_WAVE_EXCP_FLAG_PRIV_MEM_VIOL_MASK |\ + SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_MASK |\ + SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK |\ + SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_MASK |\ + SQ_WAVE_EXCP_FLAG_PRIV_WAVE_END_MASK |\ + SQ_WAVE_EXCP_FLAG_PRIV_TRAP_AFTER_INST_MASK +var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_1_SIZE = SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_SHIFT +var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SHIFT = SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT +var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SIZE = SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_SHIFT - SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT +var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SHIFT = SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT +var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SIZE = 32 - SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SHIFT +var BARRIER_STATE_SIGNAL_OFFSET = 16 +var BARRIER_STATE_VALID_OFFSET = 0 + +var TTMP11_DEBUG_TRAP_ENABLED_SHIFT = 23 +var TTMP11_DEBUG_TRAP_ENABLED_MASK = 0x800000 + +// SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] +// when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE +var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 +var S_SAVE_BUF_RSRC_WORD3_MISC = 0x10807FAC +var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 +var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26 + +var S_SAVE_PC_HI_FIRST_WAVE_MASK = 0x80000000 +var S_SAVE_PC_HI_FIRST_WAVE_SHIFT = 31 + +var s_sgpr_save_num = 108 + +var s_save_spi_init_lo = exec_lo +var s_save_spi_init_hi = exec_hi +var s_save_pc_lo = ttmp0 +var s_save_pc_hi = ttmp1 +var s_save_exec_lo = ttmp2 +var s_save_exec_hi = ttmp3 
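// Note: every symbolic name in this block must map onto a ttmp, since no
// SGPR or VGPR may be clobbered before it has been saved; aliases further
// down (for example s_save_alloc_size = s_save_excp_flag_priv) reuse a
// register once its first value has been consumed.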
+var s_save_state_priv = ttmp12 +var s_save_excp_flag_priv = ttmp15 +var s_save_xnack_mask = s_save_excp_flag_priv +var s_wave_size = ttmp7 +var s_save_buf_rsrc0 = ttmp8 +var s_save_buf_rsrc1 = ttmp9 +var s_save_buf_rsrc2 = ttmp10 +var s_save_buf_rsrc3 = ttmp11 +var s_save_mem_offset = ttmp4 +var s_save_alloc_size = s_save_excp_flag_priv +var s_save_tmp = ttmp14 +var s_save_m0 = ttmp5 +var s_save_ttmps_lo = s_save_tmp +var s_save_ttmps_hi = s_save_excp_flag_priv + +var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE +var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC + +var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 +var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26 +var S_WAVE_SIZE = 25 + +var s_restore_spi_init_lo = exec_lo +var s_restore_spi_init_hi = exec_hi +var s_restore_mem_offset = ttmp12 +var s_restore_alloc_size = ttmp3 +var s_restore_tmp = ttmp2 +var s_restore_mem_offset_save = s_restore_tmp +var s_restore_m0 = s_restore_alloc_size +var s_restore_mode = ttmp7 +var s_restore_flat_scratch = s_restore_tmp +var s_restore_pc_lo = ttmp0 +var s_restore_pc_hi = ttmp1 +var s_restore_exec_lo = ttmp4 +var s_restore_exec_hi = ttmp5 +var s_restore_state_priv = ttmp14 +var s_restore_excp_flag_priv = ttmp15 +var s_restore_xnack_mask = ttmp13 +var s_restore_buf_rsrc0 = ttmp8 +var s_restore_buf_rsrc1 = ttmp9 +var s_restore_buf_rsrc2 = ttmp10 +var s_restore_buf_rsrc3 = ttmp11 +var s_restore_size = ttmp6 +var s_restore_ttmps_lo = s_restore_tmp +var s_restore_ttmps_hi = s_restore_alloc_size +var s_restore_spi_init_hi_save = s_restore_exec_hi + +shader main + asic(DEFAULT) + type(CS) + wave_size(32) + + s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save + +L_JUMP_TO_RESTORE: + s_branch L_RESTORE + +L_SKIP_RESTORE: + s_getreg_b32 s_save_state_priv, hwreg(HW_REG_WAVE_STATE_PRIV) //save STATUS since we will change SCC + + // Clear SPI_PRIO: do not save with elevated priority. + // Clear ECC_ERR: prevents SQC store and triggers FATAL_HALT if setreg'd. + s_andn2_b32 s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_ALWAYS_CLEAR_MASK + + s_getreg_b32 s_save_excp_flag_priv, hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV) + + s_and_b32 ttmp2, s_save_state_priv, SQ_WAVE_STATE_PRIV_HALT_MASK + s_cbranch_scc0 L_NOT_HALTED + +L_HALTED: + // Host trap may occur while wave is halted. + s_and_b32 ttmp2, s_save_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK + s_cbranch_scc1 L_FETCH_2ND_TRAP + +L_CHECK_SAVE: + s_and_b32 ttmp2, s_save_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_MASK + s_cbranch_scc1 L_SAVE + + // Wave is halted but neither host trap nor SAVECTX is raised. + // Caused by instruction fetch memory violation. + // Spin wait until context saved to prevent interrupt storm. + s_sleep 0x10 + s_getreg_b32 s_save_excp_flag_priv, hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV) + s_branch L_CHECK_SAVE + +L_NOT_HALTED: + // Let second-level handle non-SAVECTX exception or trap. + // Any concurrent SAVECTX will be handled upon re-entry once halted. + + // Check non-maskable exceptions. memory_violation, illegal_instruction + // and xnack_error exceptions always cause the wave to enter the trap + // handler. + s_and_b32 ttmp2, s_save_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_NON_MASKABLE_EXCP_MASK + s_cbranch_scc1 L_FETCH_2ND_TRAP + + // Check for maskable exceptions in trapsts.excp and trapsts.excp_hi. + // Maskable exceptions only cause the wave to enter the trap handler if + // their respective bit in mode.excp_en is set. 
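// Note: on this family the per-exception enable bits live in
// HW_REG_WAVE_TRAP_CTRL rather than MODE.EXCP_EN, so the code below masks
// the raised flags (EXCP_FLAG_USER) against the enabled set (TRAP_CTRL).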
+ s_getreg_b32 ttmp2, hwreg(HW_REG_WAVE_EXCP_FLAG_USER) + s_and_b32 ttmp3, s_save_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_ADDR_WATCH_MASK + s_cbranch_scc0 L_NOT_ADDR_WATCH + s_or_b32 ttmp2, ttmp2, SQ_WAVE_TRAP_CTRL_ADDR_WATCH_MASK + +L_NOT_ADDR_WATCH: + s_getreg_b32 ttmp3, hwreg(HW_REG_WAVE_TRAP_CTRL) + s_and_b32 ttmp2, ttmp3, ttmp2 + s_cbranch_scc1 L_FETCH_2ND_TRAP + +L_CHECK_TRAP_ID: + // Check trap_id != 0 + s_and_b32 ttmp2, s_save_pc_hi, S_SAVE_PC_HI_TRAP_ID_MASK + s_cbranch_scc1 L_FETCH_2ND_TRAP + +#if SINGLE_STEP_MISSED_WORKAROUND + // Prioritize single step exception over context save. + // Second-level trap will halt wave and RFE, re-entering for SAVECTX. + // WAVE_TRAP_CTRL is already in ttmp3. + s_and_b32 ttmp3, ttmp3, SQ_WAVE_TRAP_CTRL_TRAP_AFTER_INST_MASK + s_cbranch_scc1 L_FETCH_2ND_TRAP +#endif + + s_and_b32 ttmp2, s_save_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_MASK + s_cbranch_scc1 L_SAVE + +L_FETCH_2ND_TRAP: + // Read second-level TBA/TMA from first-level TMA and jump if available. + // ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data) + // ttmp12 holds SQ_WAVE_STATUS + s_sendmsg_rtn_b64 [ttmp14, ttmp15], sendmsg(MSG_RTN_GET_TMA) + s_wait_idle + s_lshl_b64 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 + + s_bitcmp1_b32 ttmp15, 0xF + s_cbranch_scc0 L_NO_SIGN_EXTEND_TMA + s_or_b32 ttmp15, ttmp15, 0xFFFF0000 +L_NO_SIGN_EXTEND_TMA: + + s_load_dword ttmp2, [ttmp14, ttmp15], 0x10 scope:SCOPE_SYS // debug trap enabled flag + s_wait_idle + s_lshl_b32 ttmp2, ttmp2, TTMP11_DEBUG_TRAP_ENABLED_SHIFT + s_andn2_b32 ttmp11, ttmp11, TTMP11_DEBUG_TRAP_ENABLED_MASK + s_or_b32 ttmp11, ttmp11, ttmp2 + + s_load_dwordx2 [ttmp2, ttmp3], [ttmp14, ttmp15], 0x0 scope:SCOPE_SYS // second-level TBA + s_wait_idle + s_load_dwordx2 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 scope:SCOPE_SYS // second-level TMA + s_wait_idle + + s_and_b64 [ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3] + s_cbranch_scc0 L_NO_NEXT_TRAP // second-level trap handler not been set + s_setpc_b64 [ttmp2, ttmp3] // jump to second-level trap handler + +L_NO_NEXT_TRAP: + // If not caused by trap then halt wave to prevent re-entry. + s_and_b32 ttmp2, s_save_pc_hi, S_SAVE_PC_HI_TRAP_ID_MASK + s_cbranch_scc1 L_TRAP_CASE + + // Host trap will not cause trap re-entry. + s_getreg_b32 ttmp2, hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV) + s_and_b32 ttmp2, ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK + s_cbranch_scc1 L_EXIT_TRAP + s_or_b32 s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_HALT_MASK + + // If the PC points to S_ENDPGM then context save will fail if STATE_PRIV.HALT is set. + // Rewind the PC to prevent this from occurring. + s_sub_u32 ttmp0, ttmp0, 0x8 + s_subb_u32 ttmp1, ttmp1, 0x0 + + s_branch L_EXIT_TRAP + +L_TRAP_CASE: + // Advance past trap instruction to prevent re-entry. + s_add_u32 ttmp0, ttmp0, 0x4 + s_addc_u32 ttmp1, ttmp1, 0x0 + +L_EXIT_TRAP: + s_and_b32 ttmp1, ttmp1, 0xFFFF + + // Restore SQ_WAVE_STATUS. + s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 + s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 + + // STATE_PRIV.BARRIER_COMPLETE may have changed since we read it. + // Only restore fields which the trap handler changes. 
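// Note: with SQ_WAVE_STATE_PRIV_SCC_SHIFT = 9 and POISON_ERR_SHIFT = 15
// (see the vars above), the setreg below writes 15 - 9 + 1 = 7 bits, i.e.
// [15:9], leaving BARRIER_COMPLETE (mask 0x4) and the other low fields
// untouched.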
+ s_lshr_b32 s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_SCC_SHIFT + s_setreg_b32 hwreg(HW_REG_WAVE_STATE_PRIV, SQ_WAVE_STATE_PRIV_SCC_SHIFT, \ + SQ_WAVE_STATE_PRIV_POISON_ERR_SHIFT - SQ_WAVE_STATE_PRIV_SCC_SHIFT + 1), s_save_state_priv + + s_rfe_b64 [ttmp0, ttmp1] + +L_SAVE: + // If VGPRs have been deallocated then terminate the wavefront. + // It has no remaining program to run and cannot save without VGPRs. + s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_STATUS) + s_bitcmp1_b32 s_save_tmp, SQ_WAVE_STATUS_NO_VGPRS_SHIFT + s_cbranch_scc0 L_HAVE_VGPRS + s_endpgm +L_HAVE_VGPRS: + + s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] + s_mov_b32 s_save_tmp, 0 + s_setreg_b32 hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV, SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_SHIFT, 1), s_save_tmp //clear saveCtx bit + + /* inform SPI the readiness and wait for SPI's go signal */ + s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI + s_mov_b32 s_save_exec_hi, exec_hi + s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive + + s_sendmsg_rtn_b64 [exec_lo, exec_hi], sendmsg(MSG_RTN_SAVE_WAVE) + s_wait_idle + + // Save first_wave flag so we can clear high bits of save address. + s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK + s_lshl_b32 s_save_tmp, s_save_tmp, (S_SAVE_PC_HI_FIRST_WAVE_SHIFT - S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT) + s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp + + // Trap temporaries must be saved via VGPR but all VGPRs are in use. + // There is no ttmp space to hold the resource constant for VGPR save. + // Save v0 by itself since it requires only two SGPRs. + s_mov_b32 s_save_ttmps_lo, exec_lo + s_and_b32 s_save_ttmps_hi, exec_hi, 0xFFFF + s_mov_b32 exec_lo, 0xFFFFFFFF + s_mov_b32 exec_hi, 0xFFFFFFFF + global_store_dword_addtid v0, [s_save_ttmps_lo, s_save_ttmps_hi] scope:SCOPE_SYS + v_mov_b32 v0, 0x0 + s_mov_b32 exec_lo, s_save_ttmps_lo + s_mov_b32 exec_hi, s_save_ttmps_hi + + // Save trap temporaries 4-11, 13 initialized by SPI debug dispatch logic + // ttmp SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)+0x40 + get_wave_size2(s_save_ttmps_hi) + get_vgpr_size_bytes(s_save_ttmps_lo, s_save_ttmps_hi) + get_svgpr_size_bytes(s_save_ttmps_hi) + s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_ttmps_hi + s_and_b32 s_save_ttmps_hi, s_save_spi_init_hi, 0xFFFF + s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, get_sgpr_size_bytes() + s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_spi_init_lo + s_addc_u32 s_save_ttmps_hi, s_save_ttmps_hi, 0x0 + + v_writelane_b32 v0, ttmp4, 0x4 + v_writelane_b32 v0, ttmp5, 0x5 + v_writelane_b32 v0, ttmp6, 0x6 + v_writelane_b32 v0, ttmp7, 0x7 + v_writelane_b32 v0, ttmp8, 0x8 + v_writelane_b32 v0, ttmp9, 0x9 + v_writelane_b32 v0, ttmp10, 0xA + v_writelane_b32 v0, ttmp11, 0xB + v_writelane_b32 v0, ttmp13, 0xD + v_writelane_b32 v0, exec_lo, 0xE + v_writelane_b32 v0, exec_hi, 0xF + valu_sgpr_hazard() + + s_mov_b32 exec_lo, 0x3FFF + s_mov_b32 exec_hi, 0x0 + global_store_dword_addtid v0, [s_save_ttmps_lo, s_save_ttmps_hi] offset:0x40 scope:SCOPE_SYS + v_readlane_b32 ttmp14, v0, 0xE + v_readlane_b32 ttmp15, v0, 0xF + s_mov_b32 exec_lo, ttmp14 + s_mov_b32 exec_hi, ttmp15 + + /* setup Resource Constants */ + s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo + s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE + s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not
necessarily initialized + s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC + + s_mov_b32 s_save_m0, m0 + + /* global mem offset */ + s_mov_b32 s_save_mem_offset, 0x0 + get_wave_size2(s_wave_size) + + /* save first 4 VGPRs, needed for SGPR save */ + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on + s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE + s_and_b32 m0, m0, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_ENABLE_SAVE_4VGPR_EXEC_HI + s_mov_b32 exec_hi, 0x00000000 + s_branch L_SAVE_4VGPR_WAVE32 +L_ENABLE_SAVE_4VGPR_EXEC_HI: + s_mov_b32 exec_hi, 0xFFFFFFFF + s_branch L_SAVE_4VGPR_WAVE64 +L_SAVE_4VGPR_WAVE32: + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + + // VGPR Allocated in 4-GPR granularity + + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:128 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:128*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:128*3 + s_branch L_SAVE_HWREG + +L_SAVE_4VGPR_WAVE64: + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + + // VGPR Allocated in 4-GPR granularity + + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:256 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:256*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:256*3 + + /* save HW registers */ + +L_SAVE_HWREG: + // HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR) + get_vgpr_size_bytes(s_save_mem_offset, s_wave_size) + get_svgpr_size_bytes(s_save_tmp) + s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp + s_add_u32 s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes() + + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + + v_mov_b32 v0, 0x0 //Offset[31:0] from buffer resource + v_mov_b32 v1, 0x0 //Offset[63:32] from buffer resource + v_mov_b32 v2, 0x0 //Set of SGPRs for TCP store + + // Ensure no further changes to barrier or LDS state. + // STATE_PRIV.BARRIER_COMPLETE may change up to this point. + s_barrier_signal -2 + s_barrier_wait -2 + + // Re-read final state of BARRIER_COMPLETE field for save.
+ s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_STATE_PRIV)
+ s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK
+ s_andn2_b32 s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK
+ s_or_b32 s_save_state_priv, s_save_state_priv, s_save_tmp
+
+ s_andn2_b32 s_save_tmp, s_save_pc_hi, S_SAVE_PC_HI_FIRST_WAVE_MASK
+ v_writelane_b32 v2, s_save_m0, 0x0
+ v_writelane_b32 v2, s_save_pc_lo, 0x1
+ v_writelane_b32 v2, s_save_tmp, 0x2
+ v_writelane_b32 v2, s_save_exec_lo, 0x3
+ v_writelane_b32 v2, s_save_exec_hi, 0x4
+ v_writelane_b32 v2, s_save_state_priv, 0x5
+ v_writelane_b32 v2, s_save_xnack_mask, 0x7
+ valu_sgpr_hazard()
+
+ s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV)
+ v_writelane_b32 v2, s_save_tmp, 0x6
+
+ s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_MODE)
+ v_writelane_b32 v2, s_save_tmp, 0x8
+
+ s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_SCRATCH_BASE_LO)
+ v_writelane_b32 v2, s_save_tmp, 0x9
+
+ s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_SCRATCH_BASE_HI)
+ v_writelane_b32 v2, s_save_tmp, 0xA
+
+ s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_EXCP_FLAG_USER)
+ v_writelane_b32 v2, s_save_tmp, 0xB
+
+ s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_TRAP_CTRL)
+ v_writelane_b32 v2, s_save_tmp, 0xC
+
+ s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_STATUS)
+ v_writelane_b32 v2, s_save_tmp, 0xD
+
+ s_get_barrier_state s_save_tmp, -1
+ s_wait_kmcnt (0)
+ v_writelane_b32 v2, s_save_tmp, 0xE
+ valu_sgpr_hazard()
+
+ // Write HWREGs with 16 VGPR lanes. TTMPs occupy space after this.
+ s_mov_b32 exec_lo, 0xFFFF
+ s_mov_b32 exec_hi, 0x0
+ buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS
+
+ // Write SGPRs with 32 VGPR lanes. This works in wave32 and wave64 mode.
+ s_mov_b32 exec_lo, 0xFFFFFFFF
+
+ /* save SGPRs */
+ // Save SGPRs before the LDS save so that s0 to s4 can be used during the LDS save...
+
+ // SGPR SR memory offset : size(VGPR)+size(SVGPR)
+ get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
+ get_svgpr_size_bytes(s_save_tmp)
+ s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
+ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+
+ s_mov_b32 ttmp13, 0x0 //next VGPR lane to copy SGPR into
+
+ s_mov_b32 m0, 0x0 //SGPR initial index value =0
+ s_nop 0x0 //Manually inserted wait states
+L_SAVE_SGPR_LOOP:
+ // SGPR is allocated in 16 SGPR granularity
+ s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0]
+ s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0]
+ s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0]
+ s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0]
+ s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0]
+ s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0]
+ s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0]
+ s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0]
+
+ s_cmp_eq_u32 ttmp13, 0x0
+ s_cbranch_scc0 L_WRITE_V2_SECOND_HALF
+ write_16sgpr_to_v2(s0, 0x0)
+ s_branch L_SAVE_SGPR_SKIP_TCP_STORE
+L_WRITE_V2_SECOND_HALF:
+ write_16sgpr_to_v2(s0, 0x10)
+
+ buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS
+ s_add_u32 s_save_mem_offset, s_save_mem_offset, 0x80
+ s_mov_b32 ttmp13, 0x0
+ v_mov_b32 v2, 0x0
+L_SAVE_SGPR_SKIP_TCP_STORE:
+
+ s_add_u32 m0, m0, 16 //next sgpr index
+ s_cmp_lt_u32 m0, 96 //scc = (m0 < first 96 SGPR) ? 1 : 0
+ s_cbranch_scc1 L_SAVE_SGPR_LOOP //first 96 SGPR save is complete?
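The loop above stages SGPRs sixteen at a time into lanes of v2 and flushes a full 32-dword chunk to the save area every second pass; the remaining 12 registers are handled immediately after it. For reference, a minimal host-side C model of that packing, assuming the save area is a flat dword array (model_sgpr_save() is a hypothetical name, not driver code; the real copies happen per-lane in hardware):

	#include <stdint.h>
	#include <string.h>

	/* Model of the SGPR save loop: 96 SGPRs are copied 16 at a time
	 * into lanes of v2 and flushed to memory in 32-dword (0x80-byte)
	 * chunks, mirroring write_16sgpr_to_v2() and the TCP store above.
	 */
	static void model_sgpr_save(const uint32_t sgpr[96], uint32_t *save_mem)
	{
		uint32_t v2[32];		/* one lane per dword */
		unsigned int lane = 0, mem = 0;

		for (unsigned int m0 = 0; m0 < 96; m0 += 16) {
			memcpy(&v2[lane], &sgpr[m0], 16 * sizeof(uint32_t));
			lane += 16;		/* ttmp13 += 0x10 */
			if (lane == 32) {	/* second half written */
				memcpy(&save_mem[mem], v2, sizeof(v2));
				mem += 32;	/* offset += 0x80 bytes */
				lane = 0;
			}
		}
	}

Each flush advances the memory offset by 0x80 bytes, matching the s_add_u32 in the loop.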
+
+ //save the remaining 12 SGPRs
+ s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0]
+ s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0]
+ s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0]
+ s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0]
+ s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0]
+ s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0]
+ write_12sgpr_to_v2(s0)
+
+ buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS
+
+ /* save LDS */
+
+L_SAVE_LDS:
+ // Change EXEC to all threads...
+ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
+ s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE
+ s_and_b32 m0, m0, 1
+ s_cmp_eq_u32 m0, 1
+ s_cbranch_scc1 L_ENABLE_SAVE_LDS_EXEC_HI
+ s_mov_b32 exec_hi, 0x00000000
+ s_branch L_SAVE_LDS_NORMAL
+L_ENABLE_SAVE_LDS_EXEC_HI:
+ s_mov_b32 exec_hi, 0xFFFFFFFF
+L_SAVE_LDS_NORMAL:
+ s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_WAVE_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
+ s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero?
+ s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_LDS_DONE
+
+ s_and_b32 s_save_tmp, s_save_pc_hi, S_SAVE_PC_HI_FIRST_WAVE_MASK
+ s_cbranch_scc0 L_SAVE_LDS_DONE
+
+ // first wave does the LDS save;
+
+ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, SQ_WAVE_LDS_ALLOC_GRANULARITY
+ s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes
+
+ // LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG)
+ //
+ get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
+ get_svgpr_size_bytes(s_save_tmp)
+ s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
+ s_add_u32 s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes()
+ s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes()
+
+ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+
+ //load lane index * 4 (byte address) into vgpr v0
+ v_mbcnt_lo_u32_b32 v0, -1, 0
+ v_mbcnt_hi_u32_b32 v0, -1, v0
+ v_mul_u32_u24 v0, 4, v0
+
+ s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE
+ s_and_b32 m0, m0, 1
+ s_cmp_eq_u32 m0, 1
+ s_mov_b32 m0, 0x0
+ s_cbranch_scc1 L_SAVE_LDS_W64
+
+L_SAVE_LDS_W32:
+ s_mov_b32 s3, 128
+ s_nop 0
+ s_nop 0
+ s_nop 0
+L_SAVE_LDS_LOOP_W32:
+ ds_read_b32 v1, v0
+ s_wait_idle
+ buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS
+
+ s_add_u32 m0, m0, s3 //each pass stores 128 bytes
+ s_add_u32 s_save_mem_offset, s_save_mem_offset, s3
+ v_add_nc_u32 v0, v0, 128 //mem offset increased by 128 bytes
+ s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0
+ s_cbranch_scc1 L_SAVE_LDS_LOOP_W32 //LDS save is complete?
+
+ s_branch L_SAVE_LDS_DONE
+
+L_SAVE_LDS_W64:
+ s_mov_b32 s3, 256
+ s_nop 0
+ s_nop 0
+ s_nop 0
+L_SAVE_LDS_LOOP_W64:
+ ds_read_b32 v1, v0
+ s_wait_idle
+ buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS
+
+ s_add_u32 m0, m0, s3 //each pass stores 256 bytes
+ s_add_u32 s_save_mem_offset, s_save_mem_offset, s3
+ v_add_nc_u32 v0, v0, 256 //mem offset increased by 256 bytes
+ s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0
+ s_cbranch_scc1 L_SAVE_LDS_LOOP_W64 //LDS save is complete?
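In the LDS save loops above each pass stores one dword per lane, so a wave32 advances 128 bytes per pass and a wave64 256. A small sketch of the resulting pass count (hypothetical helper, not part of the driver):

	#include <stdbool.h>

	/* Number of passes the first wave needs to drain its LDS
	 * allocation; the stride is one dword per lane per pass,
	 * as in L_SAVE_LDS_LOOP_W32 / L_SAVE_LDS_LOOP_W64 above.
	 */
	static unsigned int lds_save_passes(unsigned int lds_bytes, bool wave64)
	{
		unsigned int stride = wave64 ? 256 : 128;

		return (lds_bytes + stride - 1) / stride;
	}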
+
+L_SAVE_LDS_DONE:
+ /* save VGPRs - the remaining VGPRs */
+L_SAVE_VGPR:
+ // VGPR SR memory offset: 0
+ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
+ s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE
+ s_and_b32 m0, m0, 1
+ s_cmp_eq_u32 m0, 1
+ s_cbranch_scc1 L_ENABLE_SAVE_VGPR_EXEC_HI
+ s_mov_b32 s_save_mem_offset, (0+128*4) // for the remaining VGPRs
+ s_mov_b32 exec_hi, 0x00000000
+ s_branch L_SAVE_VGPR_NORMAL
+L_ENABLE_SAVE_VGPR_EXEC_HI:
+ s_mov_b32 s_save_mem_offset, (0+256*4) // for the remaining VGPRs
+ s_mov_b32 exec_hi, 0xFFFFFFFF
+L_SAVE_VGPR_NORMAL:
+ s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_WAVE_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
+ s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
+ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value)
+ //determine whether it is wave32 or wave64
+ s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE
+ s_and_b32 m0, m0, 1
+ s_cmp_eq_u32 m0, 1
+ s_cbranch_scc1 L_SAVE_VGPR_WAVE64
+
+ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+
+ // VGPR Allocated in 4-GPR granularity
+
+ // VGPR store using dw burst
+ s_mov_b32 m0, 0x4 //VGPR initial index value =4
+ s_cmp_lt_u32 m0, s_save_alloc_size
+ s_cbranch_scc0 L_SAVE_VGPR_END
+
+L_SAVE_VGPR_W32_LOOP:
+ v_movrels_b32 v0, v0 //v0 = v[0+m0]
+ v_movrels_b32 v1, v1 //v1 = v[1+m0]
+ v_movrels_b32 v2, v2 //v2 = v[2+m0]
+ v_movrels_b32 v3, v3 //v3 = v[3+m0]
+
+ buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS
+ buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:128
+ buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:128*2
+ buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:128*3
+
+ s_add_u32 m0, m0, 4 //next vgpr index
+ s_add_u32 s_save_mem_offset, s_save_mem_offset, 128*4 //every buffer_store_dword does 128 bytes
+ s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
+ s_cbranch_scc1 L_SAVE_VGPR_W32_LOOP //VGPR save is complete?
+
+ s_branch L_SAVE_VGPR_END
+
+L_SAVE_VGPR_WAVE64:
+ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+
+ // VGPR store using dw burst
+ s_mov_b32 m0, 0x4 //VGPR initial index value =4
+ s_cmp_lt_u32 m0, s_save_alloc_size
+ s_cbranch_scc0 L_SAVE_SHARED_VGPR
+
+L_SAVE_VGPR_W64_LOOP:
+ v_movrels_b32 v0, v0 //v0 = v[0+m0]
+ v_movrels_b32 v1, v1 //v1 = v[1+m0]
+ v_movrels_b32 v2, v2 //v2 = v[2+m0]
+ v_movrels_b32 v3, v3 //v3 = v[3+m0]
+
+ buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS
+ buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:256
+ buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:256*2
+ buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:256*3
+
+ s_add_u32 m0, m0, 4 //next vgpr index
+ s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes
+ s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
+ s_cbranch_scc1 L_SAVE_VGPR_W64_LOOP //VGPR save is complete?
+
+L_SAVE_SHARED_VGPR:
+ s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_WAVE_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)
+ s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //shared_vgpr_size is zero?
+ s_cbranch_scc0 L_SAVE_VGPR_END //no shared_vgpr used? jump to L_SAVE_VGPR_END
+ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 3 //Number of SHARED_VGPRs = shared_vgpr_size * 8 (non-zero value)
+ //m0 now holds the normal vgpr count; add the shared_vgpr count to it to get the total count.
+ //shared_vgpr save starts from index m0
+ s_add_u32 s_save_alloc_size, s_save_alloc_size, m0
+ s_mov_b32 exec_lo, 0xFFFFFFFF
+ s_mov_b32 exec_hi, 0x00000000
+
+L_SAVE_SHARED_VGPR_WAVE64_LOOP:
+ v_movrels_b32 v0, v0 //v0 = v[0+m0]
+ buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS
+ s_add_u32 m0, m0, 1 //next vgpr index
+ s_add_u32 s_save_mem_offset, s_save_mem_offset, 128
+ s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
+ s_cbranch_scc1 L_SAVE_SHARED_VGPR_WAVE64_LOOP //SHARED_VGPR save is complete?
+
+L_SAVE_VGPR_END:
+ s_branch L_END_PGM
+
+L_RESTORE:
+ /* set up Resource Constants */
+ s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo
+ s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi
+ s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE
+ s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes)
+ s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC
+
+ // Save s_restore_spi_init_hi for later use.
+ s_mov_b32 s_restore_spi_init_hi_save, s_restore_spi_init_hi
+
+ //determine whether it is wave32 or wave64
+ get_wave_size2(s_restore_size)
+
+ s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
+ s_cbranch_scc0 L_RESTORE_VGPR
+
+ /* restore LDS */
+L_RESTORE_LDS:
+ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
+ s_lshr_b32 m0, s_restore_size, S_WAVE_SIZE
+ s_and_b32 m0, m0, 1
+ s_cmp_eq_u32 m0, 1
+ s_cbranch_scc1 L_ENABLE_RESTORE_LDS_EXEC_HI
+ s_mov_b32 exec_hi, 0x00000000
+ s_branch L_RESTORE_LDS_NORMAL
+L_ENABLE_RESTORE_LDS_EXEC_HI:
+ s_mov_b32 exec_hi, 0xFFFFFFFF
+L_RESTORE_LDS_NORMAL:
+ s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_WAVE_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
+ s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero?
+ s_cbranch_scc0 L_RESTORE_VGPR //no lds used? jump to L_RESTORE_VGPR
+ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, SQ_WAVE_LDS_ALLOC_GRANULARITY
+ s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes
+
+ // LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG)
+ //
+ get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
+ get_svgpr_size_bytes(s_restore_tmp)
+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes()
+
+ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+
+ s_lshr_b32 m0, s_restore_size, S_WAVE_SIZE
+ s_and_b32 m0, m0, 1
+ s_cmp_eq_u32 m0, 1
+ s_mov_b32 m0, 0x0
+ s_cbranch_scc1 L_RESTORE_LDS_LOOP_W64
+
+L_RESTORE_LDS_LOOP_W32:
+ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset
+ s_wait_idle
+ ds_store_addtid_b32 v0
+ s_add_u32 m0, m0, 128 // 128 bytes per pass
+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128 //mem offset increased by 128 bytes
+ s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0
+ s_cbranch_scc1 L_RESTORE_LDS_LOOP_W32 //LDS restore is complete?
+ s_branch L_RESTORE_VGPR
+
+L_RESTORE_LDS_LOOP_W64:
+ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset
+ s_wait_idle
+ ds_store_addtid_b32 v0
+ s_add_u32 m0, m0, 256 // 256 bytes per pass
+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256 //mem offset increased by 256 bytes
+ s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0
+ s_cbranch_scc1 L_RESTORE_LDS_LOOP_W64 //LDS restore is complete?
+
+ /* restore VGPRs */
+L_RESTORE_VGPR:
+ // VGPR SR memory offset : 0
+ s_mov_b32 s_restore_mem_offset, 0x0
+ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
+ s_lshr_b32 m0, s_restore_size, S_WAVE_SIZE
+ s_and_b32 m0, m0, 1
+ s_cmp_eq_u32 m0, 1
+ s_cbranch_scc1 L_ENABLE_RESTORE_VGPR_EXEC_HI
+ s_mov_b32 exec_hi, 0x00000000
+ s_branch L_RESTORE_VGPR_NORMAL
+L_ENABLE_RESTORE_VGPR_EXEC_HI:
+ s_mov_b32 exec_hi, 0xFFFFFFFF
+L_RESTORE_VGPR_NORMAL:
+ s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_WAVE_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
+ s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
+ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value)
+ //determine whether it is wave32 or wave64
+ s_lshr_b32 m0, s_restore_size, S_WAVE_SIZE
+ s_and_b32 m0, m0, 1
+ s_cmp_eq_u32 m0, 1
+ s_cbranch_scc1 L_RESTORE_VGPR_WAVE64
+
+ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+
+ // VGPR load using dw burst
+ s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore starts with v4; v0..v3 will be last
+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128*4
+ s_mov_b32 m0, 4 //VGPR initial index value = 4
+ s_cmp_lt_u32 m0, s_restore_alloc_size
+ s_cbranch_scc0 L_RESTORE_SGPR
+
+L_RESTORE_VGPR_WAVE32_LOOP:
+ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS
+ buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS offset:128
+ buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS offset:128*2
+ buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS offset:128*3
+ s_wait_idle
+ v_movreld_b32 v0, v0 //v[0+m0] = v0
+ v_movreld_b32 v1, v1
+ v_movreld_b32 v2, v2
+ v_movreld_b32 v3, v3
+ s_add_u32 m0, m0, 4 //next vgpr index
+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128*4 //every buffer_load_dword does 128 bytes
+ s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
+ s_cbranch_scc1 L_RESTORE_VGPR_WAVE32_LOOP //VGPR restore (except v0) is complete?
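Because v_movreld uses v0..v3 as staging registers, the wave32 loop above restores v4 onward first and leaves v0..v3 for last, reloading them from s_restore_mem_offset_save. A hedged C model of the resulting order (array-of-offsets form; the names are illustrative, not driver code):

	#include <stdint.h>

	/* Model of the wave32 VGPR restore order: entry i holds the byte
	 * offset that 4-VGPR group i is reloaded from.  v4..vN stream in
	 * first through the v0..v3 staging registers; group 0 (v0..v3) is
	 * reloaded last from the saved base offset.
	 */
	static void model_w32_vgpr_restore(unsigned int n_vgprs,
					   uint32_t offsets[/* n_vgprs / 4 */])
	{
		uint32_t off = 128 * 4;		/* base + one 4-VGPR row */
		unsigned int i = 1;

		for (unsigned int m0 = 4; m0 < n_vgprs; m0 += 4, off += 128 * 4)
			offsets[i++] = off;	/* v[m0..m0+3] via v0..v3 */
		offsets[0] = 0;			/* v0..v3 last, from mem_offset_save */
	}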
+
+ /* VGPR restore on v0 */
+ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS
+ buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS offset:128
+ buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS offset:128*2
+ buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS offset:128*3
+ s_wait_idle
+
+ s_branch L_RESTORE_SGPR
+
+L_RESTORE_VGPR_WAVE64:
+ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+
+ // VGPR load using dw burst
+ s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore starts with v4; v0..v3 will be last
+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4
+ s_mov_b32 m0, 4 //VGPR initial index value = 4
+ s_cmp_lt_u32 m0, s_restore_alloc_size
+ s_cbranch_scc0 L_RESTORE_SHARED_VGPR
+
+L_RESTORE_VGPR_WAVE64_LOOP:
+ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS
+ buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS offset:256
+ buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS offset:256*2
+ buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS offset:256*3
+ s_wait_idle
+ v_movreld_b32 v0, v0 //v[0+m0] = v0
+ v_movreld_b32 v1, v1
+ v_movreld_b32 v2, v2
+ v_movreld_b32 v3, v3
+ s_add_u32 m0, m0, 4 //next vgpr index
+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 //every buffer_load_dword does 256 bytes
+ s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
+ s_cbranch_scc1 L_RESTORE_VGPR_WAVE64_LOOP //VGPR restore (except v0) is complete?
+
+L_RESTORE_SHARED_VGPR:
+ s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_WAVE_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE) //shared_vgpr_size
+ s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //shared_vgpr_size is zero?
+ s_cbranch_scc0 L_RESTORE_V0 //no shared_vgpr used?
+ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 3 //Number of SHARED_VGPRs = shared_vgpr_size * 8 (non-zero value)
+ //m0 now holds the normal vgpr count; add the shared_vgpr count to it to get the total count.
+ //shared_vgpr restore starts from index m0
+ s_add_u32 s_restore_alloc_size, s_restore_alloc_size, m0
+ s_mov_b32 exec_lo, 0xFFFFFFFF
+ s_mov_b32 exec_hi, 0x00000000
+L_RESTORE_SHARED_VGPR_WAVE64_LOOP:
+ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS
+ s_wait_idle
+ v_movreld_b32 v0, v0 //v[0+m0] = v0
+ s_add_u32 m0, m0, 1 //next vgpr index
+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128
+ s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
+ s_cbranch_scc1 L_RESTORE_SHARED_VGPR_WAVE64_LOOP //SHARED_VGPR restore is complete?
+
+ s_mov_b32 exec_hi, 0xFFFFFFFF //restore exec_hi before restoring v0
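All of the offset arithmetic in the save and restore paths follows one layout: VGPRs, then shared VGPRs, then 512 bytes of SGPRs, then 128 bytes of HWREGs (with the saved ttmps at 0x40 into that block), then LDS. A C restatement of the get_*_size_bytes helpers defined further below, assuming the same field encodings (a sketch, not driver code):

	#include <stdbool.h>
	#include <stdint.h>

	/* (vgpr_size + 1) * 4 registers * lanes * 4 bytes, as in
	 * get_vgpr_size_bytes(): shl (2+7) for wave32, (2+8) for wave64.
	 */
	static uint32_t vgpr_size_bytes(uint32_t vgpr_size_field, bool wave64)
	{
		return (vgpr_size_field + 1) << (wave64 ? 2 + 8 : 2 + 7);
	}

	/* shared_vgpr_size * 8 registers * 32 lanes * 4 bytes. */
	static uint32_t svgpr_size_bytes(uint32_t shared_vgpr_size_field)
	{
		return shared_vgpr_size_field << (3 + 7);
	}

	/* Byte offset of the HWREG block: VGPRs + shared VGPRs + 512B SGPRs. */
	static uint32_t hwreg_offset(uint32_t vgpr_field, uint32_t svgpr_field,
				     bool wave64)
	{
		return vgpr_size_bytes(vgpr_field, wave64) +
		       svgpr_size_bytes(svgpr_field) + 512;
	}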
+
+ /* VGPR restore on v0 */
+L_RESTORE_V0:
+ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS
+ buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS offset:256
+ buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS offset:256*2
+ buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS offset:256*3
+ s_wait_idle
+
+ /* restore SGPRs */
+ //restore order: 4+8+16*6
+ // SGPR SR memory offset : size(VGPR)+size(SVGPR)
+L_RESTORE_SGPR:
+ get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
+ get_svgpr_size_bytes(s_restore_tmp)
+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
+ s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 20*4 //s108~s127 are not saved
+
+ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+
+ s_mov_b32 m0, s_sgpr_save_num
+
+ read_4sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
+ s_wait_idle
+
+ s_sub_u32 m0, m0, 4 // Restore from S[0] to S[104]
+ s_nop 0 // hazard SALU M0=> S_MOVREL
+
+ s_movreld_b64 s0, s0 //s[0+m0] = s0
+ s_movreld_b64 s2, s2
+
+ read_8sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
+ s_wait_idle
+
+ s_sub_u32 m0, m0, 8 // Restore from S[0] to S[96]
+ s_nop 0 // hazard SALU M0=> S_MOVREL
+
+ s_movreld_b64 s0, s0 //s[0+m0] = s0
+ s_movreld_b64 s2, s2
+ s_movreld_b64 s4, s4
+ s_movreld_b64 s6, s6
+
+L_RESTORE_SGPR_LOOP:
+ read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
+ s_wait_idle
+
+ s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0]
+ s_nop 0 // hazard SALU M0=> S_MOVREL
+
+ s_movreld_b64 s0, s0 //s[0+m0] = s0
+ s_movreld_b64 s2, s2
+ s_movreld_b64 s4, s4
+ s_movreld_b64 s6, s6
+ s_movreld_b64 s8, s8
+ s_movreld_b64 s10, s10
+ s_movreld_b64 s12, s12
+ s_movreld_b64 s14, s14
+
+ s_cmp_eq_u32 m0, 0 //scc = (m0 == 0) ? 1 : 0
+ s_cbranch_scc0 L_RESTORE_SGPR_LOOP
+
+ // s_barrier with STATE_PRIV.TRAP_AFTER_INST=1, STATUS.PRIV=1 incorrectly asserts debug exception.
+ // Clear DEBUG_EN before and restore MODE after the barrier.
+ s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE), 0
+
+ /* restore HW registers */
+L_RESTORE_HWREG:
+ // HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)
+ get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
+ get_svgpr_size_bytes(s_restore_tmp)
+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
+
+ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+
+ // Restore s_restore_spi_init_hi before the saved value gets clobbered.
+ s_mov_b32 s_restore_spi_init_hi, s_restore_spi_init_hi_save + + read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_state_priv, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_excp_flag_priv, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset) + s_wait_idle + + s_setreg_b32 hwreg(HW_REG_WAVE_SCRATCH_BASE_LO), s_restore_flat_scratch + + read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset) + s_wait_idle + + s_setreg_b32 hwreg(HW_REG_WAVE_SCRATCH_BASE_HI), s_restore_flat_scratch + + read_hwreg_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset) + s_wait_idle + s_setreg_b32 hwreg(HW_REG_WAVE_EXCP_FLAG_USER), s_restore_tmp + + read_hwreg_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset) + s_wait_idle + s_setreg_b32 hwreg(HW_REG_WAVE_TRAP_CTRL), s_restore_tmp + + // Only the first wave needs to restore the workgroup barrier. + s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK + s_cbranch_scc0 L_SKIP_BARRIER_RESTORE + + // Skip over WAVE_STATUS, since there is no state to restore from it + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 4 + + read_hwreg_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset) + s_wait_idle + + s_bitcmp1_b32 s_restore_tmp, BARRIER_STATE_VALID_OFFSET + s_cbranch_scc0 L_SKIP_BARRIER_RESTORE + + // extract the saved signal count from s_restore_tmp + s_lshr_b32 s_restore_tmp, s_restore_tmp, BARRIER_STATE_SIGNAL_OFFSET + + // We need to call s_barrier_signal repeatedly to restore the signal + // count of the work group barrier. The member count is already + // initialized with the number of waves in the work group. +L_BARRIER_RESTORE_LOOP: + s_and_b32 s_restore_tmp, s_restore_tmp, s_restore_tmp + s_cbranch_scc0 L_SKIP_BARRIER_RESTORE + s_barrier_signal -1 + s_add_i32 s_restore_tmp, s_restore_tmp, -1 + s_branch L_BARRIER_RESTORE_LOOP + +L_SKIP_BARRIER_RESTORE: + + s_mov_b32 m0, s_restore_m0 + s_mov_b32 exec_lo, s_restore_exec_lo + s_mov_b32 exec_hi, s_restore_exec_hi + + // EXCP_FLAG_PRIV.SAVE_CONTEXT and HOST_TRAP may have changed. + // Only restore the other fields to avoid clobbering them. 
+ s_setreg_b32 hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV, 0, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_1_SIZE), s_restore_excp_flag_priv + s_lshr_b32 s_restore_excp_flag_priv, s_restore_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SHIFT + s_setreg_b32 hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SHIFT, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SIZE), s_restore_excp_flag_priv + s_lshr_b32 s_restore_excp_flag_priv, s_restore_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SHIFT - SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SHIFT + s_setreg_b32 hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SHIFT, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SIZE), s_restore_excp_flag_priv + + s_setreg_b32 hwreg(HW_REG_WAVE_MODE), s_restore_mode + + // Restore trap temporaries 4-11, 13 initialized by SPI debug dispatch logic + // ttmp SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)+0x40 + get_vgpr_size_bytes(s_restore_ttmps_lo, s_restore_size) + get_svgpr_size_bytes(s_restore_ttmps_hi) + s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_ttmps_hi + s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, get_sgpr_size_bytes() + s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_buf_rsrc0 + s_addc_u32 s_restore_ttmps_hi, s_restore_buf_rsrc1, 0x0 + s_and_b32 s_restore_ttmps_hi, s_restore_ttmps_hi, 0xFFFF + s_load_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x50 scope:SCOPE_SYS + s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x60 scope:SCOPE_SYS + s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x74 scope:SCOPE_SYS + s_wait_idle + + s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS + s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 + s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 + + s_setreg_b32 hwreg(HW_REG_WAVE_STATE_PRIV), s_restore_state_priv // SCC is included, which is changed by previous salu + + // Make barrier and LDS state visible to all waves in the group. + // STATE_PRIV.BARRIER_COMPLETE may change after this point. + s_barrier_signal -2 + s_barrier_wait -2 + + s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution + +L_END_PGM: + // Make sure that no wave of the workgroup can exit the trap handler + // before the workgroup barrier state is saved. + s_barrier_signal -2 + s_barrier_wait -2 + s_endpgm_saved +end + +function write_16sgpr_to_v2(s, lane_offset) + // Copy into VGPR for later TCP store. + for var sgpr_idx = 0; sgpr_idx < 16; sgpr_idx ++ + v_writelane_b32 v2, s[sgpr_idx], sgpr_idx + lane_offset + end + valu_sgpr_hazard() + s_add_u32 ttmp13, ttmp13, 0x10 +end + +function write_12sgpr_to_v2(s) + // Copy into VGPR for later TCP store. 
+ for var sgpr_idx = 0; sgpr_idx < 12; sgpr_idx ++ + v_writelane_b32 v2, s[sgpr_idx], sgpr_idx + end + valu_sgpr_hazard() +end + +function read_hwreg_from_mem(s, s_rsrc, s_mem_offset) + s_buffer_load_dword s, s_rsrc, s_mem_offset scope:SCOPE_SYS + s_add_u32 s_mem_offset, s_mem_offset, 4 +end + +function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset) + s_sub_u32 s_mem_offset, s_mem_offset, 4*16 + s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset scope:SCOPE_SYS +end + +function read_8sgpr_from_mem(s, s_rsrc, s_mem_offset) + s_sub_u32 s_mem_offset, s_mem_offset, 4*8 + s_buffer_load_dwordx8 s, s_rsrc, s_mem_offset scope:SCOPE_SYS +end + +function read_4sgpr_from_mem(s, s_rsrc, s_mem_offset) + s_sub_u32 s_mem_offset, s_mem_offset, 4*4 + s_buffer_load_dwordx4 s, s_rsrc, s_mem_offset scope:SCOPE_SYS +end + +function get_vgpr_size_bytes(s_vgpr_size_byte, s_size) + s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_WAVE_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) + s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1 + s_bitcmp1_b32 s_size, S_WAVE_SIZE + s_cbranch_scc1 L_ENABLE_SHIFT_W64 + s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+7) //Number of VGPRs = (vgpr_size + 1) * 4 * 32 * 4 (non-zero value) + s_branch L_SHIFT_DONE +L_ENABLE_SHIFT_W64: + s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4 (non-zero value) +L_SHIFT_DONE: +end + +function get_svgpr_size_bytes(s_svgpr_size_byte) + s_getreg_b32 s_svgpr_size_byte, hwreg(HW_REG_WAVE_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE) + s_lshl_b32 s_svgpr_size_byte, s_svgpr_size_byte, (3+7) +end + +function get_sgpr_size_bytes + return 512 +end + +function get_hwreg_size_bytes + return 128 +end + +function get_wave_size2(s_reg) + s_getreg_b32 s_reg, hwreg(HW_REG_WAVE_STATUS,SQ_WAVE_STATUS_WAVE64_SHIFT,SQ_WAVE_STATUS_WAVE64_SIZE) + s_lshl_b32 s_reg, s_reg, S_WAVE_SIZE +end + +function valu_sgpr_hazard +#if HAVE_VALU_SGPR_HAZARD + for var rep = 0; rep < 8; rep ++ + ds_nop + end +#endif +end diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm index bb26338204f4..6869e07a2fff 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm @@ -37,17 +37,28 @@ * gc_9_4_3: * cpp -DASIC_FAMILY=GC_9_4_3 cwsr_trap_handler_gfx9.asm -P -o gc_9_4_3.sp3 * sp3 gc_9_4_3.sp3 -hex gc_9_4_3.hex + * + * gc_9_5_0: + * cpp -DASIC_FAMILY=GC_9_5_0 cwsr_trap_handler_gfx9.asm -P -o gc_9_5_0.sp3 + * sp3 gc_9_5_0.sp3 -hex gc_9_5_0.hex */ #define CHIP_VEGAM 18 #define CHIP_ARCTURUS 23 #define CHIP_ALDEBARAN 25 #define CHIP_GC_9_4_3 26 +#define CHIP_GC_9_5_0 27 var ACK_SQC_STORE = 1 //workaround for suspected SQC store bug causing incorrect stores under concurrency var SAVE_AFTER_XNACK_ERROR = 1 //workaround for TCP store failure after XNACK error when ALLOW_REPLAY=0, for debugger var SINGLE_STEP_MISSED_WORKAROUND = (ASIC_FAMILY <= CHIP_ALDEBARAN) //workaround for lost MODE.DEBUG_EN exception when SAVECTX raised +#if ASIC_FAMILY < CHIP_GC_9_4_3 +#define VMEM_MODIFIERS slc:1 glc:1 +#else +#define VMEM_MODIFIERS sc0:1 nt:1 +#endif + /**************************************************************************/ /* variables */ /**************************************************************************/ @@ -62,7 +73,13 @@ var SQ_WAVE_STATUS_ALLOW_REPLAY_MASK = 0x400000 var SQ_WAVE_STATUS_ECC_ERR_MASK = 0x20000 var 
SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12
+#if ASIC_FAMILY >= CHIP_GC_9_5_0
+var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 11
+var LDS_RESTORE_GRANULARITY_BYTES = 1280
+#else
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9
+var LDS_RESTORE_GRANULARITY_BYTES = 512
+#endif
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6
var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits
var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24
@@ -430,7 +447,9 @@ L_SAVE:
 s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE
 write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
-
+ // Clear VSKIP state now that MODE.VSKIP has been saved.
+ // If user shader set it then vector instructions would be skipped.
+ s_setvskip 0,0
 /* the first wave in the threadgroup */
 s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK // extract first wave bit
@@ -557,12 +576,21 @@ if SAVE_AFTER_XNACK_ERROR
 v_lshlrev_b32 v2, 2, v3
L_SAVE_LDS_LOOP_SQC:
+#if ASIC_FAMILY < CHIP_GC_9_5_0
 ds_read2_b32 v[0:1], v2 offset0:0 offset1:0x40
 s_waitcnt lgkmcnt(0)
-
 write_vgprs_to_mem_with_sqc(v0, 2, s_save_buf_rsrc0, s_save_mem_offset)
 v_add_u32 v2, 0x200, v2
+#else
+ // gfx950 needs to save in multiples of 256 bytes.
+ ds_read_b32 v0, v2
+ s_waitcnt lgkmcnt(0)
+ write_vgprs_to_mem_with_sqc(v0, 1, s_save_buf_rsrc0, s_save_mem_offset)
+
+ v_add_u32 v2, 0x100, v2
+#endif
+
 v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size
 s_cbranch_vccnz L_SAVE_LDS_LOOP_SQC
@@ -581,11 +609,14 @@ end
L_SAVE_LDS_LOOP_VECTOR:
 ds_read_b64 v[0:1], v2 //x =LDS[a], byte address
 s_waitcnt lgkmcnt(0)
- buffer_store_dwordx2 v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset offen:1 glc:1 slc:1
+ buffer_store_dwordx2 v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset VMEM_MODIFIERS offen:1
// s_waitcnt vmcnt(0)
// v_add_u32 v2, vcc[0:1], v2, v3
 v_add_u32 v2, v2, v3
 v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size
+#if ASIC_FAMILY >= CHIP_GC_9_5_0
+ s_mov_b64 exec, vcc
+#endif
 s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR

// restore rsrc3
@@ -748,8 +779,13 @@ L_RESTORE:
L_RESTORE_LDS_LOOP:
 buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW
 buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256 // second 64DW
- s_add_u32 m0, m0, 256*2 // 128 DW
- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW
+#if ASIC_FAMILY >= CHIP_GC_9_5_0
+ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:512 // third 64DW
+ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:768 // fourth 64DW
+ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:1024 // fifth 64DW
+#endif
+ s_add_u32 m0, m0, LDS_RESTORE_GRANULARITY_BYTES // 128/320 DW
+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, LDS_RESTORE_GRANULARITY_BYTES //mem offset increased by 128/320 DW
 s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0
 s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete?
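With LDS_RESTORE_GRANULARITY_BYTES in place, each pass of the gfx9 restore loop moves 512 bytes (two 64-dword loads) before GC 9.5.0 and 1280 bytes (five loads) on GC 9.5.0. A quick sketch of the per-wave pass count under that reading (helper name is hypothetical):

	#include <stdbool.h>

	/* One buffer_load_dword with lds:1 moves 64 dwords (256 bytes);
	 * the loop above issues two of them per pass before GC 9.5.0 and
	 * five on GC 9.5.0, matching LDS_RESTORE_GRANULARITY_BYTES.
	 */
	static unsigned int gfx9_lds_restore_passes(unsigned int lds_bytes,
						    bool gc_9_5_0)
	{
		unsigned int granularity = gc_9_5_0 ? 1280 : 512;

		return (lds_bytes + granularity - 1) / granularity;
	}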
@@ -979,17 +1015,17 @@ L_TCP_STORE_CHECK_DONE:
 end

 function write_4vgprs_to_mem(s_rsrc, s_mem_offset)
- buffer_store_dword v0, v0, s_rsrc, s_mem_offset slc:1 glc:1
- buffer_store_dword v1, v0, s_rsrc, s_mem_offset slc:1 glc:1 offset:256
- buffer_store_dword v2, v0, s_rsrc, s_mem_offset slc:1 glc:1 offset:256*2
- buffer_store_dword v3, v0, s_rsrc, s_mem_offset slc:1 glc:1 offset:256*3
+ buffer_store_dword v0, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS
+ buffer_store_dword v1, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS offset:256
+ buffer_store_dword v2, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS offset:256*2
+ buffer_store_dword v3, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS offset:256*3
 end

 function read_4vgprs_from_mem(s_rsrc, s_mem_offset)
- buffer_load_dword v0, v0, s_rsrc, s_mem_offset slc:1 glc:1
- buffer_load_dword v1, v0, s_rsrc, s_mem_offset slc:1 glc:1 offset:256
- buffer_load_dword v2, v0, s_rsrc, s_mem_offset slc:1 glc:1 offset:256*2
- buffer_load_dword v3, v0, s_rsrc, s_mem_offset slc:1 glc:1 offset:256*3
+ buffer_load_dword v0, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS
+ buffer_load_dword v1, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS offset:256
+ buffer_load_dword v2, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS offset:256*2
+ buffer_load_dword v3, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS offset:256*3
 s_waitcnt vmcnt(0)
 end
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index dfa8c69532d4..a2149afa5803 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -36,7 +36,6 @@
 #include <linux/mman.h>
 #include <linux/ptrace.h>
 #include <linux/dma-buf.h>
-#include <linux/fdtable.h>
 #include <linux/processor.h>
 #include "kfd_priv.h"
 #include "kfd_device_queue_manager.h"
@@ -156,8 +155,8 @@ static int kfd_open(struct inode *inode, struct file *filep)
 /* filep now owns the reference returned by kfd_create_process */
 filep->private_data = process;
- dev_dbg(kfd_device, "process %d opened, compat mode (32 bit) - %d\n",
- process->pasid, process->is_32bit_user_mode);
+ dev_dbg(kfd_device, "process pid %d opened kfd node, compat mode (32 bit) - %d\n",
+ process->lead_thread->pid, process->is_32bit_user_mode);
 return 0;
}
@@ -213,6 +212,11 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties,
 return -EINVAL;
 }
+ if (args->ring_size < KFD_MIN_QUEUE_RING_SIZE) {
+ args->ring_size = KFD_MIN_QUEUE_RING_SIZE;
+ pr_debug("Ring size too small; clamped to KFD_MIN_QUEUE_RING_SIZE");
+ }
+
 if (!access_ok((const void __user *) args->read_pointer_address,
 sizeof(uint32_t))) {
 pr_err("Can't access read pointer\n");
@@ -247,14 +251,15 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties,
 q_properties->priority = args->queue_priority;
 q_properties->queue_address = args->ring_base_address;
 q_properties->queue_size = args->ring_size;
- q_properties->read_ptr = (uint32_t *) args->read_pointer_address;
- q_properties->write_ptr = (uint32_t *) args->write_pointer_address;
+ q_properties->read_ptr = (void __user *)args->read_pointer_address;
+ q_properties->write_ptr = (void __user *)args->write_pointer_address;
 q_properties->eop_ring_buffer_address = args->eop_buffer_address;
 q_properties->eop_ring_buffer_size = args->eop_buffer_size;
 q_properties->ctx_save_restore_area_address = args->ctx_save_restore_address;
 q_properties->ctx_save_restore_area_size = args->ctx_save_restore_size;
 q_properties->ctl_stack_size = args->ctl_stack_size;
+ q_properties->sdma_engine_id = args->sdma_engine_id;
 if (args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE ||
 args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL)
 q_properties->type = KFD_QUEUE_TYPE_COMPUTE;
@@ -262,6 +267,8 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties,
 q_properties->type = KFD_QUEUE_TYPE_SDMA;
 else if (args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA_XGMI)
 q_properties->type = KFD_QUEUE_TYPE_SDMA_XGMI;
+ else if (args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA_BY_ENG_ID)
+ q_properties->type = KFD_QUEUE_TYPE_SDMA_BY_ENG_ID;
 else
 return -ENOTSUPP;
@@ -306,7 +313,6 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
 struct kfd_process_device *pdd;
 struct queue_properties q_properties;
 uint32_t doorbell_offset_in_process = 0;
- struct amdgpu_bo *wptr_bo = NULL;
 memset(&q_properties, 0, sizeof(struct queue_properties));
@@ -334,6 +340,18 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
 goto err_bind_process;
 }
+ if (q_properties.type == KFD_QUEUE_TYPE_SDMA_BY_ENG_ID) {
+ int max_sdma_eng_id = kfd_get_num_sdma_engines(dev) +
+ kfd_get_num_xgmi_sdma_engines(dev) - 1;
+
+ if (q_properties.sdma_engine_id > max_sdma_eng_id) {
+ err = -EINVAL;
+ pr_err("sdma_engine_id %i exceeds maximum id of %i\n",
+ q_properties.sdma_engine_id, max_sdma_eng_id);
+ goto err_sdma_engine_id;
+ }
+ }
+
 if (!pdd->qpd.proc_doorbells) {
 err = kfd_alloc_process_doorbells(dev->kfd, pdd);
 if (err) {
@@ -342,48 +360,17 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
 }
 }
- /* Starting with GFX11, wptr BOs must be mapped to GART for MES to determine work
- * on unmapped queues for usermode queue oversubscription (no aggregated doorbell)
- */
- if (dev->kfd->shared_resources.enable_mes &&
- ((dev->adev->mes.sched_version & AMDGPU_MES_API_VERSION_MASK)
- >> AMDGPU_MES_API_VERSION_SHIFT) >= 2) {
- struct amdgpu_bo_va_mapping *wptr_mapping;
- struct amdgpu_vm *wptr_vm;
-
- wptr_vm = drm_priv_to_vm(pdd->drm_priv);
- err = amdgpu_bo_reserve(wptr_vm->root.bo, false);
- if (err)
- goto err_wptr_map_gart;
-
- wptr_mapping = amdgpu_vm_bo_lookup_mapping(
- wptr_vm, args->write_pointer_address >> PAGE_SHIFT);
- amdgpu_bo_unreserve(wptr_vm->root.bo);
- if (!wptr_mapping) {
- pr_err("Failed to lookup wptr bo\n");
- err = -EINVAL;
- goto err_wptr_map_gart;
- }
-
- wptr_bo = wptr_mapping->bo_va->base.bo;
- if (wptr_bo->tbo.base.size > PAGE_SIZE) {
- pr_err("Requested GART mapping for wptr bo larger 
than one page\n");
- err = -EINVAL;
- goto err_wptr_map_gart;
- }
-
- err = amdgpu_amdkfd_map_gtt_bo_to_gart(wptr_bo);
- if (err) {
- pr_err("Failed to map wptr bo to GART\n");
- goto err_wptr_map_gart;
- }
+ err = kfd_queue_acquire_buffers(pdd, &q_properties);
+ if (err) {
+ pr_debug("failed to acquire user queue buffers\n");
+ goto err_acquire_queue_buf;
 }
- pr_debug("Creating queue for PASID 0x%x on gpu 0x%x\n",
- p->pasid,
+ pr_debug("Creating queue for process pid %d on gpu 0x%x\n",
+ p->lead_thread->pid,
 dev->id);
- err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, &queue_id, wptr_bo,
+ err = pqm_create_queue(&p->pqm, dev, &q_properties, &queue_id, NULL, NULL, NULL,
 &doorbell_offset_in_process);
 if (err != 0)
 goto err_create_queue;
@@ -417,9 +404,10 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
 return 0;
 err_create_queue:
- if (wptr_bo)
- amdgpu_amdkfd_free_gtt_mem(dev->adev, wptr_bo);
-err_wptr_map_gart:
+ kfd_queue_unref_bo_vas(pdd, &q_properties);
+ kfd_queue_release_buffers(pdd, &q_properties);
+err_acquire_queue_buf:
+err_sdma_engine_id:
 err_bind_process:
 err_pdd:
 mutex_unlock(&p->mutex);
@@ -432,9 +420,9 @@ static int kfd_ioctl_destroy_queue(struct file *filp, struct kfd_process *p,
 int retval;
 struct kfd_ioctl_destroy_queue_args *args = data;
- pr_debug("Destroying queue id %d for pasid 0x%x\n",
+ pr_debug("Destroying queue id %d for process pid %d\n",
 args->queue_id,
- p->pasid);
+ p->lead_thread->pid);
 mutex_lock(&p->mutex);
@@ -478,6 +466,11 @@ static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p,
 return -EINVAL;
 }
+ if (args->ring_size < KFD_MIN_QUEUE_RING_SIZE) {
+ args->ring_size = KFD_MIN_QUEUE_RING_SIZE;
+ pr_debug("Ring size too small; clamped to KFD_MIN_QUEUE_RING_SIZE");
+ }
+
 properties.queue_address = args->ring_base_address;
 properties.queue_size = args->ring_size;
 properties.queue_percent = args->queue_percentage & 0xFF;
@@ -485,8 +478,8 @@ static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p,
 properties.pm4_target_xcc = (args->queue_percentage >> 8) & 0xFF;
 properties.priority = args->queue_priority;
- pr_debug("Updating queue id %d for pasid 0x%x\n",
- args->queue_id, p->pasid);
+ pr_debug("Updating queue id %d for process pid %d\n",
+ args->queue_id, p->lead_thread->pid);
 mutex_lock(&p->mutex);
@@ -613,7 +606,8 @@ static int kfd_ioctl_set_memory_policy(struct file *filep,
 default_policy,
 alternate_policy,
 (void __user *)args->alternate_aperture_base,
- args->alternate_aperture_size))
+ args->alternate_aperture_size,
+ args->misc_process_flag))
 err = -EINVAL;
 out:
@@ -712,7 +706,7 @@ static int kfd_ioctl_get_process_apertures(struct file *filp,
 struct kfd_process_device_apertures *pAperture;
 int i;
- dev_dbg(kfd_device, "get apertures for PASID 0x%x", p->pasid);
+ dev_dbg(kfd_device, "get apertures for process pid %d", p->lead_thread->pid);
 args->num_of_nodes = 0;
@@ -764,7 +758,8 @@ static int kfd_ioctl_get_process_apertures_new(struct file *filp,
 int ret;
 int i;
- dev_dbg(kfd_device, "get apertures for PASID 0x%x", p->pasid);
+ dev_dbg(kfd_device, "get apertures for process pid %d",
+ p->lead_thread->pid);
 if (args->num_of_nodes == 0) {
 /* Return number of nodes, so that user space can allocate
@@ -779,8 +774,8 @@ static int kfd_ioctl_get_process_apertures_new(struct file *filp,
 * nodes, but not more than args->num_of_nodes as that is
 * the amount of memory allocated by user
 */
- pa = kzalloc((sizeof(struct kfd_process_device_apertures) *
- args->num_of_nodes), GFP_KERNEL);
+ pa = 
kcalloc(args->num_of_nodes, sizeof(struct kfd_process_device_apertures), + GFP_KERNEL); if (!pa) return -ENOMEM; @@ -1139,7 +1134,7 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep, goto err_unlock; } offset = dev->adev->rmmio_remap.bus_addr; - if (!offset) { + if (!offset || (PAGE_SIZE > 4096)) { err = -ENOMEM; goto err_unlock; } @@ -1165,7 +1160,7 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep, if (flags & KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM) size >>= 1; - WRITE_ONCE(pdd->vram_usage, pdd->vram_usage + PAGE_ALIGN(size)); + atomic64_add(PAGE_ALIGN(size), &pdd->vram_usage); } mutex_unlock(&p->mutex); @@ -1236,7 +1231,7 @@ static int kfd_ioctl_free_memory_of_gpu(struct file *filep, kfd_process_device_remove_obj_handle( pdd, GET_IDR_HANDLE(args->handle)); - WRITE_ONCE(pdd->vram_usage, pdd->vram_usage - size); + atomic64_sub(size, &pdd->vram_usage); err_unlock: err_pdd: @@ -1417,8 +1412,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, err = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu( peer_pdd->dev->adev, (struct kgd_mem *)mem, peer_pdd->drm_priv); if (err) { - pr_err("Failed to unmap from gpu %d/%d\n", - i, args->n_devices); + pr_debug("Failed to unmap from gpu %d/%d\n", i, args->n_devices); goto unmap_memory_from_gpu_failed; } args->n_success = i+1; @@ -1523,7 +1517,7 @@ static int kfd_ioctl_get_dmabuf_info(struct file *filep, /* Find a KFD GPU device that supports the get_dmabuf_info query */ for (i = 0; kfd_topology_enum_kfd_devices(i, &dev) == 0; i++) - if (dev) + if (dev && !kfd_devcgroup_check_permission(dev)) break; if (!dev) return -EINVAL; @@ -1545,7 +1539,7 @@ static int kfd_ioctl_get_dmabuf_info(struct file *filep, if (xcp_id >= 0) args->gpu_id = dmabuf_adev->kfd.dev->nodes[xcp_id]->id; else - args->gpu_id = dmabuf_adev->kfd.dev->nodes[0]->id; + args->gpu_id = dev->id; args->flags = flags; /* Copy metadata buffer to user mode */ @@ -1852,7 +1846,8 @@ static uint32_t get_process_num_bos(struct kfd_process *p) } static int criu_get_prime_handle(struct kgd_mem *mem, - int flags, u32 *shared_fd) + int flags, u32 *shared_fd, + struct file **file) { struct dma_buf *dmabuf; int ret; @@ -1863,13 +1858,14 @@ static int criu_get_prime_handle(struct kgd_mem *mem, return ret; } - ret = dma_buf_fd(dmabuf, flags); + ret = get_unused_fd_flags(flags); if (ret < 0) { pr_err("dmabuf create fd failed, ret:%d\n", ret); goto out_free_dmabuf; } *shared_fd = ret; + *file = dmabuf->file; return 0; out_free_dmabuf: @@ -1877,6 +1873,25 @@ out_free_dmabuf: return ret; } +static void commit_files(struct file **files, + struct kfd_criu_bo_bucket *bo_buckets, + unsigned int count, + int err) +{ + while (count--) { + struct file *file = files[count]; + + if (!file) + continue; + if (err) { + fput(file); + put_unused_fd(bo_buckets[count].dmabuf_fd); + } else { + fd_install(bo_buckets[count].dmabuf_fd, file); + } + } +} + static int criu_checkpoint_bos(struct kfd_process *p, uint32_t num_bos, uint8_t __user *user_bos, @@ -1885,6 +1900,7 @@ static int criu_checkpoint_bos(struct kfd_process *p, { struct kfd_criu_bo_bucket *bo_buckets; struct kfd_criu_bo_priv_data *bo_privs; + struct file **files = NULL; int ret = 0, pdd_index, bo_index = 0, id; void *mem; @@ -1898,6 +1914,12 @@ static int criu_checkpoint_bos(struct kfd_process *p, goto exit; } + files = kvzalloc(num_bos * sizeof(struct file *), GFP_KERNEL); + if (!files) { + ret = -ENOMEM; + goto exit; + } + for (pdd_index = 0; pdd_index < p->n_pdds; pdd_index++) { struct kfd_process_device *pdd = p->pdds[pdd_index]; 
struct amdgpu_bo *dumper_bo; @@ -1908,11 +1930,6 @@ static int criu_checkpoint_bos(struct kfd_process *p, struct kfd_criu_bo_priv_data *bo_priv; int i, dev_idx = 0; - if (!mem) { - ret = -ENOMEM; - goto exit; - } - kgd_mem = (struct kgd_mem *)mem; dumper_bo = kgd_mem->bo; @@ -1945,7 +1962,7 @@ static int criu_checkpoint_bos(struct kfd_process *p, ret = criu_get_prime_handle(kgd_mem, bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE ? DRM_RDWR : 0, - &bo_bucket->dmabuf_fd); + &bo_bucket->dmabuf_fd, &files[bo_index]); if (ret) goto exit; } else { @@ -1963,7 +1980,7 @@ static int criu_checkpoint_bos(struct kfd_process *p, bo_bucket->offset = amdgpu_bo_mmap_offset(dumper_bo); for (i = 0; i < p->n_pdds; i++) { - if (amdgpu_amdkfd_bo_mapped_to_dev(p->pdds[i]->dev->adev, kgd_mem)) + if (amdgpu_amdkfd_bo_mapped_to_dev(p->pdds[i]->drm_priv, kgd_mem)) bo_priv->mapped_gpuids[dev_idx++] = p->pdds[i]->user_gpu_id; } @@ -1996,12 +2013,8 @@ static int criu_checkpoint_bos(struct kfd_process *p, *priv_offset += num_bos * sizeof(*bo_privs); exit: - while (ret && bo_index--) { - if (bo_buckets[bo_index].alloc_flags - & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) - close_fd(bo_buckets[bo_index].dmabuf_fd); - } - + commit_files(files, bo_buckets, bo_index, ret); + kvfree(files); kvfree(bo_buckets); kvfree(bo_privs); return ret; @@ -2026,9 +2039,7 @@ static int criu_get_process_object_info(struct kfd_process *p, num_events = kfd_get_num_events(p); - ret = svm_range_get_info(p, &num_svm_ranges, &svm_priv_data_size); - if (ret) - return ret; + svm_range_get_info(p, &num_svm_ranges, &svm_priv_data_size); *num_objects = num_queues + num_events + num_svm_ranges; @@ -2307,7 +2318,7 @@ static int criu_restore_memory_of_gpu(struct kfd_process_device *pdd, return -EINVAL; } offset = pdd->dev->adev->rmmio_remap.bus_addr; - if (!offset) { + if (!offset || (PAGE_SIZE > 4096)) { pr_err("amdgpu_amdkfd_get_mmio_remap_phys_addr failed\n"); return -ENOMEM; } @@ -2346,14 +2357,15 @@ static int criu_restore_memory_of_gpu(struct kfd_process_device *pdd, } else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) { bo_bucket->restored_offset = offset; /* Update the VRAM usage count */ - WRITE_ONCE(pdd->vram_usage, pdd->vram_usage + bo_bucket->size); + atomic64_add(bo_bucket->size, &pdd->vram_usage); } return 0; } static int criu_restore_bo(struct kfd_process *p, struct kfd_criu_bo_bucket *bo_bucket, - struct kfd_criu_bo_priv_data *bo_priv) + struct kfd_criu_bo_priv_data *bo_priv, + struct file **file) { struct kfd_process_device *pdd; struct kgd_mem *kgd_mem; @@ -2405,7 +2417,7 @@ static int criu_restore_bo(struct kfd_process *p, if (bo_bucket->alloc_flags & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) { ret = criu_get_prime_handle(kgd_mem, DRM_RDWR, - &bo_bucket->dmabuf_fd); + &bo_bucket->dmabuf_fd, file); if (ret) return ret; } else { @@ -2422,6 +2434,7 @@ static int criu_restore_bos(struct kfd_process *p, { struct kfd_criu_bo_bucket *bo_buckets = NULL; struct kfd_criu_bo_priv_data *bo_privs = NULL; + struct file **files = NULL; int ret = 0; uint32_t i = 0; @@ -2435,6 +2448,12 @@ static int criu_restore_bos(struct kfd_process *p, if (!bo_buckets) return -ENOMEM; + files = kvzalloc(args->num_bos * sizeof(struct file *), GFP_KERNEL); + if (!files) { + ret = -ENOMEM; + goto exit; + } + ret = copy_from_user(bo_buckets, (void __user *)args->bos, args->num_bos * sizeof(*bo_buckets)); if (ret) { @@ -2460,7 +2479,7 @@ static int criu_restore_bos(struct kfd_process *p, /* Create and map new 
BOs */ for (; i < args->num_bos; i++) { - ret = criu_restore_bo(p, &bo_buckets[i], &bo_privs[i]); + ret = criu_restore_bo(p, &bo_buckets[i], &bo_privs[i], &files[i]); if (ret) { pr_debug("Failed to restore BO[%d] ret%d\n", i, ret); goto exit; @@ -2475,11 +2494,8 @@ static int criu_restore_bos(struct kfd_process *p, ret = -EFAULT; exit: - while (ret && i--) { - if (bo_buckets[i].alloc_flags - & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) - close_fd(bo_buckets[i].dmabuf_fd); - } + commit_files(files, bo_buckets, i, ret); + kvfree(files); kvfree(bo_buckets); kvfree(bo_privs); return ret; @@ -3349,6 +3365,9 @@ static int kfd_mmio_mmap(struct kfd_node *dev, struct kfd_process *process, if (vma->vm_end - vma->vm_start != PAGE_SIZE) return -EINVAL; + if (PAGE_SIZE > 4096) + return -EINVAL; + address = dev->adev->rmmio_remap.bus_addr; vm_flags_set(vma, VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE | @@ -3356,12 +3375,12 @@ static int kfd_mmio_mmap(struct kfd_node *dev, struct kfd_process *process, vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - pr_debug("pasid 0x%x mapping mmio page\n" + pr_debug("process pid %d mapping mmio page\n" " target user address == 0x%08llX\n" " physical address == 0x%08llX\n" " vm_flags == 0x%04lX\n" " size == 0x%04lX\n", - process->pasid, (unsigned long long) vma->vm_start, + process->lead_thread->pid, (unsigned long long) vma->vm_start, address, vma->vm_flags, PAGE_SIZE); return io_remap_pfn_range(vma, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c index 7f2ae0d15d4a..4a7180b46b71 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c @@ -28,6 +28,7 @@ #include "kfd_topology.h" #include "amdgpu.h" #include "amdgpu_amdkfd.h" +#include "amdgpu_xgmi.h" /* GPU Processor ID base for dGPUs for which VCRAT needs to be created. * GPU processor ID are expressed with Bit[31]=1. 
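The CRIU changes above split dma_buf_fd() into a reserve step and a publish step so that descriptor numbers only become visible to userspace once every buffer has been exported successfully. A minimal sketch of that two-phase idiom using the same kernel primitives (the wrapper names here are illustrative, not the driver's):

	#include <linux/file.h>

	/* Phase 1: reserve an fd number without installing the file. */
	static int reserve_fd(struct file *file, int flags, u32 *out_fd)
	{
		int fd = get_unused_fd_flags(flags);

		if (fd < 0)
			return fd;
		*out_fd = fd;
		return 0;
	}

	/* Phase 2: publish on success, or release both halves on error. */
	static void publish_or_discard(struct file *file, u32 fd, int err)
	{
		if (err) {
			fput(file);		/* drop the file reference */
			put_unused_fd(fd);	/* return the reserved number */
		} else {
			fd_install(fd, file);	/* fd becomes visible here */
		}
	}

commit_files() applies the second phase across the whole bucket array in reverse, which is why a failed checkpoint or restore leaks neither file references nor fd numbers.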
@@ -1422,6 +1423,7 @@ err: static int kfd_fill_gpu_cache_info_from_gfx_config(struct kfd_dev *kdev, + bool cache_line_size_missing, struct kfd_gpu_cache_info *pcache_info) { struct amdgpu_device *adev = kdev->adev; @@ -1434,7 +1436,10 @@ static int kfd_fill_gpu_cache_info_from_gfx_config(struct kfd_dev *kdev, pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE); - pcache_info[0].num_cu_shared = adev->gfx.config.gc_num_tcp_per_wpg / 2; + pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_tcp_per_wpg / 2; + pcache_info[i].cache_line_size = adev->gfx.config.gc_tcp_cache_line_size; + if (cache_line_size_missing && !pcache_info[i].cache_line_size) + pcache_info[i].cache_line_size = 128; i++; } /* Scalar L1 Instruction Cache per SQC */ @@ -1446,6 +1451,9 @@ static int kfd_fill_gpu_cache_info_from_gfx_config(struct kfd_dev *kdev, CRAT_CACHE_FLAGS_INST_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE); pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_sqc_per_wgp * 2; + pcache_info[i].cache_line_size = adev->gfx.config.gc_instruction_cache_line_size; + if (cache_line_size_missing && !pcache_info[i].cache_line_size) + pcache_info[i].cache_line_size = 128; i++; } /* Scalar L1 Data Cache per SQC */ @@ -1456,6 +1464,9 @@ static int kfd_fill_gpu_cache_info_from_gfx_config(struct kfd_dev *kdev, CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE); pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_sqc_per_wgp * 2; + pcache_info[i].cache_line_size = adev->gfx.config.gc_scalar_data_cache_line_size; + if (cache_line_size_missing && !pcache_info[i].cache_line_size) + pcache_info[i].cache_line_size = 64; i++; } /* GL1 Data Cache per SA */ @@ -1468,6 +1479,8 @@ static int kfd_fill_gpu_cache_info_from_gfx_config(struct kfd_dev *kdev, CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE); pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh; + if (cache_line_size_missing) + pcache_info[i].cache_line_size = 128; i++; } /* L2 Data Cache per GPU (Total Tex Cache) */ @@ -1478,6 +1491,9 @@ static int kfd_fill_gpu_cache_info_from_gfx_config(struct kfd_dev *kdev, CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE); pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh; + pcache_info[i].cache_line_size = adev->gfx.config.gc_tcc_cache_line_size; + if (cache_line_size_missing && !pcache_info[i].cache_line_size) + pcache_info[i].cache_line_size = 128; i++; } /* L3 Data Cache per GPU */ @@ -1488,6 +1504,7 @@ static int kfd_fill_gpu_cache_info_from_gfx_config(struct kfd_dev *kdev, CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE); pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh; + pcache_info[i].cache_line_size = 64; i++; } return i; @@ -1503,6 +1520,8 @@ static int kfd_fill_gpu_cache_info_from_gfx_config_v2(struct kfd_dev *kdev, if (adev->gfx.config.gc_tcp_size_per_cu) { pcache_info[i].cache_size = adev->gfx.config.gc_tcp_size_per_cu; pcache_info[i].cache_level = 1; + /* Cacheline size not available in IP discovery for gc943,gc944 */ + pcache_info[i].cache_line_size = 128; pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE); @@ -1514,6 +1533,7 @@ static int kfd_fill_gpu_cache_info_from_gfx_config_v2(struct kfd_dev *kdev, pcache_info[i].cache_size = adev->gfx.config.gc_l1_instruction_cache_size_per_sqc; pcache_info[i].cache_level = 1; + pcache_info[i].cache_line_size = 64; pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_INST_CACHE | 
CRAT_CACHE_FLAGS_SIMD_CACHE); @@ -1524,6 +1544,7 @@ static int kfd_fill_gpu_cache_info_from_gfx_config_v2(struct kfd_dev *kdev, if (adev->gfx.config.gc_l1_data_cache_size_per_sqc) { pcache_info[i].cache_size = adev->gfx.config.gc_l1_data_cache_size_per_sqc; pcache_info[i].cache_level = 1; + pcache_info[i].cache_line_size = 64; pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE); @@ -1534,6 +1555,7 @@ static int kfd_fill_gpu_cache_info_from_gfx_config_v2(struct kfd_dev *kdev, if (adev->gfx.config.gc_tcc_size) { pcache_info[i].cache_size = adev->gfx.config.gc_tcc_size; pcache_info[i].cache_level = 2; + pcache_info[i].cache_line_size = 128; pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE); @@ -1544,6 +1566,7 @@ static int kfd_fill_gpu_cache_info_from_gfx_config_v2(struct kfd_dev *kdev, if (adev->gmc.mall_size) { pcache_info[i].cache_size = adev->gmc.mall_size / 1024; pcache_info[i].cache_level = 3; + pcache_info[i].cache_line_size = 64; pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED | CRAT_CACHE_FLAGS_DATA_CACHE | CRAT_CACHE_FLAGS_SIMD_CACHE); @@ -1556,6 +1579,7 @@ static int kfd_fill_gpu_cache_info_from_gfx_config_v2(struct kfd_dev *kdev, int kfd_get_gpu_cache_info(struct kfd_node *kdev, struct kfd_gpu_cache_info **pcache_info) { int num_of_cache_types = 0; + bool cache_line_size_missing = false; switch (kdev->adev->asic_type) { case CHIP_KAVERI: @@ -1614,6 +1638,8 @@ int kfd_get_gpu_cache_info(struct kfd_node *kdev, struct kfd_gpu_cache_info **pc num_of_cache_types = ARRAY_SIZE(aldebaran_cache_info); break; case IP_VERSION(9, 4, 3): + case IP_VERSION(9, 4, 4): + case IP_VERSION(9, 5, 0): num_of_cache_types = kfd_fill_gpu_cache_info_from_gfx_config_v2(kdev->kfd, *pcache_info); @@ -1677,8 +1703,19 @@ int kfd_get_gpu_cache_info(struct kfd_node *kdev, struct kfd_gpu_cache_info **pc case IP_VERSION(11, 0, 4): case IP_VERSION(11, 5, 0): case IP_VERSION(11, 5, 1): + case IP_VERSION(11, 5, 2): + case IP_VERSION(11, 5, 3): + /* Cacheline size not available in IP discovery for gc11; + * kfd_fill_gpu_cache_info_from_gfx_config hard codes it. + */ + cache_line_size_missing = true; + fallthrough; + case IP_VERSION(12, 0, 0): + case IP_VERSION(12, 0, 1): num_of_cache_types = - kfd_fill_gpu_cache_info_from_gfx_config(kdev->kfd, *pcache_info); + kfd_fill_gpu_cache_info_from_gfx_config(kdev->kfd, + cache_line_size_missing, + *pcache_info); break; default: *pcache_info = dummy_cache_info; @@ -2096,9 +2133,6 @@ static int kfd_fill_gpu_direct_io_link_to_cpu(int *avail_size, bool ext_cpu = KFD_GC_VERSION(kdev) != IP_VERSION(9, 4, 3); int mem_bw = 819200, weight = ext_cpu ? KFD_CRAT_XGMI_WEIGHT : KFD_CRAT_INTRA_SOCKET_WEIGHT; - uint32_t bandwidth = ext_cpu ? 
amdgpu_amdkfd_get_xgmi_bandwidth_mbytes( - kdev->adev, NULL, true) : mem_bw; - /* * with host gpu xgmi link, host can access gpu memory whether * or not pcie bar type is large, so always create bidirectional @@ -2107,8 +2141,16 @@ static int kfd_fill_gpu_direct_io_link_to_cpu(int *avail_size, sub_type_hdr->flags |= CRAT_IOLINK_FLAGS_BI_DIRECTIONAL; sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_XGMI; sub_type_hdr->weight_xgmi = weight; - sub_type_hdr->minimum_bandwidth_mbs = bandwidth; - sub_type_hdr->maximum_bandwidth_mbs = bandwidth; + if (ext_cpu) { + amdgpu_xgmi_get_bandwidth(kdev->adev, NULL, + AMDGPU_XGMI_BW_MODE_PER_LINK, + AMDGPU_XGMI_BW_UNIT_MBYTES, + &sub_type_hdr->minimum_bandwidth_mbs, + &sub_type_hdr->maximum_bandwidth_mbs); + } else { + sub_type_hdr->minimum_bandwidth_mbs = mem_bw; + sub_type_hdr->maximum_bandwidth_mbs = mem_bw; + } } else { sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_PCIEXPRESS; sub_type_hdr->minimum_bandwidth_mbs = @@ -2161,12 +2203,12 @@ static int kfd_fill_gpu_xgmi_link_to_gpu(int *avail_size, if (use_ta_info) { sub_type_hdr->weight_xgmi = KFD_CRAT_XGMI_WEIGHT * - amdgpu_amdkfd_get_xgmi_hops_count(kdev->adev, peer_kdev->adev); - sub_type_hdr->maximum_bandwidth_mbs = - amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(kdev->adev, - peer_kdev->adev, false); - sub_type_hdr->minimum_bandwidth_mbs = sub_type_hdr->maximum_bandwidth_mbs ? - amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(kdev->adev, NULL, true) : 0; + amdgpu_xgmi_get_hops_count(kdev->adev, peer_kdev->adev); + amdgpu_xgmi_get_bandwidth(kdev->adev, peer_kdev->adev, + AMDGPU_XGMI_BW_MODE_PER_PEER, + AMDGPU_XGMI_BW_UNIT_MBYTES, + &sub_type_hdr->minimum_bandwidth_mbs, + &sub_type_hdr->maximum_bandwidth_mbs); } else { bool is_single_hop = kdev->kfd == peer_kdev->kfd; int weight = is_single_hop ? KFD_CRAT_INTRA_SOCKET_WEIGHT : @@ -2210,9 +2252,6 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image, * Modify length and total_entries as subunits are added. 
*/ avail_size -= sizeof(struct crat_header); - if (avail_size < 0) - return -ENOMEM; - memset(crat_table, 0, sizeof(struct crat_header)); memcpy(&crat_table->signature, CRAT_SIGNATURE, @@ -2226,9 +2265,6 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image, * First fill in the sub type header and then sub type data */ avail_size -= sizeof(struct crat_subtype_computeunit); - if (avail_size < 0) - return -ENOMEM; - sub_type_hdr = (struct crat_subtype_generic *)(crat_table + 1); memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit)); @@ -2325,6 +2361,8 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image, continue; if (peer_dev->gpu->kfd->hive_id != kdev->kfd->hive_id) continue; + if (!amdgpu_xgmi_get_is_sharing_enabled(kdev->adev, peer_dev->gpu->adev)) + continue; sub_type_hdr = (typeof(sub_type_hdr))( (char *)sub_type_hdr + sizeof(struct crat_subtype_iolink)); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h index 300634b9f668..a8ca7ecb6d27 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h @@ -42,8 +42,6 @@ #define CRAT_OEMTABLEID_LENGTH 8 #define CRAT_RESERVED_LENGTH 6 -#define CRAT_OEMID_64BIT_MASK ((1ULL << (CRAT_OEMID_LENGTH * 8)) - 1) - /* Compute Unit flags */ #define COMPUTE_UNIT_CPU (1 << 0) /* Create Virtual CRAT for CPU */ #define COMPUTE_UNIT_GPU (1 << 1) /* Create Virtual CRAT for GPU */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c index d889e3545120..ba99e0f258ae 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c @@ -25,6 +25,7 @@ #include "kfd_topology.h" #include <linux/file.h> #include <uapi/linux/kfd_ioctl.h> +#include <uapi/linux/kfd_sysfs.h> #define MAX_WATCH_ADDRESSES 4 @@ -103,7 +104,8 @@ void debug_event_write_work_handler(struct work_struct *work) struct kfd_process, debug_event_workarea); - kernel_write(process->dbg_ev_file, &write_data, 1, &pos); + if (process->debug_trap_enabled && process->dbg_ev_file) + kernel_write(process->dbg_ev_file, &write_data, 1, &pos); } /* update process/device/queue exception status, write to descriptor @@ -202,11 +204,12 @@ bool kfd_set_dbg_ev_from_interrupt(struct kfd_node *dev, size_t exception_data_size) { struct kfd_process *p; + struct kfd_process_device *pdd = NULL; bool signaled_to_debugger_or_runtime = false; - p = kfd_lookup_process_by_pasid(pasid); + p = kfd_lookup_process_by_pasid(pasid, &pdd); - if (!p) + if (!pdd) return false; if (!kfd_dbg_ev_raise(trap_mask, p, dev, doorbell_id, true, @@ -236,9 +239,8 @@ bool kfd_set_dbg_ev_from_interrupt(struct kfd_node *dev, mutex_unlock(&p->mutex); } else if (trap_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) { - kfd_dqm_evict_pasid(dev->dqm, p->pasid); - kfd_signal_vm_fault_event(dev, p->pasid, NULL, - exception_data); + kfd_evict_process_device(pdd); + kfd_signal_vm_fault_event(pdd, NULL, exception_data); signaled_to_debugger_or_runtime = true; } @@ -274,8 +276,8 @@ int kfd_dbg_send_exception_to_runtime(struct kfd_process *p, data = (struct kfd_hsa_memory_exception_data *) pdd->vm_fault_exc_data; - kfd_dqm_evict_pasid(pdd->dev->dqm, p->pasid); - kfd_signal_vm_fault_event(pdd->dev, p->pasid, NULL, data); + kfd_evict_process_device(pdd); + kfd_signal_vm_fault_event(pdd, NULL, data); error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION); } @@ -348,10 +350,27 @@ int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd, bool sq_trap_en) { uint32_t spi_dbg_cntl = 
pdd->spi_dbg_override | pdd->spi_dbg_launch_mode; uint32_t flags = pdd->process->dbg_flags; + struct amdgpu_device *adev = pdd->dev->adev; + int r; if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) return 0; + if (!pdd->proc_ctx_cpu_ptr) { + r = amdgpu_amdkfd_alloc_gtt_mem(adev, + AMDGPU_MES_PROC_CTX_SIZE, + &pdd->proc_ctx_bo, + &pdd->proc_ctx_gpu_addr, + &pdd->proc_ctx_cpu_ptr, + false); + if (r) { + dev_err(adev->dev, + "failed to allocate process context bo\n"); + return r; + } + memset(pdd->proc_ctx_cpu_ptr, 0, AMDGPU_MES_PROC_CTX_SIZE); + } + return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd->proc_ctx_gpu_addr, spi_dbg_cntl, pdd->watch_points, flags, sq_trap_en); } @@ -363,47 +382,47 @@ static int kfd_dbg_get_dev_watch_id(struct kfd_process_device *pdd, int *watch_i *watch_id = KFD_DEBUGGER_INVALID_WATCH_POINT_ID; - spin_lock(&pdd->dev->kfd->watch_points_lock); + spin_lock(&pdd->dev->watch_points_lock); for (i = 0; i < MAX_WATCH_ADDRESSES; i++) { /* device watchpoint in use so skip */ - if ((pdd->dev->kfd->alloc_watch_ids >> i) & 0x1) + if ((pdd->dev->alloc_watch_ids >> i) & 0x1) continue; pdd->alloc_watch_ids |= 0x1 << i; - pdd->dev->kfd->alloc_watch_ids |= 0x1 << i; + pdd->dev->alloc_watch_ids |= 0x1 << i; *watch_id = i; - spin_unlock(&pdd->dev->kfd->watch_points_lock); + spin_unlock(&pdd->dev->watch_points_lock); return 0; } - spin_unlock(&pdd->dev->kfd->watch_points_lock); + spin_unlock(&pdd->dev->watch_points_lock); return -ENOMEM; } static void kfd_dbg_clear_dev_watch_id(struct kfd_process_device *pdd, int watch_id) { - spin_lock(&pdd->dev->kfd->watch_points_lock); + spin_lock(&pdd->dev->watch_points_lock); /* process owns device watch point so safe to clear */ if ((pdd->alloc_watch_ids >> watch_id) & 0x1) { pdd->alloc_watch_ids &= ~(0x1 << watch_id); - pdd->dev->kfd->alloc_watch_ids &= ~(0x1 << watch_id); + pdd->dev->alloc_watch_ids &= ~(0x1 << watch_id); } - spin_unlock(&pdd->dev->kfd->watch_points_lock); + spin_unlock(&pdd->dev->watch_points_lock); } static bool kfd_dbg_owns_dev_watch_id(struct kfd_process_device *pdd, int watch_id) { bool owns_watch_id = false; - spin_lock(&pdd->dev->kfd->watch_points_lock); + spin_lock(&pdd->dev->watch_points_lock); owns_watch_id = watch_id < MAX_WATCH_ADDRESSES && ((pdd->alloc_watch_ids >> watch_id) & 0x1); - spin_unlock(&pdd->dev->kfd->watch_points_lock); + spin_unlock(&pdd->dev->watch_points_lock); return owns_watch_id; } @@ -497,14 +516,24 @@ int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags) int i, r = 0, rewind_count = 0; for (i = 0; i < target->n_pdds; i++) { - if (!kfd_dbg_is_per_vmid_supported(target->pdds[i]->dev) && + struct kfd_topology_device *topo_dev = + kfd_topology_device_by_id(target->pdds[i]->dev->id); + uint32_t caps = topo_dev->node_props.capability; + + if (!(caps & HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED) && (*flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) { *flags = prev_flags; return -EACCES; } + + if (!(caps & HSA_CAP_TRAP_DEBUG_PRECISE_ALU_OPERATIONS_SUPPORTED) && + (*flags & KFD_DBG_TRAP_FLAG_SINGLE_ALU_OP)) { + *flags = prev_flags; + return -EACCES; + } } - target->dbg_flags = *flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP; + target->dbg_flags = *flags; *flags = prev_flags; for (i = 0; i < target->n_pdds; i++) { struct kfd_process_device *pdd = target->pdds[i]; @@ -645,6 +674,7 @@ int kfd_dbg_trap_disable(struct kfd_process *target) else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED) target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED; + 
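/*
 * Illustrative sketch (not from the patch): the watch-point ID scheme
 * the hunks above move from kfd_dev to the per-node kfd_node struct,
 * reduced to its core. A per-device mask guarded by a spinlock hands
 * out up to four IDs, and each process keeps its own mask so only the
 * owner can free one. The example_* names are hypothetical.
 */
#include <linux/spinlock.h>
#include <linux/bits.h>
#include <linux/types.h>
#include <linux/errno.h>

#define EXAMPLE_MAX_WATCH_ADDRESSES 4

struct example_node {
	spinlock_t watch_points_lock;
	u32 alloc_watch_ids;		/* device-wide allocation mask */
};

struct example_pdd {
	struct example_node *dev;
	u32 alloc_watch_ids;		/* IDs owned by this process */
};

static int example_get_watch_id(struct example_pdd *pdd, int *watch_id)
{
	int i;

	spin_lock(&pdd->dev->watch_points_lock);
	for (i = 0; i < EXAMPLE_MAX_WATCH_ADDRESSES; i++) {
		if (pdd->dev->alloc_watch_ids & BIT(i))
			continue;	/* in use somewhere on this device */
		pdd->alloc_watch_ids |= BIT(i);
		pdd->dev->alloc_watch_ids |= BIT(i);
		*watch_id = i;
		spin_unlock(&pdd->dev->watch_points_lock);
		return 0;
	}
	spin_unlock(&pdd->dev->watch_points_lock);
	return -ENOMEM;
}

static void example_put_watch_id(struct example_pdd *pdd, int watch_id)
{
	spin_lock(&pdd->dev->watch_points_lock);
	if (pdd->alloc_watch_ids & BIT(watch_id)) {	/* owner check */
		pdd->alloc_watch_ids &= ~BIT(watch_id);
		pdd->dev->alloc_watch_ids &= ~BIT(watch_id);
	}
	spin_unlock(&pdd->dev->watch_points_lock);
}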
cancel_work_sync(&target->debug_event_workarea); fput(target->dbg_ev_file); target->dbg_ev_file = NULL; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h index fd0ff64d4184..27aa1a5b120f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h @@ -78,6 +78,8 @@ static inline bool kfd_dbg_is_per_vmid_supported(struct kfd_node *dev) { return (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 2) || KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 3) || + KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 4) || + KFD_GC_VERSION(dev) == IP_VERSION(9, 5, 0) || KFD_GC_VERSION(dev) >= IP_VERSION(11, 0, 0)); } @@ -134,6 +136,7 @@ static inline bool kfd_dbg_has_ttmps_always_setup(struct kfd_node *dev) KFD_GC_VERSION(dev) != IP_VERSION(9, 4, 2)) || (KFD_GC_VERSION(dev) >= IP_VERSION(11, 0, 0) && KFD_GC_VERSION(dev) < IP_VERSION(12, 0, 0) && - (dev->adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= 70); + (dev->adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= 70) || + (KFD_GC_VERSION(dev) >= IP_VERSION(12, 0, 0)); } #endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c index 4a5a0a4e00f2..9bde2c64540f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c @@ -27,6 +27,16 @@ #include "kfd_priv.h" static struct dentry *debugfs_root; +static struct dentry *debugfs_proc; +static struct list_head procs; + +struct debugfs_proc_entry { + struct list_head list; + struct dentry *proc_dentry; + pid_t pid; +}; + +#define MAX_DEBUGFS_FILENAME_LEN 32 static int kfd_debugfs_open(struct inode *inode, struct file *file) { @@ -92,6 +102,8 @@ static const struct file_operations kfd_debugfs_hang_hws_fops = { void kfd_debugfs_init(void) { debugfs_root = debugfs_create_dir("kfd", NULL); + debugfs_proc = debugfs_create_dir("proc", debugfs_root); + INIT_LIST_HEAD(&procs); debugfs_create_file("mqds", S_IFREG | 0444, debugfs_root, kfd_debugfs_mqds_by_process, &kfd_debugfs_fops); @@ -107,5 +119,69 @@ void kfd_debugfs_init(void) void kfd_debugfs_fini(void) { + debugfs_remove_recursive(debugfs_proc); debugfs_remove_recursive(debugfs_root); } + +static ssize_t kfd_debugfs_pasid_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct kfd_process_device *pdd = file_inode(file)->i_private; + char tmp[32]; + int len; + + len = snprintf(tmp, sizeof(tmp), "%u\n", pdd->pasid); + + return simple_read_from_buffer(buf, count, ppos, tmp, len); +} + +static const struct file_operations kfd_debugfs_pasid_fops = { + .owner = THIS_MODULE, + .read = kfd_debugfs_pasid_read, +}; + +void kfd_debugfs_add_process(struct kfd_process *p) +{ + int i; + char name[MAX_DEBUGFS_FILENAME_LEN]; + struct debugfs_proc_entry *entry; + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return; + + list_add(&entry->list, &procs); + entry->pid = p->lead_thread->pid; + snprintf(name, MAX_DEBUGFS_FILENAME_LEN, "%d", + (int)entry->pid); + entry->proc_dentry = debugfs_create_dir(name, debugfs_proc); + + /* Create debugfs files for each GPU: + * - proc/<pid>/pasid_<gpuid> + */ + for (i = 0; i < p->n_pdds; i++) { + struct kfd_process_device *pdd = p->pdds[i]; + + snprintf(name, MAX_DEBUGFS_FILENAME_LEN, "pasid_%u", + pdd->dev->id); + debugfs_create_file((const char *)name, S_IFREG | 0444, + entry->proc_dentry, pdd, + &kfd_debugfs_pasid_fops); + } +} + +void kfd_debugfs_remove_process(struct kfd_process *p) +{ + struct debugfs_proc_entry *entry, *next; + + 
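/*
 * Illustrative sketch (not from the patch): the minimal debugfs read
 * pattern the new per-process pasid files above follow - format the
 * value into a small stack buffer and let simple_read_from_buffer()
 * handle offsets and short reads. The example_* names are hypothetical.
 */
#include <linux/debugfs.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/module.h>

static ssize_t example_u32_read(struct file *file, char __user *buf,
				size_t count, loff_t *ppos)
{
	u32 *val = file_inode(file)->i_private;
	char tmp[16];
	int len = snprintf(tmp, sizeof(tmp), "%u\n", *val);

	return simple_read_from_buffer(buf, count, ppos, tmp, len);
}

static const struct file_operations example_fops = {
	.owner = THIS_MODULE,
	.read = example_u32_read,
};

/* usage: debugfs_create_file("pasid_1", 0444, parent, &pasid, &example_fops); */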
mutex_lock(&kfd_processes_mutex); + list_for_each_entry_safe(entry, next, &procs, list) { + if (entry->pid != p->lead_thread->pid) + continue; + + debugfs_remove_recursive(entry->proc_dentry); + list_del(&entry->list); + kfree(entry); + } + mutex_unlock(&kfd_processes_mutex); +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 041ec3de55e7..bf0854bd5555 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -56,6 +56,7 @@ extern const struct kfd2kgd_calls gc_9_4_3_kfd2kgd; extern const struct kfd2kgd_calls gfx_v10_kfd2kgd; extern const struct kfd2kgd_calls gfx_v10_3_kfd2kgd; extern const struct kfd2kgd_calls gfx_v11_kfd2kgd; +extern const struct kfd2kgd_calls gfx_v12_kfd2kgd; static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, unsigned int chunk_size); @@ -83,6 +84,8 @@ static void kfd_device_info_set_sdma_info(struct kfd_dev *kfd) case IP_VERSION(4, 2, 2):/* ARCTURUS */ case IP_VERSION(4, 4, 0):/* ALDEBARAN */ case IP_VERSION(4, 4, 2): + case IP_VERSION(4, 4, 5): + case IP_VERSION(4, 4, 4): case IP_VERSION(5, 0, 0):/* NAVI10 */ case IP_VERSION(5, 0, 1):/* CYAN_SKILLFISH */ case IP_VERSION(5, 0, 2):/* NAVI14 */ @@ -97,6 +100,10 @@ static void kfd_device_info_set_sdma_info(struct kfd_dev *kfd) case IP_VERSION(6, 0, 3): case IP_VERSION(6, 1, 0): case IP_VERSION(6, 1, 1): + case IP_VERSION(6, 1, 2): + case IP_VERSION(6, 1, 3): + case IP_VERSION(7, 0, 0): + case IP_VERSION(7, 0, 1): kfd->device_info.num_sdma_queues_per_engine = 8; break; default: @@ -115,6 +122,10 @@ static void kfd_device_info_set_sdma_info(struct kfd_dev *kfd) case IP_VERSION(6, 0, 3): case IP_VERSION(6, 1, 0): case IP_VERSION(6, 1, 1): + case IP_VERSION(6, 1, 2): + case IP_VERSION(6, 1, 3): + case IP_VERSION(7, 0, 0): + case IP_VERSION(7, 0, 1): /* Reserve 1 for paging and 1 for gfx */ kfd->device_info.num_reserved_sdma_queues_per_engine = 2; /* BIT(0)=engine-0 queue-0; BIT(1)=engine-1 queue-0; BIT(2)=engine-0 queue-1; ... */ @@ -143,6 +154,8 @@ static void kfd_device_info_set_event_interrupt_class(struct kfd_dev *kfd) kfd->device_info.event_interrupt_class = &event_interrupt_class_v9; break; case IP_VERSION(9, 4, 3): /* GC 9.4.3 */ + case IP_VERSION(9, 4, 4): /* GC 9.4.4 */ + case IP_VERSION(9, 5, 0): /* GC 9.5.0 */ kfd->device_info.event_interrupt_class = &event_interrupt_class_v9_4_3; break; @@ -168,6 +181,13 @@ static void kfd_device_info_set_event_interrupt_class(struct kfd_dev *kfd) case IP_VERSION(11, 0, 4): case IP_VERSION(11, 5, 0): case IP_VERSION(11, 5, 1): + case IP_VERSION(11, 5, 2): + case IP_VERSION(11, 5, 3): + kfd->device_info.event_interrupt_class = &event_interrupt_class_v11; + break; + case IP_VERSION(12, 0, 0): + case IP_VERSION(12, 0, 1): + /* GFX12_TODO: Change to v12 version. */ kfd->device_info.event_interrupt_class = &event_interrupt_class_v11; break; default: @@ -220,6 +240,11 @@ static void kfd_device_info_init(struct kfd_dev *kfd, */ kfd->device_info.needs_pci_atomics = true; kfd->device_info.no_atomic_fw_version = kfd->adev->gfx.rs64_enable ? 509 : 0; + } else if (gc_version < IP_VERSION(13, 0, 0)) { + kfd->device_info.needs_pci_atomics = true; + kfd->device_info.no_atomic_fw_version = 2090; + } else { + kfd->device_info.needs_pci_atomics = true; } } else { kfd->device_info.doorbell_size = 4; @@ -327,9 +352,12 @@ struct kfd_dev *kgd2kfd_probe(struct amdgpu_device *adev, bool vf) f2g = &aldebaran_kfd2kgd; break; case IP_VERSION(9, 4, 3): - gfx_target_version = adev->rev_id >= 1 ? 
90402 - : adev->flags & AMD_IS_APU ? 90400 - : 90401; + case IP_VERSION(9, 4, 4): + gfx_target_version = 90402; + f2g = &gc_9_4_3_kfd2kgd; + break; + case IP_VERSION(9, 5, 0): + gfx_target_version = 90500; f2g = &gc_9_4_3_kfd2kgd; break; /* Navi10 */ @@ -408,15 +436,8 @@ struct kfd_dev *kgd2kfd_probe(struct amdgpu_device *adev, bool vf) f2g = &gfx_v11_kfd2kgd; break; case IP_VERSION(11, 0, 3): - if ((adev->pdev->device == 0x7460 && - adev->pdev->revision == 0x00) || - (adev->pdev->device == 0x7461 && - adev->pdev->revision == 0x00)) - /* Note: Compiler version is 11.0.5 while HW version is 11.0.3 */ - gfx_target_version = 110005; - else - /* Note: Compiler version is 11.0.1 while HW version is 11.0.3 */ - gfx_target_version = 110001; + /* Note: Compiler version is 11.0.1 while HW version is 11.0.3 */ + gfx_target_version = 110001; f2g = &gfx_v11_kfd2kgd; break; case IP_VERSION(11, 5, 0): @@ -427,6 +448,22 @@ struct kfd_dev *kgd2kfd_probe(struct amdgpu_device *adev, bool vf) gfx_target_version = 110501; f2g = &gfx_v11_kfd2kgd; break; + case IP_VERSION(11, 5, 2): + gfx_target_version = 110502; + f2g = &gfx_v11_kfd2kgd; + break; + case IP_VERSION(11, 5, 3): + gfx_target_version = 110503; + f2g = &gfx_v11_kfd2kgd; + break; + case IP_VERSION(12, 0, 0): + gfx_target_version = 120000; + f2g = &gfx_v12_kfd2kgd; + break; + case IP_VERSION(12, 0, 1): + gfx_target_version = 120001; + f2g = &gfx_v12_kfd2kgd; + break; default: break; } @@ -435,12 +472,12 @@ struct kfd_dev *kgd2kfd_probe(struct amdgpu_device *adev, bool vf) if (!f2g) { if (amdgpu_ip_version(adev, GC_HWIP, 0)) - dev_err(kfd_device, + dev_info(kfd_device, "GC IP %06x %s not supported in kfd\n", amdgpu_ip_version(adev, GC_HWIP, 0), vf ? "VF" : ""); else - dev_err(kfd_device, "%s %s not supported in kfd\n", + dev_info(kfd_device, "%s %s not supported in kfd\n", amdgpu_asic_name[adev->asic_type], vf ? "VF" : ""); return NULL; } @@ -480,11 +517,16 @@ static void kfd_cwsr_init(struct kfd_dev *kfd) > KFD_CWSR_TMA_OFFSET); kfd->cwsr_isa = cwsr_trap_aldebaran_hex; kfd->cwsr_isa_size = sizeof(cwsr_trap_aldebaran_hex); - } else if (KFD_GC_VERSION(kfd) == IP_VERSION(9, 4, 3)) { + } else if (KFD_GC_VERSION(kfd) == IP_VERSION(9, 4, 3) || + KFD_GC_VERSION(kfd) == IP_VERSION(9, 4, 4)) { BUILD_BUG_ON(sizeof(cwsr_trap_gfx9_4_3_hex) > KFD_CWSR_TMA_OFFSET); kfd->cwsr_isa = cwsr_trap_gfx9_4_3_hex; kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx9_4_3_hex); + } else if (KFD_GC_VERSION(kfd) == IP_VERSION(9, 5, 0)) { + BUILD_BUG_ON(sizeof(cwsr_trap_gfx9_5_0_hex) > PAGE_SIZE); + kfd->cwsr_isa = cwsr_trap_gfx9_5_0_hex; + kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx9_5_0_hex); } else if (KFD_GC_VERSION(kfd) < IP_VERSION(10, 1, 1)) { BUILD_BUG_ON(sizeof(cwsr_trap_gfx9_hex) > KFD_CWSR_TMA_OFFSET); @@ -500,12 +542,17 @@ static void kfd_cwsr_init(struct kfd_dev *kfd) > KFD_CWSR_TMA_OFFSET); kfd->cwsr_isa = cwsr_trap_gfx10_hex; kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx10_hex); - } else { + } else if (KFD_GC_VERSION(kfd) < IP_VERSION(12, 0, 0)) { /* The gfx11 cwsr trap handler must fit inside a single page. 
*/ BUILD_BUG_ON(sizeof(cwsr_trap_gfx11_hex) > PAGE_SIZE); kfd->cwsr_isa = cwsr_trap_gfx11_hex; kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx11_hex); + } else { + BUILD_BUG_ON(sizeof(cwsr_trap_gfx12_hex) + > KFD_CWSR_TMA_OFFSET); + kfd->cwsr_isa = cwsr_trap_gfx12_hex; + kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx12_hex); } kfd->cwsr_enabled = true; @@ -530,15 +577,21 @@ static int kfd_gws_init(struct kfd_node *node) && kfd->mec2_fw_version >= 0x30) || (KFD_GC_VERSION(node) == IP_VERSION(9, 4, 2) && kfd->mec2_fw_version >= 0x28) || - (KFD_GC_VERSION(node) == IP_VERSION(9, 4, 3)) || + (KFD_GC_VERSION(node) == IP_VERSION(9, 4, 3) || + KFD_GC_VERSION(node) == IP_VERSION(9, 4, 4)) || + (KFD_GC_VERSION(node) == IP_VERSION(9, 5, 0)) || (KFD_GC_VERSION(node) >= IP_VERSION(10, 3, 0) && KFD_GC_VERSION(node) < IP_VERSION(11, 0, 0) && kfd->mec2_fw_version >= 0x6b) || (KFD_GC_VERSION(node) >= IP_VERSION(11, 0, 0) && KFD_GC_VERSION(node) < IP_VERSION(12, 0, 0) - && mes_rev >= 68)))) + && mes_rev >= 68) || + (KFD_GC_VERSION(node) >= IP_VERSION(12, 0, 0))))) { + if (KFD_GC_VERSION(node) >= IP_VERSION(12, 0, 0)) + node->adev->gds.gws_size = 64; ret = amdgpu_amdkfd_alloc_gws(node->adev, node->adev->gds.gws_size, &node->gws); + } return ret; } @@ -602,6 +655,14 @@ static void kfd_cleanup_nodes(struct kfd_dev *kfd, unsigned int num_nodes) struct kfd_node *knode; unsigned int i; + /* + * flush_work ensures that there are no outstanding + * work-queue items that will access interrupt_ring. New work items + * can't be created because we stopped interrupt handling above. + */ + flush_workqueue(kfd->ih_wq); + destroy_workqueue(kfd->ih_wq); + for (i = 0; i < num_nodes; i++) { knode = kfd->nodes[i]; device_queue_manager_uninit(knode->dqm); @@ -697,14 +758,14 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, last_vmid_kfd = fls(gpu_resources->compute_vmid_bitmap)-1; vmid_num_kfd = last_vmid_kfd - first_vmid_kfd + 1; - /* For GFX9.4.3, we need special handling for VMIDs depending on - * partition mode. + /* For multi-partition capable GPUs, we need special handling for VMIDs + * depending on partition mode. * In CPX mode, the VMID range needs to be shared between XCDs. * Additionally, there are 13 VMIDs (3-15) available for KFD. To * divide them equally, we change starting VMID to 4 and not use * VMID 3. - * If the VMID range changes for GFX9.4.3, then this code MUST be - * revisited. + * If the VMID range changes for multi-partition capable GPUs, then + * this code MUST be revisited. */ if (kfd->adev->xcp_mgr) { partition_mode = amdgpu_xcp_query_partition_mode(kfd->adev->xcp_mgr, @@ -769,11 +830,12 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, kfd->hive_id = kfd->adev->gmc.xgmi.hive_id; /* - * For GFX9.4.3, the KFD abstracts all partitions within a socket as - * xGMI connected in the topology so assign a unique hive id per - * device based on the pci device location if device is in PCIe mode. + * For multi-partition capable GPUs, the KFD abstracts all partitions + * within a socket as xGMI connected in the topology so assign a unique + * hive id per device based on the pci device location if device is in + * PCIe mode. 
*/ - if (!kfd->hive_id && (KFD_GC_VERSION(kfd) == IP_VERSION(9, 4, 3)) && kfd->num_nodes > 1) + if (!kfd->hive_id && kfd->num_nodes > 1) kfd->hive_id = pci_dev_id(kfd->adev->pdev); kfd->noretry = kfd->adev->gmc.noretry; @@ -811,11 +873,11 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, KFD_XCP_MEMORY_SIZE(node->adev, node->node_id) >> 20); } - if (KFD_GC_VERSION(kfd) == IP_VERSION(9, 4, 3) && - partition_mode == AMDGPU_CPX_PARTITION_MODE && + if (partition_mode == AMDGPU_CPX_PARTITION_MODE && kfd->num_nodes != 1) { - /* For GFX9.4.3 and CPX mode, first XCD gets VMID range - * 4-9 and second XCD gets VMID range 10-15. + /* For multi-partition capable GPUs and CPX mode, first + * XCD gets VMID range 4-9 and second XCD gets VMID + * range 10-15. */ node->vm_info.first_vmid_kfd = (i%2 == 0) ? @@ -839,7 +901,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, amdgpu_amdkfd_get_local_mem_info(kfd->adev, &node->local_mem_info, node->xcp); - if (KFD_GC_VERSION(kfd) == IP_VERSION(9, 4, 3)) + if (kfd->adev->xcp_mgr) kfd_setup_interrupt_bitmap(node, i); /* Initialize the KFD node */ @@ -847,13 +909,14 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, dev_err(kfd_device, "Error initializing KFD node\n"); goto node_init_error; } + + spin_lock_init(&node->watch_points_lock); + kfd->nodes[i] = node; } svm_range_set_max_pages(kfd->adev); - spin_lock_init(&kfd->watch_points_lock); - kfd->init_complete = true; dev_info(kfd_device, "added device %x:%x\n", kfd->adev->pdev->vendor, kfd->adev->pdev->device); @@ -870,7 +933,7 @@ node_alloc_error: kfd_doorbell_error: kfd_gtt_sa_fini(kfd); kfd_gtt_sa_init_error: - amdgpu_amdkfd_free_gtt_mem(kfd->adev, kfd->gtt_mem); + amdgpu_amdkfd_free_gtt_mem(kfd->adev, &kfd->gtt_mem); alloc_gtt_mem_failure: dev_err(kfd_device, "device %x:%x NOT added due to errors\n", @@ -888,13 +951,14 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd) kfd_doorbell_fini(kfd); ida_destroy(&kfd->doorbell_ida); kfd_gtt_sa_fini(kfd); - amdgpu_amdkfd_free_gtt_mem(kfd->adev, kfd->gtt_mem); + amdgpu_amdkfd_free_gtt_mem(kfd->adev, &kfd->gtt_mem); } kfree(kfd); } -int kgd2kfd_pre_reset(struct kfd_dev *kfd) +int kgd2kfd_pre_reset(struct kfd_dev *kfd, + struct amdgpu_reset_context *reset_context) { struct kfd_node *node; int i; @@ -904,8 +968,7 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd) for (i = 0; i < kfd->num_nodes; i++) { node = kfd->nodes[i]; - kfd_smi_event_update_gpu_reset(node, false); - node->dqm->ops.pre_reset(node->dqm); + kfd_smi_event_update_gpu_reset(node, false, reset_context); } kgd2kfd_suspend(kfd, false); @@ -944,7 +1007,7 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd) for (i = 0; i < kfd->num_nodes; i++) { node = kfd->nodes[i]; atomic_set(&node->sram_ecc_flag, 0); - kfd_smi_event_update_gpu_reset(node, true); + kfd_smi_event_update_gpu_reset(node, true, NULL); } return 0; @@ -960,7 +1023,6 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm) { struct kfd_node *node; int i; - int count; if (!kfd->init_complete) return; @@ -968,12 +1030,10 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm) /* for runtime suspend, skip locking kfd */ if (!run_pm) { mutex_lock(&kfd_processes_mutex); - count = ++kfd_locked; - mutex_unlock(&kfd_processes_mutex); - /* For first KFD device suspend all the KFD processes */ - if (count == 1) + if (++kfd_locked == 1) kfd_suspend_all_processes(); + mutex_unlock(&kfd_processes_mutex); } for (i = 0; i < kfd->num_nodes; i++) { @@ -984,7 +1044,7 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm) int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm) { 
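/*
 * Illustrative sketch (not from the patch): the suspend/resume counting
 * pattern the two hunks above converge on. Keeping the counter update
 * and the suspend/resume action under one mutex closes the window in
 * which another CPU could observe the new count before the processes
 * were actually suspended or resumed. The example_* names are
 * hypothetical.
 */
#include <linux/mutex.h>
#include <linux/bug.h>

static DEFINE_MUTEX(example_lock);
static int example_locked;

static void example_suspend_all_processes(void) { /* evict user queues */ }
static int example_resume_all_processes(void) { return 0; /* restore queues */ }

static void example_suspend(void)
{
	mutex_lock(&example_lock);
	/* first device going down suspends every process */
	if (++example_locked == 1)
		example_suspend_all_processes();
	mutex_unlock(&example_lock);
}

static int example_resume(void)
{
	int ret = 0;

	mutex_lock(&example_lock);
	/* last device coming back resumes every process */
	if (--example_locked == 0)
		ret = example_resume_all_processes();
	WARN_ONCE(example_locked < 0, "suspend/resume ref. error");
	mutex_unlock(&example_lock);
	return ret;
}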
- int ret, count, i; + int ret, i; if (!kfd->init_complete) return 0; @@ -998,12 +1058,10 @@ int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm) /* for runtime resume, skip unlocking kfd */ if (!run_pm) { mutex_lock(&kfd_processes_mutex); - count = --kfd_locked; - mutex_unlock(&kfd_processes_mutex); - - WARN_ONCE(count < 0, "KFD suspend / resume ref. error"); - if (count == 0) + if (--kfd_locked == 0) ret = kfd_resume_all_processes(); + WARN_ONCE(kfd_locked < 0, "KFD suspend / resume ref. error"); + mutex_unlock(&kfd_processes_mutex); } return ret; @@ -1022,21 +1080,6 @@ static int kfd_resume(struct kfd_node *node) return err; } -static inline void kfd_queue_work(struct workqueue_struct *wq, - struct work_struct *work) -{ - int cpu, new_cpu; - - cpu = new_cpu = smp_processor_id(); - do { - new_cpu = cpumask_next(new_cpu, cpu_online_mask) % nr_cpu_ids; - if (cpu_to_node(new_cpu) == numa_node_id()) - break; - } while (cpu != new_cpu); - - queue_work_on(new_cpu, wq, work); -} - /* This is called directly from KGD at ISR. */ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry) { @@ -1062,7 +1105,7 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry) patched_ihre, &is_patched) && enqueue_ih_ring_entry(node, is_patched ? patched_ihre : ih_ring_entry)) { - kfd_queue_work(node->ih_wq, &node->interrupt_work); + queue_work(node->kfd->ih_wq, &node->interrupt_work); spin_unlock_irqrestore(&node->interrupt_lock, flags); return; } @@ -1359,6 +1402,13 @@ void kfd_dec_compute_active(struct kfd_node *node) WARN_ONCE(count < 0, "Compute profile ref. count error"); } +static bool kfd_compute_active(struct kfd_node *node) +{ + if (atomic_read(&node->kfd->compute_profile)) + return true; + return false; +} + void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint64_t throttle_bitmask) { /* @@ -1413,6 +1463,130 @@ void kgd2kfd_unlock_kfd(void) mutex_unlock(&kfd_processes_mutex); } +int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id) +{ + struct kfd_node *node; + int ret; + + if (!kfd->init_complete) + return 0; + + if (node_id >= kfd->num_nodes) { + dev_warn(kfd->adev->dev, "Invalid node ID: %u exceeds %u\n", + node_id, kfd->num_nodes - 1); + return -EINVAL; + } + node = kfd->nodes[node_id]; + + ret = node->dqm->ops.unhalt(node->dqm); + if (ret) + dev_err(kfd_device, "Error in starting scheduler\n"); + + return ret; +} + +int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id) +{ + struct kfd_node *node; + + if (!kfd->init_complete) + return 0; + + if (node_id >= kfd->num_nodes) { + dev_warn(kfd->adev->dev, "Invalid node ID: %u exceeds %u\n", + node_id, kfd->num_nodes - 1); + return -EINVAL; + } + + node = kfd->nodes[node_id]; + return node->dqm->ops.halt(node->dqm); +} + +bool kgd2kfd_compute_active(struct kfd_dev *kfd, uint32_t node_id) +{ + struct kfd_node *node; + + if (!kfd->init_complete) + return false; + + if (node_id >= kfd->num_nodes) { + dev_warn(kfd->adev->dev, "Invalid node ID: %u exceeds %u\n", + node_id, kfd->num_nodes - 1); + return false; + } + + node = kfd->nodes[node_id]; + + return kfd_compute_active(node); +} + +/** + * kgd2kfd_vmfault_fast_path() - KFD vm page fault interrupt handling fast path for gmc v9 + * @adev: amdgpu device + * @entry: vm fault interrupt vector + * @retry_fault: if this is retry fault + * + * retry fault - + * with CAM enabled, adev primary ring + * | gmc_v9_0_process_interrupt() + * adev soft_ring + * | gmc_v9_0_process_interrupt() worker failed to recover page fault + * KFD node ih_fifo + * | KFD 
interrupt_wq worker + * kfd_signal_vm_fault_event + * + * without CAM, adev primary ring1 + * | gmc_v9_0_process_interrupt worker failed to recover page fault + * KFD node ih_fifo + * | KFD interrupt_wq worker + * kfd_signal_vm_fault_event + * + * no-retry fault - + * adev primary ring + * | gmc_v9_0_process_interrupt() + * KFD node ih_fifo + * | KFD interrupt_wq worker + * kfd_signal_vm_fault_event + * + * fast path - After kfd_signal_vm_fault_event, gmc_v9_0_process_interrupt drops page faults + * of the same process and does not copy the interrupt to the KFD node ih_fifo. + * With the gdb debugger enabled, the retry fault must be converted to a no-retry fault + * for the debugger, so the fast path cannot be used. + * + * Return: + * true - use the fast path to handle this fault + * false - use normal path to handle it + */ +bool kgd2kfd_vmfault_fast_path(struct amdgpu_device *adev, struct amdgpu_iv_entry *entry, + bool retry_fault) +{ + struct kfd_process *p; + u32 cam_index; + + if (entry->ih == &adev->irq.ih_soft || entry->ih == &adev->irq.ih1) { + p = kfd_lookup_process_by_pasid(entry->pasid, NULL); + if (!p) + return true; + + if (p->gpu_page_fault && !p->debug_trap_enabled) { + if (retry_fault && adev->irq.retry_cam_enabled) { + cam_index = entry->src_data[2] & 0x3ff; + WDOORBELL32(adev->irq.retry_cam_doorbell_index, cam_index); + } + + kfd_unref_process(p); + return true; + } + + /* + * This is the first page fault; set the flag and then signal user space + */ + p->gpu_page_fault = true; + kfd_unref_process(p); + } + return false; +} + #if defined(CONFIG_DEBUG_FS) /* This function will send a package to HIQ to hang the HWS @@ -1425,6 +1599,11 @@ int kfd_debugfs_hang_hws(struct kfd_node *dev) return -EINVAL; } + if (dev->kfd->shared_resources.enable_mes) { + dev_err(dev->adev->dev, "Inducing MES hang is not supported\n"); + return -EINVAL; + } + return dqm_debugfs_hang_hws(dev->dqm); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index f4d395e38683..76359c6a3f3a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -35,12 +35,16 @@ #include "cik_regs.h" #include "kfd_kernel_queue.h" #include "amdgpu_amdkfd.h" -#include "mes_api_def.h" +#include "amdgpu_reset.h" +#include "amdgpu_sdma.h" +#include "mes_v11_api_def.h" #include "kfd_debug.h" /* Size of the per-pipe EOP queue */ #define CIK_HPD_EOP_BYTES_LOG2 11 #define CIK_HPD_EOP_BYTES (1U << CIK_HPD_EOP_BYTES_LOG2) +/* See unmap_queues_cpsch() */ +#define USE_DEFAULT_GRACE_PERIOD 0xffffffff static int set_pasid_vmid_mapping(struct device_queue_manager *dqm, u32 pasid, unsigned int vmid); @@ -65,7 +69,8 @@ static inline void deallocate_hqd(struct device_queue_manager *dqm, static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q); static int allocate_sdma_queue(struct device_queue_manager *dqm, struct queue *q, const uint32_t *restore_sdma_id); -static void kfd_process_hw_exception(struct work_struct *work); + +static int reset_queues_on_hws_hang(struct device_queue_manager *dqm, bool is_sdma); static inline enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type) @@ -152,17 +157,24 @@ void program_sh_mem_settings(struct device_queue_manager *dqm, static void kfd_hws_hang(struct device_queue_manager *dqm) { + struct device_process_node *cur; + struct qcm_process_device *qpd; + struct queue *q; + + /* Mark all device queues as reset. 
*/ + list_for_each_entry(cur, &dqm->queues, list) { + qpd = cur->qpd; + list_for_each_entry(q, &qpd->queues_list, list) { + struct kfd_process_device *pdd = qpd_to_pdd(qpd); + + pdd->has_reset_queue = true; + } + } + /* * Issue a GPU reset if HWS is unresponsive */ - dqm->is_hws_hang = true; - - /* It's possible we're detecting a HWS hang in the - * middle of a GPU reset. No need to schedule another - * reset in this case. - */ - if (!dqm->is_resetting) - schedule_work(&dqm->hw_exception_work); + amdgpu_amdkfd_gpu_reset(dqm->dev->adev); } static int convert_to_mes_queue_type(int queue_type) @@ -194,11 +206,13 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q, int r, queue_type; uint64_t wptr_addr_off; - if (dqm->is_hws_hang) + if (!dqm->sched_running || dqm->sched_halt) + return 0; + if (!down_read_trylock(&adev->reset_domain->sem)) return -EIO; memset(&queue_input, 0x0, sizeof(struct mes_add_queue_input)); - queue_input.process_id = qpd->pqm->process->pasid; + queue_input.process_id = pdd->pasid; queue_input.page_table_base_addr = qpd->page_table_base; queue_input.process_va_start = 0; queue_input.process_va_end = adev->vm_manager.max_pfn - 1; @@ -214,10 +228,8 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q, queue_input.mqd_addr = q->gart_mqd_addr; queue_input.wptr_addr = (uint64_t)q->properties.write_ptr; - if (q->wptr_bo) { - wptr_addr_off = (uint64_t)q->properties.write_ptr & (PAGE_SIZE - 1); - queue_input.wptr_mc_addr = amdgpu_bo_gpu_offset(q->wptr_bo) + wptr_addr_off; - } + wptr_addr_off = (uint64_t)q->properties.write_ptr & (PAGE_SIZE - 1); + queue_input.wptr_mc_addr = amdgpu_bo_gpu_offset(q->properties.wptr_bo) + wptr_addr_off; queue_input.is_kfd_process = 1; queue_input.is_aql_queue = (q->properties.format == KFD_QUEUE_FORMAT_AQL); @@ -236,6 +248,7 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q, if (queue_type < 0) { dev_err(adev->dev, "Queue type not supported with MES, queue:%d\n", q->properties.type); + up_read(&adev->reset_domain->sem); return -EINVAL; } queue_input.queue_type = (uint32_t)queue_type; @@ -245,6 +258,7 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q, amdgpu_mes_lock(&adev->mes); r = adev->mes.funcs->add_hw_queue(&adev->mes, &queue_input); amdgpu_mes_unlock(&adev->mes); + up_read(&adev->reset_domain->sem); if (r) { dev_err(adev->dev, "failed to add hardware queue to MES, doorbell=0x%x\n", q->properties.doorbell_off); @@ -262,7 +276,9 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q, int r; struct mes_remove_queue_input queue_input; - if (dqm->is_hws_hang) + if (!dqm->sched_running || dqm->sched_halt) + return 0; + if (!down_read_trylock(&adev->reset_domain->sem)) return -EIO; memset(&queue_input, 0x0, sizeof(struct mes_remove_queue_input)); @@ -272,6 +288,7 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q, amdgpu_mes_lock(&adev->mes); r = adev->mes.funcs->remove_hw_queue(&adev->mes, &queue_input); amdgpu_mes_unlock(&adev->mes); + up_read(&adev->reset_domain->sem); if (r) { dev_err(adev->dev, "failed to remove hardware queue from MES, doorbell=0x%x\n", @@ -283,7 +300,7 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q, return r; } -static int remove_all_queues_mes(struct device_queue_manager *dqm) +static int remove_all_kfd_queues_mes(struct device_queue_manager *dqm) { struct device_process_node *cur; struct device *dev = dqm->dev->adev->dev; @@ -310,6 
+327,73 @@ static int remove_all_kfd_queues_mes(struct device_queue_manager *dqm) return retval; } +static int add_all_kfd_queues_mes(struct device_queue_manager *dqm) +{ + struct device_process_node *cur; + struct device *dev = dqm->dev->adev->dev; + struct qcm_process_device *qpd; + struct queue *q; + int retval = 0; + + list_for_each_entry(cur, &dqm->queues, list) { + qpd = cur->qpd; + list_for_each_entry(q, &qpd->queues_list, list) { + if (!q->properties.is_active) + continue; + retval = add_queue_mes(dqm, q, qpd); + if (retval) { + dev_err(dev, "%s: Failed to add queue %d for dev %d", + __func__, + q->properties.queue_id, + dqm->dev->id); + return retval; + } + } + } + + return retval; +} + +static int suspend_all_queues_mes(struct device_queue_manager *dqm) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev; + int r = 0; + + if (!down_read_trylock(&adev->reset_domain->sem)) + return -EIO; + + r = amdgpu_mes_suspend(adev); + up_read(&adev->reset_domain->sem); + + if (r) { + dev_err(adev->dev, "failed to suspend gangs from MES\n"); + dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n"); + kfd_hws_hang(dqm); + } + + return r; +} + +static int resume_all_queues_mes(struct device_queue_manager *dqm) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev; + int r = 0; + + if (!down_read_trylock(&adev->reset_domain->sem)) + return -EIO; + + r = amdgpu_mes_resume(adev); + up_read(&adev->reset_domain->sem); + + if (r) { + dev_err(adev->dev, "failed to resume gangs from MES\n"); + dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n"); + kfd_hws_hang(dqm); + } + + return r; +} + static void increment_queue_count(struct device_queue_manager *dqm, struct qcm_process_device *qpd, struct queue *q) @@ -447,6 +531,7 @@ static int allocate_vmid(struct device_queue_manager *dqm, struct qcm_process_device *qpd, struct queue *q) { + struct kfd_process_device *pdd = qpd_to_pdd(qpd); struct device *dev = dqm->dev->adev->dev; int allocated_vmid = -1, i; @@ -465,9 +550,9 @@ static int allocate_vmid(struct device_queue_manager *dqm, pr_debug("vmid allocated: %d\n", allocated_vmid); - dqm->vmid_pasid[allocated_vmid] = q->process->pasid; + dqm->vmid_pasid[allocated_vmid] = pdd->pasid; - set_pasid_vmid_mapping(dqm, q->process->pasid, allocated_vmid); + set_pasid_vmid_mapping(dqm, pdd->pasid, allocated_vmid); qpd->vmid = allocated_vmid; q->properties.vmid = allocated_vmid; @@ -719,6 +804,11 @@ static int dbgdev_wave_reset_wavefronts(struct kfd_node *dev, struct kfd_process return -EOPNOTSUPP; } + /* take the VMID for that process the safe way, using the PDD */ + pdd = kfd_get_process_device_data(dev, p); + if (!pdd) + return -EFAULT; + /* Scan all registers in the range ATC_VMID8_PASID_MAPPING .. * ATC_VMID15_PASID_MAPPING * to check which VMID the current process is mapped to. 
@@ -728,23 +818,19 @@ static int dbgdev_wave_reset_wavefronts(struct kfd_node *dev, struct kfd_process status = dev->kfd2kgd->get_atc_vmid_pasid_mapping_info (dev->adev, vmid, &queried_pasid); - if (status && queried_pasid == p->pasid) { - pr_debug("Killing wave fronts of vmid %d and pasid 0x%x\n", - vmid, p->pasid); + if (status && queried_pasid == pdd->pasid) { + pr_debug("Killing wave fronts of vmid %d and process pid %d\n", + vmid, p->lead_thread->pid); break; } } if (vmid > last_vmid_to_scan) { - dev_err(dev->adev->dev, "Didn't find vmid for pasid 0x%x\n", p->pasid); + dev_err(dev->adev->dev, "Didn't find vmid for process pid %d\n", + p->lead_thread->pid); return -EFAULT; } - /* taking the VMID for that process on the safe way using PDD */ - pdd = kfd_get_process_device_data(dev, p); - if (!pdd) - return -EFAULT; - reg_gfx_index.bits.sh_broadcast_writes = 1; reg_gfx_index.bits.se_broadcast_writes = 1; reg_gfx_index.bits.instance_broadcast_writes = 1; @@ -883,6 +969,12 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q, else if (prev_active) retval = remove_queue_mes(dqm, q, &pdd->qpd); + /* queue was reset, so it is inaccessible */ + if (pdd->has_reset_queue) { + retval = -EACCES; + goto out_unlock; + } + if (retval) { dev_err(dev, "unmap queue failed\n"); goto out_unlock; @@ -974,8 +1066,8 @@ static int suspend_single_queue(struct device_queue_manager *dqm, if (q->properties.is_suspended) return 0; - pr_debug("Suspending PASID %u queue [%i]\n", - pdd->process->pasid, + pr_debug("Suspending process pid %d queue [%i]\n", + pdd->process->lead_thread->pid, q->properties.queue_id); is_new = q->properties.exception_status & KFD_EC_MASK(EC_QUEUE_NEW); @@ -1022,8 +1114,8 @@ static int resume_single_queue(struct device_queue_manager *dqm, pdd = qpd_to_pdd(qpd); - pr_debug("Restoring from suspend PASID %u queue [%i]\n", - pdd->process->pasid, + pr_debug("Restoring from suspend process pid %d queue [%i]\n", + pdd->process->lead_thread->pid, q->properties.queue_id); q->properties.is_suspended = false; @@ -1056,8 +1148,8 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm, goto out; pdd = qpd_to_pdd(qpd); - pr_debug_ratelimited("Evicting PASID 0x%x queues\n", - pdd->process->pasid); + pr_debug_ratelimited("Evicting process pid %d queues\n", + pdd->process->lead_thread->pid); pdd->last_evict_timestamp = get_jiffies_64(); /* Mark all queues as evicted. Deactivate all active queues on @@ -1114,8 +1206,8 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm, if (!pdd->drm_priv) goto out; - pr_debug_ratelimited("Evicting PASID 0x%x queues\n", - pdd->process->pasid); + pr_debug_ratelimited("Evicting process pid %d queues\n", + pdd->process->lead_thread->pid); /* Mark all queues as evicted. Deactivate all active queues on * the qpd. 
@@ -1129,11 +1221,13 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm, decrement_queue_count(dqm, qpd, q); if (dqm->dev->kfd->shared_resources.enable_mes) { - retval = remove_queue_mes(dqm, q, qpd); - if (retval) { + int err; + + err = remove_queue_mes(dqm, q, qpd); + if (err) { dev_err(dev, "Failed to evict queue %d\n", q->properties.queue_id); - goto out; + retval = err; } } } @@ -1173,8 +1267,8 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm, goto out; } - pr_debug_ratelimited("Restoring PASID 0x%x queues\n", - pdd->process->pasid); + pr_debug_ratelimited("Restoring process pid %d queues\n", + pdd->process->lead_thread->pid); /* Update PD Base in QPD */ qpd->page_table_base = pd_base; @@ -1257,8 +1351,8 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm, if (!pdd->drm_priv) goto vm_not_acquired; - pr_debug_ratelimited("Restoring PASID 0x%x queues\n", - pdd->process->pasid); + pr_debug_ratelimited("Restoring process pid %d queues\n", + pdd->process->lead_thread->pid); /* Update PD Base in QPD */ qpd->page_table_base = amdgpu_amdkfd_gpuvm_get_process_page_dir(pdd->drm_priv); @@ -1468,20 +1562,13 @@ static int stop_nocpsch(struct device_queue_manager *dqm) } if (dqm->dev->adev->asic_type == CHIP_HAWAII) - pm_uninit(&dqm->packet_mgr, false); + pm_uninit(&dqm->packet_mgr); dqm->sched_running = false; dqm_unlock(dqm); return 0; } -static void pre_reset(struct device_queue_manager *dqm) -{ - dqm_lock(dqm); - dqm->is_resetting = true; - dqm_unlock(dqm); -} - static int allocate_sdma_queue(struct device_queue_manager *dqm, struct queue *q, const uint32_t *restore_sdma_id) { @@ -1489,8 +1576,9 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm, int bit; if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { - if (bitmap_empty(dqm->sdma_bitmap, KFD_MAX_SDMA_QUEUES)) { - dev_err(dev, "No more SDMA queue to allocate\n"); + if (bitmap_empty(dqm->sdma_bitmap, get_num_sdma_queues(dqm))) { + dev_warn(dev, "No more SDMA queue to allocate (%d total queues)\n", + get_num_sdma_queues(dqm)); return -ENOMEM; } @@ -1515,8 +1603,9 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm, q->properties.sdma_queue_id = q->sdma_id / kfd_get_num_sdma_engines(dqm->dev); } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) { - if (bitmap_empty(dqm->xgmi_sdma_bitmap, KFD_MAX_SDMA_QUEUES)) { - dev_err(dev, "No more XGMI SDMA queue to allocate\n"); + if (bitmap_empty(dqm->xgmi_sdma_bitmap, get_num_xgmi_sdma_queues(dqm))) { + dev_warn(dev, "No more XGMI SDMA queue to allocate (%d total queues)\n", + get_num_xgmi_sdma_queues(dqm)); return -ENOMEM; } if (restore_sdma_id) { @@ -1544,6 +1633,41 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm, q->sdma_id % kfd_get_num_xgmi_sdma_engines(dqm->dev); q->properties.sdma_queue_id = q->sdma_id / kfd_get_num_xgmi_sdma_engines(dqm->dev); + } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_BY_ENG_ID) { + int i, num_queues, num_engines, eng_offset = 0, start_engine; + bool free_bit_found = false, is_xgmi = false; + + if (q->properties.sdma_engine_id < kfd_get_num_sdma_engines(dqm->dev)) { + num_queues = get_num_sdma_queues(dqm); + num_engines = kfd_get_num_sdma_engines(dqm->dev); + q->properties.type = KFD_QUEUE_TYPE_SDMA; + } else { + num_queues = get_num_xgmi_sdma_queues(dqm); + num_engines = kfd_get_num_xgmi_sdma_engines(dqm->dev); + eng_offset = kfd_get_num_sdma_engines(dqm->dev); + q->properties.type = KFD_QUEUE_TYPE_SDMA_XGMI; + is_xgmi = true; + } + + /* 
Scan for an available bit based on the target engine ID. */ + start_engine = q->properties.sdma_engine_id - eng_offset; + for (i = start_engine; i < num_queues; i += num_engines) { + + if (!test_bit(i, is_xgmi ? dqm->xgmi_sdma_bitmap : dqm->sdma_bitmap)) + continue; + + clear_bit(i, is_xgmi ? dqm->xgmi_sdma_bitmap : dqm->sdma_bitmap); + q->sdma_id = i; + q->properties.sdma_queue_id = q->sdma_id / num_engines; + free_bit_found = true; + break; + } + + if (!free_bit_found) { + dev_warn(dev, "No more SDMA queue to allocate for target ID %i (%d total queues)\n", + q->properties.sdma_engine_id, num_queues); + return -ENOMEM; + } } pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id); @@ -1624,22 +1748,75 @@ static int initialize_cpsch(struct device_queue_manager *dqm) dqm->active_cp_queue_count = 0; dqm->gws_queue_count = 0; dqm->active_runlist = false; - INIT_WORK(&dqm->hw_exception_work, kfd_process_hw_exception); dqm->trap_debug_vmid = 0; init_sdma_bitmaps(dqm); - if (dqm->dev->kfd2kgd->get_iq_wait_times) - dqm->dev->kfd2kgd->get_iq_wait_times(dqm->dev->adev, - &dqm->wait_times, - ffs(dqm->dev->xcc_mask) - 1); + update_dqm_wait_times(dqm); return 0; } +/* halt_cpsch: + * Unmap queues so the scheduler doesn't continue the remaining jobs in the queue. + * Then set dqm->sched_halt so queues don't map to the runlist until unhalt_cpsch + * is called. + */ +static int halt_cpsch(struct device_queue_manager *dqm) +{ + int ret = 0; + + dqm_lock(dqm); + if (!dqm->sched_running) { + dqm_unlock(dqm); + return 0; + } + + WARN_ONCE(dqm->sched_halt, "Scheduling is already on halt\n"); + + if (!dqm->is_hws_hang) { + if (!dqm->dev->kfd->shared_resources.enable_mes) + ret = unmap_queues_cpsch(dqm, + KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, + USE_DEFAULT_GRACE_PERIOD, false); + else + ret = remove_all_kfd_queues_mes(dqm); + } + dqm->sched_halt = true; + dqm_unlock(dqm); + + return ret; +} + +/* unhalt_cpsch + * Unset dqm->sched_halt and map queues back to the runlist + */ +static int unhalt_cpsch(struct device_queue_manager *dqm) +{ + int ret = 0; + + dqm_lock(dqm); + if (!dqm->sched_running || !dqm->sched_halt) { + WARN_ONCE(!dqm->sched_halt, "Scheduling is not on halt.\n"); + dqm_unlock(dqm); + return 0; + } + dqm->sched_halt = false; + if (!dqm->dev->kfd->shared_resources.enable_mes) + ret = execute_queues_cpsch(dqm, + KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, + 0, USE_DEFAULT_GRACE_PERIOD); + else + ret = add_all_kfd_queues_mes(dqm); + + dqm_unlock(dqm); + + return ret; +} + static int start_cpsch(struct device_queue_manager *dqm) { struct device *dev = dqm->dev->adev->dev; - int retval; + int retval, num_hw_queue_slots; retval = 0; @@ -1669,38 +1846,37 @@ static int start_cpsch(struct device_queue_manager *dqm) init_interrupts(dqm); /* clear hang status when driver try to start the hw scheduler */ - dqm->is_hws_hang = false; - dqm->is_resetting = false; dqm->sched_running = true; - if (!dqm->dev->kfd->shared_resources.enable_mes) + if (!dqm->dev->kfd->shared_resources.enable_mes) { + if (pm_config_dequeue_wait_counts(&dqm->packet_mgr, + KFD_DEQUEUE_WAIT_INIT, 0 /* unused */)) + dev_err(dev, "Setting optimized dequeue wait failed. 
Using default values\n"); execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD); + } - /* Set CWSR grace period to 1x1000 cycle for GFX9.4.3 APU */ - if (amdgpu_emu_mode == 0 && dqm->dev->adev->gmc.is_app_apu && - (KFD_GC_VERSION(dqm->dev) == IP_VERSION(9, 4, 3))) { - uint32_t reg_offset = 0; - uint32_t grace_period = 1; + /* setup per-queue reset detection buffer */ + num_hw_queue_slots = dqm->dev->kfd->shared_resources.num_queue_per_pipe * + dqm->dev->kfd->shared_resources.num_pipe_per_mec * + NUM_XCC(dqm->dev->xcc_mask); - retval = pm_update_grace_period(&dqm->packet_mgr, - grace_period); - if (retval) - dev_err(dev, "Setting grace timeout failed\n"); - else if (dqm->dev->kfd2kgd->build_grace_period_packet_info) - /* Update dqm->wait_times maintained in software */ - dqm->dev->kfd2kgd->build_grace_period_packet_info( - dqm->dev->adev, dqm->wait_times, - grace_period, ®_offset, - &dqm->wait_times); + dqm->detect_hang_info_size = num_hw_queue_slots * sizeof(struct dqm_detect_hang_info); + dqm->detect_hang_info = kzalloc(dqm->detect_hang_info_size, GFP_KERNEL); + + if (!dqm->detect_hang_info) { + retval = -ENOMEM; + goto fail_detect_hang_buffer; } dqm_unlock(dqm); return 0; +fail_detect_hang_buffer: + kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); fail_allocate_vidmem: fail_set_sched_resources: if (!dqm->dev->kfd->shared_resources.enable_mes) - pm_uninit(&dqm->packet_mgr, false); + pm_uninit(&dqm->packet_mgr); fail_packet_manager_init: dqm_unlock(dqm); return retval; @@ -1708,22 +1884,17 @@ fail_packet_manager_init: static int stop_cpsch(struct device_queue_manager *dqm) { - bool hanging; - dqm_lock(dqm); if (!dqm->sched_running) { dqm_unlock(dqm); return 0; } - if (!dqm->is_hws_hang) { - if (!dqm->dev->kfd->shared_resources.enable_mes) - unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD, false); - else - remove_all_queues_mes(dqm); - } + if (!dqm->dev->kfd->shared_resources.enable_mes) + unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD, false); + else + remove_all_kfd_queues_mes(dqm); - hanging = dqm->is_hws_hang || dqm->is_resetting; dqm->sched_running = false; if (!dqm->dev->kfd->shared_resources.enable_mes) @@ -1731,7 +1902,9 @@ static int stop_cpsch(struct device_queue_manager *dqm) kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); if (!dqm->dev->kfd->shared_resources.enable_mes) - pm_uninit(&dqm->packet_mgr, hanging); + pm_uninit(&dqm->packet_mgr); + kfree(dqm->detect_hang_info); + dqm->detect_hang_info = NULL; dqm_unlock(dqm); return 0; @@ -1803,7 +1976,8 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, } if (q->properties.type == KFD_QUEUE_TYPE_SDMA || - q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) { + q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI || + q->properties.type == KFD_QUEUE_TYPE_SDMA_BY_ENG_ID) { dqm_lock(dqm); retval = allocate_sdma_queue(dqm, q, qd ? 
&qd->sdma_id : NULL); dqm_unlock(dqm); @@ -1900,7 +2074,7 @@ int amdkfd_fence_wait_timeout(struct device_queue_manager *dqm, { unsigned long end_jiffies = msecs_to_jiffies(timeout_ms) + jiffies; struct device *dev = dqm->dev->adev->dev; - uint64_t *fence_addr = dqm->fence_addr; + uint64_t *fence_addr = dqm->fence_addr; while (*fence_addr != fence_value) { /* Fatal err detected, this response won't come */ @@ -1930,7 +2104,7 @@ static int map_queues_cpsch(struct device_queue_manager *dqm) struct device *dev = dqm->dev->adev->dev; int retval; - if (!dqm->sched_running) + if (!dqm->sched_running || dqm->sched_halt) return 0; if (dqm->active_queue_count <= 0 || dqm->processes_count <= 0) return 0; @@ -1948,7 +2122,240 @@ static int map_queues_cpsch(struct device_queue_manager *dqm) return retval; } -/* dqm->lock mutex has to be locked before calling this function */ +static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd) +{ + struct kfd_process_device *pdd = qpd_to_pdd(qpd); + + dev_err(dqm->dev->adev->dev, "queue id 0x%0x at process pid %d is reset\n", + q->properties.queue_id, pdd->process->lead_thread->pid); + + pdd->has_reset_queue = true; + if (q->properties.is_active) { + q->properties.is_active = false; + decrement_queue_count(dqm, qpd, q); + } +} + +static int detect_queue_hang(struct device_queue_manager *dqm) +{ + int i; + + /* detect should be used only in dqm locked queue reset */ + if (WARN_ON(dqm->detect_hang_count > 0)) + return 0; + + memset(dqm->detect_hang_info, 0, dqm->detect_hang_info_size); + + for (i = 0; i < AMDGPU_MAX_QUEUES; ++i) { + uint32_t mec, pipe, queue; + int xcc_id; + + mec = (i / dqm->dev->kfd->shared_resources.num_queue_per_pipe) + / dqm->dev->kfd->shared_resources.num_pipe_per_mec; + + if (mec || !test_bit(i, dqm->dev->kfd->shared_resources.cp_queue_bitmap)) + continue; + + amdgpu_queue_mask_bit_to_mec_queue(dqm->dev->adev, i, &mec, &pipe, &queue); + + for_each_inst(xcc_id, dqm->dev->xcc_mask) { + uint64_t queue_addr = dqm->dev->kfd2kgd->hqd_get_pq_addr( + dqm->dev->adev, pipe, queue, xcc_id); + struct dqm_detect_hang_info hang_info; + + if (!queue_addr) + continue; + + hang_info.pipe_id = pipe; + hang_info.queue_id = queue; + hang_info.xcc_id = xcc_id; + hang_info.queue_address = queue_addr; + + dqm->detect_hang_info[dqm->detect_hang_count] = hang_info; + dqm->detect_hang_count++; + } + } + + return dqm->detect_hang_count; +} + +static struct queue *find_queue_by_address(struct device_queue_manager *dqm, uint64_t queue_address) +{ + struct device_process_node *cur; + struct qcm_process_device *qpd; + struct queue *q; + + list_for_each_entry(cur, &dqm->queues, list) { + qpd = cur->qpd; + list_for_each_entry(q, &qpd->queues_list, list) { + if (queue_address == q->properties.queue_address) + return q; + } + } + + return NULL; +} + +static int reset_hung_queues(struct device_queue_manager *dqm) +{ + int r = 0, reset_count = 0, i; + + if (!dqm->detect_hang_info || dqm->is_hws_hang) + return -EIO; + + /* assume dqm locked. 
*/ + if (!detect_queue_hang(dqm)) + return -ENOTRECOVERABLE; + + for (i = 0; i < dqm->detect_hang_count; i++) { + struct dqm_detect_hang_info hang_info = dqm->detect_hang_info[i]; + struct queue *q = find_queue_by_address(dqm, hang_info.queue_address); + struct kfd_process_device *pdd; + uint64_t queue_addr = 0; + + if (!q) { + r = -ENOTRECOVERABLE; + goto reset_fail; + } + + pdd = kfd_get_process_device_data(dqm->dev, q->process); + if (!pdd) { + r = -ENOTRECOVERABLE; + goto reset_fail; + } + + queue_addr = dqm->dev->kfd2kgd->hqd_reset(dqm->dev->adev, + hang_info.pipe_id, hang_info.queue_id, hang_info.xcc_id, + KFD_UNMAP_LATENCY_MS); + + /* either reset failed or we reset an unexpected queue. */ + if (queue_addr != q->properties.queue_address) { + r = -ENOTRECOVERABLE; + goto reset_fail; + } + + set_queue_as_reset(dqm, q, &pdd->qpd); + reset_count++; + } + + if (reset_count == dqm->detect_hang_count) + kfd_signal_reset_event(dqm->dev); + else + r = -ENOTRECOVERABLE; + +reset_fail: + dqm->detect_hang_count = 0; + + return r; +} + +static bool sdma_has_hang(struct device_queue_manager *dqm) +{ + int engine_start = dqm->dev->node_id * get_num_all_sdma_engines(dqm); + int engine_end = engine_start + get_num_all_sdma_engines(dqm); + int num_queues_per_eng = dqm->dev->kfd->device_info.num_sdma_queues_per_engine; + int i, j; + + for (i = engine_start; i < engine_end; i++) { + for (j = 0; j < num_queues_per_eng; j++) { + if (!dqm->dev->kfd2kgd->hqd_sdma_get_doorbell(dqm->dev->adev, i, j)) + continue; + + return true; + } + } + + return false; +} + +static bool set_sdma_queue_as_reset(struct device_queue_manager *dqm, + uint32_t doorbell_off) +{ + struct device_process_node *cur; + struct qcm_process_device *qpd; + struct queue *q; + + list_for_each_entry(cur, &dqm->queues, list) { + qpd = cur->qpd; + list_for_each_entry(q, &qpd->queues_list, list) { + if ((q->properties.type == KFD_QUEUE_TYPE_SDMA || + q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) && + q->properties.doorbell_off == doorbell_off) { + set_queue_as_reset(dqm, q, qpd); + return true; + } + } + } + + return false; +} + +static int reset_hung_queues_sdma(struct device_queue_manager *dqm) +{ + int engine_start = dqm->dev->node_id * get_num_all_sdma_engines(dqm); + int engine_end = engine_start + get_num_all_sdma_engines(dqm); + int num_queues_per_eng = dqm->dev->kfd->device_info.num_sdma_queues_per_engine; + int r = 0, i, j; + + if (dqm->is_hws_hang) + return -EIO; + + /* Scan for hung HW queues and reset engine. */ + dqm->detect_hang_count = 0; + for (i = engine_start; i < engine_end; i++) { + for (j = 0; j < num_queues_per_eng; j++) { + uint32_t doorbell_off = + dqm->dev->kfd2kgd->hqd_sdma_get_doorbell(dqm->dev->adev, i, j); + + if (!doorbell_off) + continue; + + /* Reset engine and check. */ + if (amdgpu_sdma_reset_engine(dqm->dev->adev, i) || + dqm->dev->kfd2kgd->hqd_sdma_get_doorbell(dqm->dev->adev, i, j) || + !set_sdma_queue_as_reset(dqm, doorbell_off)) { + r = -ENOTRECOVERABLE; + goto reset_fail; + } + + /* Should only expect one queue active per engine */ + dqm->detect_hang_count++; + break; + } + } + + /* Signal process reset */ + if (dqm->detect_hang_count) + kfd_signal_reset_event(dqm->dev); + else + r = -ENOTRECOVERABLE; + +reset_fail: + dqm->detect_hang_count = 0; + + return r; +} + +static int reset_queues_on_hws_hang(struct device_queue_manager *dqm, bool is_sdma) +{ + while (halt_if_hws_hang) + schedule(); + + if (!amdgpu_gpu_recovery) + return -ENOTRECOVERABLE; + + return is_sdma ? 
reset_hung_queues_sdma(dqm) : reset_hung_queues(dqm); +} + +/* dqm->lock mutex has to be locked before calling this function + * + * @grace_period: If USE_DEFAULT_GRACE_PERIOD then default wait time + * for context switch latency. Lower values are used by debugger + * since context switching are triggered at high frequency. + * This is configured by setting CP_IQ_WAIT_TIME2.SCH_WAVE + * + */ static int unmap_queues_cpsch(struct device_queue_manager *dqm, enum kfd_unmap_queues_filter filter, uint32_t filter_param, @@ -1957,26 +2364,28 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, { struct device *dev = dqm->dev->adev->dev; struct mqd_manager *mqd_mgr; - int retval = 0; + int retval; if (!dqm->sched_running) return 0; - if (dqm->is_hws_hang || dqm->is_resetting) - return -EIO; if (!dqm->active_runlist) - return retval; + return 0; + if (!down_read_trylock(&dqm->dev->adev->reset_domain->sem)) + return -EIO; if (grace_period != USE_DEFAULT_GRACE_PERIOD) { - retval = pm_update_grace_period(&dqm->packet_mgr, grace_period); + retval = pm_config_dequeue_wait_counts(&dqm->packet_mgr, + KFD_DEQUEUE_WAIT_SET_SCH_WAVE, grace_period); if (retval) - return retval; + goto out; } retval = pm_send_unmap_queue(&dqm->packet_mgr, filter, filter_param, reset); if (retval) - return retval; + goto out; *dqm->fence_addr = KFD_FENCE_INIT; + mb(); pm_send_query_status(&dqm->packet_mgr, dqm->fence_gpu_addr, KFD_FENCE_COMPLETED); /* should be timed out */ @@ -1985,7 +2394,7 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, if (retval) { dev_err(dev, "The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n"); kfd_hws_hang(dqm); - return retval; + goto out; } /* In the current MEC firmware implementation, if compute queue @@ -1997,29 +2406,36 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, * check those fields */ mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]; - if (mqd_mgr->read_doorbell_id(dqm->packet_mgr.priv_queue->queue->mqd)) { - dev_err(dev, "HIQ MQD's queue_doorbell_id0 is not 0, Queue preemption time out\n"); - while (halt_if_hws_hang) - schedule(); - return -ETIME; - } + if (mqd_mgr->check_preemption_failed(mqd_mgr, dqm->packet_mgr.priv_queue->queue->mqd) && + reset_queues_on_hws_hang(dqm, false)) + goto reset_fail; + + /* Check for SDMA hang and attempt SDMA reset */ + if (sdma_has_hang(dqm) && reset_queues_on_hws_hang(dqm, true)) + goto reset_fail; /* We need to reset the grace period value for this device */ if (grace_period != USE_DEFAULT_GRACE_PERIOD) { - if (pm_update_grace_period(&dqm->packet_mgr, - USE_DEFAULT_GRACE_PERIOD)) + if (pm_config_dequeue_wait_counts(&dqm->packet_mgr, + KFD_DEQUEUE_WAIT_RESET, 0 /* unused */)) dev_err(dev, "Failed to reset grace period\n"); } pm_release_ib(&dqm->packet_mgr); dqm->active_runlist = false; - +out: + up_read(&dqm->dev->adev->reset_domain->sem); return retval; + +reset_fail: + dqm->is_hws_hang = true; + kfd_hws_hang(dqm); + up_read(&dqm->dev->adev->reset_domain->sem); + return -ETIME; } /* only for compute queue */ -static int reset_queues_cpsch(struct device_queue_manager *dqm, - uint16_t pasid) +static int reset_queues_cpsch(struct device_queue_manager *dqm, uint16_t pasid) { int retval; @@ -2040,13 +2456,13 @@ static int execute_queues_cpsch(struct device_queue_manager *dqm, { int retval; - if (dqm->is_hws_hang) + if (!down_read_trylock(&dqm->dev->adev->reset_domain->sem)) return -EIO; retval = unmap_queues_cpsch(dqm, filter, filter_param, grace_period, false); - if (retval) - 
return retval; - - return map_queues_cpsch(dqm); + if (!retval) + retval = map_queues_cpsch(dqm); + up_read(&dqm->dev->adev->reset_domain->sem); + return retval; } static int wait_on_destroy_queue(struct device_queue_manager *dqm, @@ -2056,6 +2472,9 @@ static int wait_on_destroy_queue(struct device_queue_manager *dqm, q->process); int ret = 0; + if (WARN_ON(!pdd)) + return ret; + if (pdd->qpd.is_debug) return ret; @@ -2125,10 +2544,9 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, pdd->sdma_past_activity_counter += sdma_val; } - list_del(&q->list); - qpd->queue_count--; if (q->properties.is_active) { decrement_queue_count(dqm, qpd, q); + q->properties.is_active = false; if (!dqm->dev->kfd->shared_resources.enable_mes) { retval = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, @@ -2139,6 +2557,8 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, retval = remove_queue_mes(dqm, q, qpd); } } + list_del(&q->list); + qpd->queue_count--; /* * Unconditionally decrement this counter, regardless of the queue's @@ -2168,20 +2588,13 @@ failed_try_destroy_debugged_queue: return retval; } -/* - * Low bits must be 0000/FFFF as required by HW, high bits must be 0 to - * stay in user mode. - */ -#define APE1_FIXED_BITS_MASK 0xFFFF80000000FFFFULL -/* APE1 limit is inclusive and 64K aligned. */ -#define APE1_LIMIT_ALIGNMENT 0xFFFF - static bool set_cache_memory_policy(struct device_queue_manager *dqm, struct qcm_process_device *qpd, enum cache_policy default_policy, enum cache_policy alternate_policy, void __user *alternate_aperture_base, - uint64_t alternate_aperture_size) + uint64_t alternate_aperture_size, + u32 misc_process_properties) { bool retval = true; @@ -2190,41 +2603,17 @@ static bool set_cache_memory_policy(struct device_queue_manager *dqm, dqm_lock(dqm); - if (alternate_aperture_size == 0) { - /* base > limit disables APE1 */ - qpd->sh_mem_ape1_base = 1; - qpd->sh_mem_ape1_limit = 0; - } else { - /* - * In FSA64, APE1_Base[63:0] = { 16{SH_MEM_APE1_BASE[31]}, - * SH_MEM_APE1_BASE[31:0], 0x0000 } - * APE1_Limit[63:0] = { 16{SH_MEM_APE1_LIMIT[31]}, - * SH_MEM_APE1_LIMIT[31:0], 0xFFFF } - * Verify that the base and size parameters can be - * represented in this format and convert them. - * Additionally restrict APE1 to user-mode addresses. 
- */ - - uint64_t base = (uintptr_t)alternate_aperture_base; - uint64_t limit = base + alternate_aperture_size - 1; - - if (limit <= base || (base & APE1_FIXED_BITS_MASK) != 0 || - (limit & APE1_FIXED_BITS_MASK) != APE1_LIMIT_ALIGNMENT) { - retval = false; - goto out; - } - - qpd->sh_mem_ape1_base = base >> 16; - qpd->sh_mem_ape1_limit = limit >> 16; - } - retval = dqm->asic_ops.set_cache_memory_policy( dqm, qpd, default_policy, alternate_policy, alternate_aperture_base, - alternate_aperture_size); + alternate_aperture_size, + misc_process_properties); + + if (retval) + goto out; if ((dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) && (qpd->vmid != 0)) program_sh_mem_settings(dqm, qpd); @@ -2427,10 +2816,12 @@ static int process_termination_cpsch(struct device_queue_manager *dqm, if (!dqm->dev->kfd->shared_resources.enable_mes) retval = execute_queues_cpsch(dqm, filter, 0, USE_DEFAULT_GRACE_PERIOD); - if ((!dqm->is_hws_hang) && (retval || qpd->reset_wavefronts)) { + if ((retval || qpd->reset_wavefronts) && + down_read_trylock(&dqm->dev->adev->reset_domain->sem)) { pr_warn("Resetting wave fronts (cpsch) on dev %p\n", dqm->dev); dbgdev_wave_reset_wavefronts(dqm->dev, qpd->pqm->process); qpd->reset_wavefronts = false; + up_read(&dqm->dev->adev->reset_domain->sem); } /* Lastly, free mqd resources. @@ -2537,7 +2928,8 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_node *dev) dqm->ops.initialize = initialize_cpsch; dqm->ops.start = start_cpsch; dqm->ops.stop = stop_cpsch; - dqm->ops.pre_reset = pre_reset; + dqm->ops.halt = halt_cpsch; + dqm->ops.unhalt = unhalt_cpsch; dqm->ops.destroy_queue = destroy_queue_cpsch; dqm->ops.update_queue = update_queue; dqm->ops.register_process = register_process; @@ -2558,7 +2950,6 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_node *dev) /* initialize dqm for no cp scheduling */ dqm->ops.start = start_nocpsch; dqm->ops.stop = stop_nocpsch; - dqm->ops.pre_reset = pre_reset; dqm->ops.create_queue = create_queue_nocpsch; dqm->ops.destroy_queue = destroy_queue_nocpsch; dqm->ops.update_queue = update_queue; @@ -2597,7 +2988,9 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_node *dev) break; default: - if (KFD_GC_VERSION(dev) >= IP_VERSION(11, 0, 0)) + if (KFD_GC_VERSION(dev) >= IP_VERSION(12, 0, 0)) + device_queue_manager_init_v12(&dqm->asic_ops); + else if (KFD_GC_VERSION(dev) >= IP_VERSION(11, 0, 0)) device_queue_manager_init_v11(&dqm->asic_ops); else if (KFD_GC_VERSION(dev) >= IP_VERSION(10, 1, 1)) device_queue_manager_init_v10(&dqm->asic_ops); @@ -2633,7 +3026,7 @@ static void deallocate_hiq_sdma_mqd(struct kfd_node *dev, { WARN(!mqd, "No hiq sdma mqd trunk to free"); - amdgpu_amdkfd_free_gtt_mem(dev->adev, mqd->gtt_mem); + amdgpu_amdkfd_free_gtt_mem(dev->adev, &mqd->gtt_mem); } void device_queue_manager_uninit(struct device_queue_manager *dqm) @@ -2645,28 +3038,112 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm) kfree(dqm); } -int kfd_dqm_evict_pasid(struct device_queue_manager *dqm, u32 pasid) +int kfd_dqm_suspend_bad_queue_mes(struct kfd_node *knode, u32 pasid, u32 doorbell_id) { - struct kfd_process_device *pdd; - struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + struct kfd_process_device *pdd = NULL; + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, &pdd); + struct device_queue_manager *dqm = knode->dqm; + struct device *dev = dqm->dev->adev->dev; + struct qcm_process_device *qpd; + struct queue *q = NULL; int ret = 0; - if (!p) + if (!pdd) return 
-EINVAL; - WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid); - pdd = kfd_get_process_device_data(dqm->dev, p); - if (pdd) - ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd); + + dqm_lock(dqm); + + if (pdd) { + qpd = &pdd->qpd; + + list_for_each_entry(q, &qpd->queues_list, list) { + if (q->doorbell_id == doorbell_id && q->properties.is_active) { + ret = suspend_all_queues_mes(dqm); + if (ret) { + dev_err(dev, "Suspending all queues failed"); + goto out; + } + + q->properties.is_evicted = true; + q->properties.is_active = false; + decrement_queue_count(dqm, qpd, q); + + ret = remove_queue_mes(dqm, q, qpd); + if (ret) { + dev_err(dev, "Removing bad queue failed"); + goto out; + } + + ret = resume_all_queues_mes(dqm); + if (ret) + dev_err(dev, "Resuming all queues failed"); + + break; + } + } + } + +out: + dqm_unlock(dqm); kfd_unref_process(p); + return ret; +} +static int kfd_dqm_evict_pasid_mes(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + struct device *dev = dqm->dev->adev->dev; + int ret = 0; + + /* Check if process is already evicted */ + dqm_lock(dqm); + if (qpd->evicted) { + /* Increment the evicted count to make sure the + * process stays evicted until it is terminated. + */ + qpd->evicted++; + dqm_unlock(dqm); + goto out; + } + dqm_unlock(dqm); + + ret = suspend_all_queues_mes(dqm); + if (ret) { + dev_err(dev, "Suspending all queues failed"); + goto out; + } + + ret = dqm->ops.evict_process_queues(dqm, qpd); + if (ret) { + dev_err(dev, "Evicting process queues failed"); + goto out; + } + + ret = resume_all_queues_mes(dqm); + if (ret) + dev_err(dev, "Resuming all queues failed"); + +out: return ret; } -static void kfd_process_hw_exception(struct work_struct *work) +int kfd_evict_process_device(struct kfd_process_device *pdd) { - struct device_queue_manager *dqm = container_of(work, - struct device_queue_manager, hw_exception_work); - amdgpu_amdkfd_gpu_reset(dqm->dev->adev); + struct device_queue_manager *dqm; + struct kfd_process *p; + int ret = 0; + + p = pdd->process; + dqm = pdd->dev->dqm; + + WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid); + + if (dqm->dev->kfd->shared_resources.enable_mes) + ret = kfd_dqm_evict_pasid_mes(dqm, &pdd->qpd); + else + ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd); + + return ret; } int reserve_debug_trap_vmid(struct device_queue_manager *dqm, @@ -2792,7 +3269,7 @@ struct copy_context_work_handler_workarea { struct kfd_process *p; }; -static void copy_context_work_handler (struct work_struct *work) +static void copy_context_work_handler(struct work_struct *work) { struct copy_context_work_handler_workarea *workarea; struct mqd_manager *mqd_mgr; @@ -2819,6 +3296,9 @@ static void copy_context_work_handler (struct work_struct *work) struct qcm_process_device *qpd = &pdd->qpd; list_for_each_entry(q, &qpd->queues_list, list) { + if (q->properties.type != KFD_QUEUE_TYPE_COMPUTE) + continue; + mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_CP]; /* We ignore the return value from get_wave_state @@ -3159,6 +3639,30 @@ int debug_refresh_runlist(struct device_queue_manager *dqm) return debug_map_and_unlock(dqm); } +bool kfd_dqm_is_queue_in_process(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + int doorbell_off, u32 *queue_format) +{ + struct queue *q; + bool r = false; + + if (!queue_format) + return r; + + dqm_lock(dqm); + + list_for_each_entry(q, &qpd->queues_list, list) { + if (q->properties.doorbell_off == doorbell_off) { + *queue_format = q->properties.format; + r = true; + goto out;
+ } + } + +out: + dqm_unlock(dqm); + return r; +} #if defined(CONFIG_DEBUG_FS) static void seq_reg_dump(struct seq_file *m, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h index cf7e182588f8..74a61b5b2f0b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h @@ -37,7 +37,6 @@ #define KFD_MES_PROCESS_QUANTUM 100000 #define KFD_MES_GANG_QUANTUM 10000 -#define USE_DEFAULT_GRACE_PERIOD 0xffffffff struct device_process_node { struct qcm_process_device *qpd; @@ -106,6 +105,12 @@ union GRBM_GFX_INDEX_BITS { * @uninitialize: Destroys all the device queue manager resources allocated in * initialize routine. * + * @halt: This routine unmaps queues from the runlist and sets the halt status + * to true, so no more queues will be mapped to the runlist until unhalt. + * + * @unhalt: This routine sets the halt status back to false and maps queues + * back to the runlist. + * * @create_kernel_queue: Creates kernel queue. Used for debug queue. * * @destroy_kernel_queue: Destroys kernel queue. Used for debug queue. @@ -152,8 +157,9 @@ struct device_queue_manager_ops { int (*initialize)(struct device_queue_manager *dqm); int (*start)(struct device_queue_manager *dqm); int (*stop)(struct device_queue_manager *dqm); - void (*pre_reset)(struct device_queue_manager *dqm); void (*uninitialize)(struct device_queue_manager *dqm); + int (*halt)(struct device_queue_manager *dqm); + int (*unhalt)(struct device_queue_manager *dqm); int (*create_kernel_queue)(struct device_queue_manager *dqm, struct kernel_queue *kq, struct qcm_process_device *qpd); @@ -167,7 +173,8 @@ struct device_queue_manager_ops { enum cache_policy default_policy, enum cache_policy alternate_policy, void __user *alternate_aperture_base, - uint64_t alternate_aperture_size); + uint64_t alternate_aperture_size, + u32 misc_process_properties); int (*process_termination)(struct device_queue_manager *dqm, struct qcm_process_device *qpd); @@ -203,7 +210,8 @@ struct device_queue_manager_asic_ops { enum cache_policy default_policy, enum cache_policy alternate_policy, void __user *alternate_aperture_base, - uint64_t alternate_aperture_size); + uint64_t alternate_aperture_size, + u32 misc_process_properties); void (*init_sdma_vm)(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd); @@ -211,6 +219,13 @@ struct device_queue_manager_asic_ops { struct kfd_node *dev); }; +struct dqm_detect_hang_info { + int pipe_id; + int queue_id; + int xcc_id; + uint64_t queue_address; +}; + /** * struct device_queue_manager * @@ -255,9 +270,9 @@ struct device_queue_manager { /* hw exception */ bool is_hws_hang; bool is_resetting; - struct work_struct hw_exception_work; struct kfd_mem_obj hiq_sdma_mqd; bool sched_running; + bool sched_halt; /* used for GFX 9.4.3 only */ uint32_t current_logical_xcc_start; @@ -265,6 +280,11 @@ struct device_queue_manager { uint32_t wait_times; wait_queue_head_t destroy_wait; + + /* for per-queue reset support */ + struct dqm_detect_hang_info *detect_hang_info; + size_t detect_hang_info_size; + int detect_hang_count; }; void device_queue_manager_init_cik( @@ -277,6 +297,8 @@ void device_queue_manager_init_v10( struct device_queue_manager_asic_ops *asic_ops); void device_queue_manager_init_v11( struct device_queue_manager_asic_ops *asic_ops); +void device_queue_manager_init_v12( + struct device_queue_manager_asic_ops *asic_ops); void program_sh_mem_settings(struct device_queue_manager *dqm, struct
qcm_process_device *qpd); unsigned int get_cp_queues_num(struct device_queue_manager *dqm); @@ -302,6 +324,9 @@ void set_queue_snapshot_entry(struct queue *q, int debug_lock_and_unmap(struct device_queue_manager *dqm); int debug_map_and_unlock(struct device_queue_manager *dqm); int debug_refresh_runlist(struct device_queue_manager *dqm); +bool kfd_dqm_is_queue_in_process(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + int doorbell_off, u32 *queue_format); static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd) { @@ -334,4 +359,14 @@ static inline int read_sdma_queue_counter(uint64_t __user *q_rptr, uint64_t *val /* SDMA activity counter is stored at queue's RPTR + 0x8 location. */ return get_user(*val, q_rptr + 1); } + +static inline void update_dqm_wait_times(struct device_queue_manager *dqm) +{ + if (dqm->dev->kfd2kgd->get_iq_wait_times) + dqm->dev->kfd2kgd->get_iq_wait_times(dqm->dev->adev, + &dqm->wait_times, + ffs(dqm->dev->xcc_mask) - 1); +} + + #endif /* KFD_DEVICE_QUEUE_MANAGER_H_ */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c index d4d95c7f2e5d..0508ef5a41d7 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c @@ -27,12 +27,21 @@ #include "oss/oss_2_4_sh_mask.h" #include "gca/gfx_7_2_sh_mask.h" +/* + * Low bits must be 0000/FFFF as required by HW, high bits must be 0 to + * stay in user mode. + */ +#define APE1_FIXED_BITS_MASK 0xFFFF80000000FFFFULL +/* APE1 limit is inclusive and 64K aligned. */ +#define APE1_LIMIT_ALIGNMENT 0xFFFF + static bool set_cache_memory_policy_cik(struct device_queue_manager *dqm, struct qcm_process_device *qpd, enum cache_policy default_policy, enum cache_policy alternate_policy, void __user *alternate_aperture_base, - uint64_t alternate_aperture_size); + uint64_t alternate_aperture_size, + u32 misc_process_properties); static int update_qpd_cik(struct device_queue_manager *dqm, struct qcm_process_device *qpd); static void init_sdma_vm(struct device_queue_manager *dqm, @@ -80,10 +89,41 @@ static bool set_cache_memory_policy_cik(struct device_queue_manager *dqm, enum cache_policy default_policy, enum cache_policy alternate_policy, void __user *alternate_aperture_base, - uint64_t alternate_aperture_size) + uint64_t alternate_aperture_size, + u32 misc_process_properties) { uint32_t default_mtype; uint32_t ape1_mtype; + unsigned int temp; + bool retval = true; + + if (alternate_aperture_size == 0) { + /* base > limit disables APE1 */ + qpd->sh_mem_ape1_base = 1; + qpd->sh_mem_ape1_limit = 0; + } else { + /* + * In FSA64, APE1_Base[63:0] = { 16{SH_MEM_APE1_BASE[31]}, + * SH_MEM_APE1_BASE[31:0], 0x0000 } + * APE1_Limit[63:0] = { 16{SH_MEM_APE1_LIMIT[31]}, + * SH_MEM_APE1_LIMIT[31:0], 0xFFFF } + * Verify that the base and size parameters can be + * represented in this format and convert them. + * Additionally restrict APE1 to user-mode addresses. + */ + + uint64_t base = (uintptr_t)alternate_aperture_base; + uint64_t limit = base + alternate_aperture_size - 1; + + if (limit <= base || (base & APE1_FIXED_BITS_MASK) != 0 || + (limit & APE1_FIXED_BITS_MASK) != APE1_LIMIT_ALIGNMENT) { + retval = false; + goto out; + } + + qpd->sh_mem_ape1_base = base >> 16; + qpd->sh_mem_ape1_limit = limit >> 16; + } default_mtype = (default_policy == cache_policy_coherent) ? 
MTYPE_NONCACHED : @@ -97,37 +137,22 @@ static bool set_cache_memory_policy_cik(struct device_queue_manager *dqm, | ALIGNMENT_MODE(SH_MEM_ALIGNMENT_MODE_UNALIGNED) | DEFAULT_MTYPE(default_mtype) | APE1_MTYPE(ape1_mtype); - - return true; -} - -static int update_qpd_cik(struct device_queue_manager *dqm, - struct qcm_process_device *qpd) -{ - struct kfd_process_device *pdd; - unsigned int temp; - - pdd = qpd_to_pdd(qpd); - - /* check if sh_mem_config register already configured */ - if (qpd->sh_mem_config == 0) { - qpd->sh_mem_config = - ALIGNMENT_MODE(SH_MEM_ALIGNMENT_MODE_UNALIGNED) | - DEFAULT_MTYPE(MTYPE_NONCACHED) | - APE1_MTYPE(MTYPE_NONCACHED); - qpd->sh_mem_ape1_limit = 0; - qpd->sh_mem_ape1_base = 0; - } - /* On dGPU we're always in GPUVM64 addressing mode with 64-bit * aperture addresses. */ - temp = get_sh_mem_bases_nybble_64(pdd); + temp = get_sh_mem_bases_nybble_64(qpd_to_pdd(qpd)); qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp); pr_debug("is32bit process: %d sh_mem_bases nybble: 0x%X and register 0x%X\n", qpd->pqm->process->is_32bit_user_mode, temp, qpd->sh_mem_bases); +out: + return retval; +} + +static int update_qpd_cik(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ return 0; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v10.c index 245a90dfc2f6..ba6e3d747ccd 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v10.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v10.c @@ -31,10 +31,18 @@ static int update_qpd_v10(struct device_queue_manager *dqm, struct qcm_process_device *qpd); static void init_sdma_vm_v10(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd); +static bool set_cache_memory_policy_v10(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + enum cache_policy default_policy, + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size, + u32 misc_process_properties); void device_queue_manager_init_v10( struct device_queue_manager_asic_ops *asic_ops) { + asic_ops->set_cache_memory_policy = set_cache_memory_policy_v10; asic_ops->update_qpd = update_qpd_v10; asic_ops->init_sdma_vm = init_sdma_vm_v10; asic_ops->mqd_manager_init = mqd_manager_init_v10; @@ -49,27 +57,28 @@ static uint32_t compute_sh_mem_bases_64bit(struct kfd_process_device *pdd) private_base; } -static int update_qpd_v10(struct device_queue_manager *dqm, - struct qcm_process_device *qpd) +static bool set_cache_memory_policy_v10(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + enum cache_policy default_policy, + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size, + u32 misc_process_properties) { - struct kfd_process_device *pdd; - - pdd = qpd_to_pdd(qpd); - - /* check if sh_mem_config register already configured */ - if (qpd->sh_mem_config == 0) { - qpd->sh_mem_config = - (SH_MEM_ALIGNMENT_MODE_UNALIGNED << - SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT) | - (3 << SH_MEM_CONFIG__INITIAL_INST_PREFETCH__SHIFT); - qpd->sh_mem_ape1_limit = 0; - qpd->sh_mem_ape1_base = 0; - } - - qpd->sh_mem_bases = compute_sh_mem_bases_64bit(pdd); + qpd->sh_mem_config = (SH_MEM_ALIGNMENT_MODE_UNALIGNED << + SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT) | + (3 << SH_MEM_CONFIG__INITIAL_INST_PREFETCH__SHIFT); + qpd->sh_mem_ape1_limit = 0; + qpd->sh_mem_ape1_base = 0; + qpd->sh_mem_bases = compute_sh_mem_bases_64bit(qpd_to_pdd(qpd)); 
pr_debug("sh_mem_bases 0x%X\n", qpd->sh_mem_bases); + return true; +} +static int update_qpd_v10(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ return 0; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v11.c index 2e129da7acb4..8b447d04558f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v11.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v11.c @@ -30,10 +30,18 @@ static int update_qpd_v11(struct device_queue_manager *dqm, struct qcm_process_device *qpd); static void init_sdma_vm_v11(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd); +static bool set_cache_memory_policy_v11(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + enum cache_policy default_policy, + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size, + u32 misc_process_properties); void device_queue_manager_init_v11( struct device_queue_manager_asic_ops *asic_ops) { + asic_ops->set_cache_memory_policy = set_cache_memory_policy_v11; asic_ops->update_qpd = update_qpd_v11; asic_ops->init_sdma_vm = init_sdma_vm_v11; asic_ops->mqd_manager_init = mqd_manager_init_v11; @@ -48,28 +56,29 @@ static uint32_t compute_sh_mem_bases_64bit(struct kfd_process_device *pdd) private_base; } -static int update_qpd_v11(struct device_queue_manager *dqm, - struct qcm_process_device *qpd) +static bool set_cache_memory_policy_v11(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + enum cache_policy default_policy, + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size, + u32 misc_process_properties) { - struct kfd_process_device *pdd; - - pdd = qpd_to_pdd(qpd); - - /* check if sh_mem_config register already configured */ - if (qpd->sh_mem_config == 0) { - qpd->sh_mem_config = - (SH_MEM_ALIGNMENT_MODE_UNALIGNED << - SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT) | - (3 << SH_MEM_CONFIG__INITIAL_INST_PREFETCH__SHIFT); - - qpd->sh_mem_ape1_limit = 0; - qpd->sh_mem_ape1_base = 0; - } + qpd->sh_mem_config = (SH_MEM_ALIGNMENT_MODE_UNALIGNED << + SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT) | + (3 << SH_MEM_CONFIG__INITIAL_INST_PREFETCH__SHIFT); - qpd->sh_mem_bases = compute_sh_mem_bases_64bit(pdd); + qpd->sh_mem_ape1_limit = 0; + qpd->sh_mem_ape1_base = 0; + qpd->sh_mem_bases = compute_sh_mem_bases_64bit(qpd_to_pdd(qpd)); pr_debug("sh_mem_bases 0x%X\n", qpd->sh_mem_bases); + return true; +} +static int update_qpd_v11(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ return 0; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v12.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v12.c new file mode 100644 index 000000000000..3550da3a46f9 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v12.c @@ -0,0 +1,90 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT +/* + * Copyright 2023 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include "kfd_device_queue_manager.h" +#include "gc/gc_12_0_0_sh_mask.h" +#include "soc24_enum.h" + +static int update_qpd_v12(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); +static void init_sdma_vm_v12(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd); +static bool set_cache_memory_policy_v12(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + enum cache_policy default_policy, + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size, + u32 misc_process_properties); + +void device_queue_manager_init_v12( + struct device_queue_manager_asic_ops *asic_ops) +{ + asic_ops->set_cache_memory_policy = set_cache_memory_policy_v12; + asic_ops->update_qpd = update_qpd_v12; + asic_ops->init_sdma_vm = init_sdma_vm_v12; + asic_ops->mqd_manager_init = mqd_manager_init_v12; +} + +static uint32_t compute_sh_mem_bases_64bit(struct kfd_process_device *pdd) +{ + uint32_t shared_base = pdd->lds_base >> 48; + uint32_t private_base = pdd->scratch_base >> 48; + + return (shared_base << SH_MEM_BASES__SHARED_BASE__SHIFT) | + private_base; +} + +static bool set_cache_memory_policy_v12(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + enum cache_policy default_policy, + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size, + u32 misc_process_properties) +{ + qpd->sh_mem_config = (SH_MEM_ALIGNMENT_MODE_UNALIGNED << + SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT) | + (3 << SH_MEM_CONFIG__INITIAL_INST_PREFETCH__SHIFT); + + qpd->sh_mem_ape1_limit = 0; + qpd->sh_mem_ape1_base = 0; + qpd->sh_mem_bases = compute_sh_mem_bases_64bit(qpd_to_pdd(qpd)); + + pr_debug("sh_mem_bases 0x%X\n", qpd->sh_mem_bases); + return true; +} + +static int update_qpd_v12(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + return 0; +} + +static void init_sdma_vm_v12(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd) +{ + /* Not needed on SDMAv4 onwards any more */ + q->properties.sdma_vm_addr = 0; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c index 54eb1bff903c..9fcc8c6e57b7 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c @@ -30,10 +30,18 
@@ static int update_qpd_v9(struct device_queue_manager *dqm, struct qcm_process_device *qpd); static void init_sdma_vm_v9(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd); +static bool set_cache_memory_policy_v9(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + enum cache_policy default_policy, + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size, + u32 misc_process_properties); void device_queue_manager_init_v9( struct device_queue_manager_asic_ops *asic_ops) { + asic_ops->set_cache_memory_policy = set_cache_memory_policy_v9; asic_ops->update_qpd = update_qpd_v9; asic_ops->init_sdma_vm = init_sdma_vm_v9; asic_ops->mqd_manager_init = mqd_manager_init_v9; @@ -48,10 +56,42 @@ static uint32_t compute_sh_mem_bases_64bit(struct kfd_process_device *pdd) private_base; } +static bool set_cache_memory_policy_v9(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + enum cache_policy default_policy, + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size, + u32 misc_process_properties) +{ + qpd->sh_mem_config = SH_MEM_ALIGNMENT_MODE_UNALIGNED << + SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT; + + if (dqm->dev->kfd->noretry) + qpd->sh_mem_config |= 1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT; + + if (KFD_GC_VERSION(dqm->dev->kfd) == IP_VERSION(9, 4, 3) || + KFD_GC_VERSION(dqm->dev->kfd) == IP_VERSION(9, 4, 4)) + qpd->sh_mem_config |= (1 << SH_MEM_CONFIG__F8_MODE__SHIFT); + + if (KFD_GC_VERSION(dqm->dev->kfd) == IP_VERSION(9, 5, 0)) { + if (misc_process_properties & KFD_PROC_FLAG_MFMA_HIGH_PRECISION) + qpd->sh_mem_config |= 1 << SH_MEM_CONFIG__PRECISION_MODE__SHIFT; + } + + qpd->sh_mem_ape1_limit = 0; + qpd->sh_mem_ape1_base = 0; + qpd->sh_mem_bases = compute_sh_mem_bases_64bit(qpd_to_pdd(qpd)); + + pr_debug("sh_mem_bases 0x%X sh_mem_config 0x%X\n", qpd->sh_mem_bases, + qpd->sh_mem_config); + return true; +} + static int update_qpd_v9(struct device_queue_manager *dqm, struct qcm_process_device *qpd) { - struct kfd_process_device *pdd; + struct kfd_process_device *pdd = qpd_to_pdd(qpd); pdd = qpd_to_pdd(qpd); @@ -63,7 +103,8 @@ static int update_qpd_v9(struct device_queue_manager *dqm, if (dqm->dev->kfd->noretry) qpd->sh_mem_config |= 1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT; - if (KFD_GC_VERSION(dqm->dev->kfd) == IP_VERSION(9, 4, 3)) + if (KFD_GC_VERSION(dqm->dev->kfd) == IP_VERSION(9, 4, 3) || + KFD_GC_VERSION(dqm->dev->kfd) == IP_VERSION(9, 4, 4)) qpd->sh_mem_config |= (1 << SH_MEM_CONFIG__F8_MODE__SHIFT); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c index b291ee0fab94..dad83356e976 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c @@ -27,12 +27,21 @@ #include "gca/gfx_8_0_sh_mask.h" #include "oss/oss_3_0_sh_mask.h" +/* + * Low bits must be 0000/FFFF as required by HW, high bits must be 0 to + * stay in user mode. + */ +#define APE1_FIXED_BITS_MASK 0xFFFF80000000FFFFULL +/* APE1 limit is inclusive and 64K aligned. 
*/ +#define APE1_LIMIT_ALIGNMENT 0xFFFF + static bool set_cache_memory_policy_vi(struct device_queue_manager *dqm, struct qcm_process_device *qpd, enum cache_policy default_policy, enum cache_policy alternate_policy, void __user *alternate_aperture_base, - uint64_t alternate_aperture_size); + uint64_t alternate_aperture_size, + u32 misc_process_properties); static int update_qpd_vi(struct device_queue_manager *dqm, struct qcm_process_device *qpd); static void init_sdma_vm(struct device_queue_manager *dqm, @@ -81,10 +90,41 @@ static bool set_cache_memory_policy_vi(struct device_queue_manager *dqm, enum cache_policy default_policy, enum cache_policy alternate_policy, void __user *alternate_aperture_base, - uint64_t alternate_aperture_size) + uint64_t alternate_aperture_size, + u32 misc_process_properties) { uint32_t default_mtype; uint32_t ape1_mtype; + unsigned int temp; + bool retval = true; + + if (alternate_aperture_size == 0) { + /* base > limit disables APE1 */ + qpd->sh_mem_ape1_base = 1; + qpd->sh_mem_ape1_limit = 0; + } else { + /* + * In FSA64, APE1_Base[63:0] = { 16{SH_MEM_APE1_BASE[31]}, + * SH_MEM_APE1_BASE[31:0], 0x0000 } + * APE1_Limit[63:0] = { 16{SH_MEM_APE1_LIMIT[31]}, + * SH_MEM_APE1_LIMIT[31:0], 0xFFFF } + * Verify that the base and size parameters can be + * represented in this format and convert them. + * Additionally restrict APE1 to user-mode addresses. + */ + + uint64_t base = (uintptr_t)alternate_aperture_base; + uint64_t limit = base + alternate_aperture_size - 1; + + if (limit <= base || (base & APE1_FIXED_BITS_MASK) != 0 || + (limit & APE1_FIXED_BITS_MASK) != APE1_LIMIT_ALIGNMENT) { + retval = false; + goto out; + } + + qpd->sh_mem_ape1_base = base >> 16; + qpd->sh_mem_ape1_limit = limit >> 16; + } default_mtype = (default_policy == cache_policy_coherent) ? MTYPE_UC : @@ -100,40 +140,21 @@ static bool set_cache_memory_policy_vi(struct device_queue_manager *dqm, default_mtype << SH_MEM_CONFIG__DEFAULT_MTYPE__SHIFT | ape1_mtype << SH_MEM_CONFIG__APE1_MTYPE__SHIFT; - return true; -} - -static int update_qpd_vi(struct device_queue_manager *dqm, - struct qcm_process_device *qpd) -{ - struct kfd_process_device *pdd; - unsigned int temp; - - pdd = qpd_to_pdd(qpd); - - /* check if sh_mem_config register already configured */ - if (qpd->sh_mem_config == 0) { - qpd->sh_mem_config = - SH_MEM_ALIGNMENT_MODE_UNALIGNED << - SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT | - MTYPE_UC << - SH_MEM_CONFIG__DEFAULT_MTYPE__SHIFT | - MTYPE_UC << - SH_MEM_CONFIG__APE1_MTYPE__SHIFT; - - qpd->sh_mem_ape1_limit = 0; - qpd->sh_mem_ape1_base = 0; - } - /* On dGPU we're always in GPUVM64 addressing mode with 64-bit * aperture addresses. 
*/ - temp = get_sh_mem_bases_nybble_64(pdd); + temp = get_sh_mem_bases_nybble_64(qpd_to_pdd(qpd)); qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp); pr_debug("sh_mem_bases nybble: 0x%X and register 0x%X\n", temp, qpd->sh_mem_bases); +out: + return retval; +} +static int update_qpd_vi(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ return 0; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index 9b33d9d2c9ad..2b294ada3ec0 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -31,6 +31,7 @@ #include <linux/memory.h> #include "kfd_priv.h" #include "kfd_events.h" +#include "kfd_device_queue_manager.h" #include <linux/device.h> /* @@ -726,7 +727,7 @@ void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id, * to process context, kfd_process could attempt to exit while we are * running so the lookup function increments the process ref count. */ - struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL); if (!p) return; /* Presumably process exited. */ @@ -747,6 +748,16 @@ void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id, uint64_t *slots = page_slots(p->signal_page); uint32_t id; + /* + * If the id is valid but the slot is not signaled, the GPU may have signaled the + * same event twice before the driver had a chance to process the first interrupt; + * the signal slot is auto-reset after set_event wakes up user space, so drop the + * second event since the application only needs one wakeup. + */ + if ((valid_id_bits > 31 || (1U << valid_id_bits) >= KFD_SIGNAL_EVENT_LIMIT) && + partial_id < KFD_SIGNAL_EVENT_LIMIT && slots[partial_id] == UNSIGNALED_EVENT_SLOT) + goto out_unlock; + if (valid_id_bits) pr_debug_ratelimited("Partial ID invalid: %u (%u valid bits)\n", partial_id, valid_id_bits); @@ -775,6 +786,7 @@ void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id, } } +out_unlock: rcu_read_unlock(); kfd_unref_process(p); } @@ -1127,8 +1139,8 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p, if (type == KFD_EVENT_TYPE_MEMORY) { dev_warn(kfd_device, - "Sending SIGSEGV to process %d (pasid 0x%x)", - p->lead_thread->pid, p->pasid); + "Sending SIGSEGV to process pid %d", + p->lead_thread->pid); send_sig(SIGSEGV, p->lead_thread, 0); } @@ -1136,13 +1148,13 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p, if (send_signal) { if (send_sigterm) { dev_warn(kfd_device, - "Sending SIGTERM to process %d (pasid 0x%x)", - p->lead_thread->pid, p->pasid); + "Sending SIGTERM to process pid %d", + p->lead_thread->pid); send_sig(SIGTERM, p->lead_thread, 0); } else { dev_err(kfd_device, - "Process %d (pasid 0x%x) got unhandled exception", - p->lead_thread->pid, p->pasid); + "Process pid %d got unhandled exception", + p->lead_thread->pid); } } @@ -1156,7 +1168,7 @@ void kfd_signal_hw_exception_event(u32 pasid) * to process context, kfd_process could attempt to exit while we are * running so the lookup function increments the process ref count. */ - struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL); if (!p) return; /* Presumably process exited.
*/ @@ -1165,22 +1177,39 @@ void kfd_signal_hw_exception_event(u32 pasid) kfd_unref_process(p); } -void kfd_signal_vm_fault_event(struct kfd_node *dev, u32 pasid, +void kfd_signal_vm_fault_event_with_userptr(struct kfd_process *p, uint64_t gpu_va) +{ + struct kfd_process_device *pdd; + struct kfd_hsa_memory_exception_data exception_data; + int i; + + memset(&exception_data, 0, sizeof(exception_data)); + exception_data.va = gpu_va; + exception_data.failure.NotPresent = 1; + + /* Send a VM segfault to every KFD process device */ + for (i = 0; i < p->n_pdds; i++) { + pdd = p->pdds[i]; + exception_data.gpu_id = pdd->user_gpu_id; + kfd_evict_process_device(pdd); + kfd_signal_vm_fault_event(pdd, NULL, &exception_data); + } +} + +void kfd_signal_vm_fault_event(struct kfd_process_device *pdd, struct kfd_vm_fault_info *info, struct kfd_hsa_memory_exception_data *data) { struct kfd_event *ev; uint32_t id; - struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + struct kfd_process *p = pdd->process; struct kfd_hsa_memory_exception_data memory_exception_data; int user_gpu_id; - if (!p) - return; /* Presumably process exited. */ - - user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id); + user_gpu_id = kfd_process_get_user_gpu_id(p, pdd->dev->id); if (unlikely(user_gpu_id == -EINVAL)) { - WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id); + WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", + pdd->dev->id); return; } @@ -1217,7 +1246,6 @@ void kfd_signal_vm_fault_event(struct kfd_node *dev, u32 pasid, } rcu_read_unlock(); - kfd_unref_process(p); } void kfd_signal_reset_event(struct kfd_node *dev) @@ -1244,12 +1272,41 @@ void kfd_signal_reset_event(struct kfd_node *dev) idx = srcu_read_lock(&kfd_processes_srcu); hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { int user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id); + struct kfd_process_device *pdd = kfd_get_process_device_data(dev, p); if (unlikely(user_gpu_id == -EINVAL)) { WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id); continue; } + if (unlikely(!pdd)) { + WARN_ONCE(1, "Could not get device data from process pid:%d\n", + p->lead_thread->pid); + continue; + } + + if (dev->dqm->detect_hang_count && !pdd->has_reset_queue) + continue; + + if (dev->dqm->detect_hang_count) { + struct amdgpu_task_info *ti; + struct amdgpu_fpriv *drv_priv; + + if (unlikely(amdgpu_file_to_fpriv(pdd->drm_file, &drv_priv))) { + WARN_ONCE(1, "Could not get vm for device %x from pid:%d\n", + dev->id, p->lead_thread->pid); + continue; + } + + ti = amdgpu_vm_get_task_info_vm(&drv_priv->vm); + if (ti) { + dev_err(dev->adev->dev, + "Queues reset on process %s tid %d thread %s pid %d\n", + ti->process_name, ti->tgid, ti->task_name, ti->pid); + amdgpu_vm_put_task_info(ti); + } + } + rcu_read_lock(); id = KFD_FIRST_NONSIGNAL_EVENT_ID; @@ -1278,7 +1335,7 @@ void kfd_signal_reset_event(struct kfd_node *dev) void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid) { - struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL); struct kfd_hsa_memory_exception_data memory_exception_data; struct kfd_hsa_hw_exception_data hw_exception_data; struct kfd_event *ev; @@ -1293,6 +1350,7 @@ void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid) user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id); if (unlikely(user_gpu_id == -EINVAL)) { WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id); + kfd_unref_process(p);
return; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c index 4a64307bc438..dbcb60eb54b2 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c @@ -380,7 +380,8 @@ int kfd_init_apertures(struct kfd_process *process) pdd = kfd_create_process_device_data(dev, process); if (!pdd) { - pr_err("Failed to create process device data\n"); + dev_err(dev->adev->dev, + "Failed to create process device data\n"); return -ENOMEM; } /* diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c index 9a06c6fb6605..3e1ad8974797 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c @@ -129,60 +129,6 @@ enum SQ_INTERRUPT_ERROR_TYPE { KFD_DEBUG_CP_BAD_OP_ECODE_MASK) \ >> KFD_DEBUG_CP_BAD_OP_ECODE_SHIFT) -static void event_interrupt_poison_consumption(struct kfd_node *dev, - uint16_t pasid, uint16_t client_id) -{ - enum amdgpu_ras_block block = 0; - int old_poison, ret = -EINVAL; - struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); - - if (!p) - return; - - /* all queues of a process will be unmapped in one time */ - old_poison = atomic_cmpxchg(&p->poison, 0, 1); - kfd_unref_process(p); - if (old_poison) - return; - - switch (client_id) { - case SOC15_IH_CLIENTID_SE0SH: - case SOC15_IH_CLIENTID_SE1SH: - case SOC15_IH_CLIENTID_SE2SH: - case SOC15_IH_CLIENTID_SE3SH: - case SOC15_IH_CLIENTID_UTCL2: - ret = kfd_dqm_evict_pasid(dev->dqm, pasid); - block = AMDGPU_RAS_BLOCK__GFX; - break; - case SOC15_IH_CLIENTID_SDMA0: - case SOC15_IH_CLIENTID_SDMA1: - case SOC15_IH_CLIENTID_SDMA2: - case SOC15_IH_CLIENTID_SDMA3: - case SOC15_IH_CLIENTID_SDMA4: - block = AMDGPU_RAS_BLOCK__SDMA; - break; - default: - break; - } - - kfd_signal_poison_consumed_event(dev, pasid); - - /* resetting queue passes, do page retirement without gpu reset - * resetting queue fails, fallback to gpu reset solution - */ - if (!ret) { - dev_warn(dev->adev->dev, - "RAS poison consumption, unmap queue flow succeeded: client id %d\n", - client_id); - amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false); - } else { - dev_warn(dev->adev->dev, - "RAS poison consumption, fall back to gpu reset flow: client id %d\n", - client_id); - amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true); - } -} - static bool event_interrupt_isr_v10(struct kfd_node *dev, const uint32_t *ih_ring_entry, uint32_t *patched_ihre, @@ -222,14 +168,14 @@ static bool event_interrupt_isr_v10(struct kfd_node *dev, client_id != SOC15_IH_CLIENTID_SE3SH) return false; - pr_debug("client id 0x%x, source id %d, vmid %d, pasid 0x%x. raw data:\n", - client_id, source_id, vmid, pasid); - pr_debug("%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n", - data[0], data[1], data[2], data[3], - data[4], data[5], data[6], data[7]); + dev_dbg(dev->adev->dev, + "client id 0x%x, source id %d, vmid %d, pasid 0x%x. raw data:\n", + client_id, source_id, vmid, pasid); + dev_dbg(dev->adev->dev, "%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n", + data[0], data[1], data[2], data[3], data[4], data[5], data[6], + data[7]); - /* If there is no valid PASID, it's likely a bug */ - if (WARN_ONCE(pasid == 0, "Bug: No PASID in KFD interrupt")) + if (pasid == 0) return 0; /* Interrupt types we care about: various signals and faults. 
@@ -271,37 +217,66 @@ static void event_interrupt_wq_v10(struct kfd_node *dev, SQ_INTERRUPT_WORD_WAVE_CTXID1, ENCODING); switch (encoding) { case SQ_INTERRUPT_WORD_ENCODING_AUTO: - pr_debug_ratelimited( + dev_dbg_ratelimited( + dev->adev->dev, "sq_intr: auto, se %d, ttrace %d, wlt %d, ttrac_buf0_full %d, ttrac_buf1_full %d, ttrace_utc_err %d\n", - REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_AUTO_CTXID1, - SE_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, - THREAD_TRACE), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, - WLT), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, - THREAD_TRACE_BUF0_FULL), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, - THREAD_TRACE_BUF1_FULL), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, - THREAD_TRACE_UTC_ERROR)); + REG_GET_FIELD( + context_id1, + SQ_INTERRUPT_WORD_AUTO_CTXID1, + SE_ID), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_AUTO_CTXID0, + THREAD_TRACE), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_AUTO_CTXID0, + WLT), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_AUTO_CTXID0, + THREAD_TRACE_BUF0_FULL), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_AUTO_CTXID0, + THREAD_TRACE_BUF1_FULL), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_AUTO_CTXID0, + THREAD_TRACE_UTC_ERROR)); break; case SQ_INTERRUPT_WORD_ENCODING_INST: - pr_debug_ratelimited("sq_intr: inst, se %d, data 0x%x, sa %d, priv %d, wave_id %d, simd_id %d, wgp_id %d\n", - REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1, - SE_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, - DATA), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, - SA_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, - PRIV), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, - WAVE_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, - SIMD_ID), - REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1, - WGP_ID)); + dev_dbg_ratelimited( + dev->adev->dev, + "sq_intr: inst, se %d, data 0x%x, sa %d, priv %d, wave_id %d, simd_id %d, wgp_id %d\n", + REG_GET_FIELD( + context_id1, + SQ_INTERRUPT_WORD_WAVE_CTXID1, + SE_ID), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID0, + DATA), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID0, + SA_ID), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID0, + PRIV), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID0, + WAVE_ID), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID0, + SIMD_ID), + REG_GET_FIELD( + context_id1, + SQ_INTERRUPT_WORD_WAVE_CTXID1, + WGP_ID)); if (context_id0 & SQ_INTERRUPT_WORD_WAVE_CTXID0__PRIV_MASK) { if (kfd_set_dbg_ev_from_interrupt(dev, pasid, KFD_DEBUG_DOORBELL_ID(context_id0), @@ -313,33 +288,45 @@ static void event_interrupt_wq_v10(struct kfd_node *dev, case SQ_INTERRUPT_WORD_ENCODING_ERROR: sq_intr_err_type = REG_GET_FIELD(context_id0, KFD_CTXID0, ERR_TYPE); - pr_warn_ratelimited("sq_intr: error, se %d, data 0x%x, sa %d, priv %d, wave_id %d, simd_id %d, wgp_id %d, err_type %d\n", - REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1, - SE_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, - DATA), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, - SA_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, - PRIV), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, - WAVE_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, - SIMD_ID), - REG_GET_FIELD(context_id1, 
SQ_INTERRUPT_WORD_WAVE_CTXID1, - WGP_ID), + dev_warn_ratelimited( + dev->adev->dev, + "sq_intr: error, se %d, data 0x%x, sa %d, priv %d, wave_id %d, simd_id %d, wgp_id %d, err_type %d\n", + REG_GET_FIELD( + context_id1, + SQ_INTERRUPT_WORD_WAVE_CTXID1, + SE_ID), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID0, + DATA), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID0, + SA_ID), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID0, + PRIV), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID0, + WAVE_ID), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID0, + SIMD_ID), + REG_GET_FIELD( + context_id1, + SQ_INTERRUPT_WORD_WAVE_CTXID1, + WGP_ID), sq_intr_err_type); - if (sq_intr_err_type != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST && - sq_intr_err_type != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) { - event_interrupt_poison_consumption(dev, pasid, source_id); - return; - } break; default: break; } kfd_signal_event_interrupt(pasid, context_id0 & 0x7fffff, 23); - } else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) { + } else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE && + KFD_DBG_EC_TYPE_IS_PACKET(KFD_DEBUG_CP_BAD_OP_ECODE(context_id0))) { kfd_set_dbg_ev_from_interrupt(dev, pasid, KFD_DEBUG_DOORBELL_ID(context_id0), KFD_EC_MASK(KFD_DEBUG_CP_BAD_OP_ECODE(context_id0)), @@ -358,9 +345,6 @@ static void event_interrupt_wq_v10(struct kfd_node *dev, client_id == SOC15_IH_CLIENTID_SDMA7) { if (source_id == SOC15_INTSRC_SDMA_TRAP) { kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28); - } else if (source_id == SOC15_INTSRC_SDMA_ECC) { - event_interrupt_poison_consumption(dev, pasid, source_id); - return; } } else if (client_id == SOC15_IH_CLIENTID_VMC || client_id == SOC15_IH_CLIENTID_VMC1 || @@ -369,12 +353,6 @@ static void event_interrupt_wq_v10(struct kfd_node *dev, uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry); struct kfd_hsa_memory_exception_data exception_data; - if (client_id == SOC15_IH_CLIENTID_UTCL2 && - amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev)) { - event_interrupt_poison_consumption(dev, pasid, client_id); - return; - } - info.vmid = vmid; info.mc_id = client_id; info.page_addr = ih_ring_entry[4] | diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c index 7e2859736a55..2788a52714d1 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c @@ -148,44 +148,69 @@ enum SQ_INTERRUPT_ERROR_TYPE { #define KFD_CTXID0_DOORBELL_ID(ctxid0) ((ctxid0) & \ KFD_CTXID0_DOORBELL_ID_MASK) -static void print_sq_intr_info_auto(uint32_t context_id0, uint32_t context_id1) +static void print_sq_intr_info_auto(struct kfd_node *dev, uint32_t context_id0, + uint32_t context_id1) { - pr_debug_ratelimited( + dev_dbg_ratelimited( + dev->adev->dev, "sq_intr: auto, ttrace %d, wlt %d, ttrace_buf_full %d, reg_tms %d, cmd_tms %d, host_cmd_ovf %d, host_reg_ovf %d, immed_ovf %d, ttrace_utc_err %d\n", - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, THREAD_TRACE), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, + THREAD_TRACE), REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, WLT), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, THREAD_TRACE_BUF_FULL), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, REG_TIMESTAMP), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, CMD_TIMESTAMP), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, 
HOST_CMD_OVERFLOW), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, HOST_REG_OVERFLOW), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, IMMED_OVERFLOW), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, THREAD_TRACE_UTC_ERROR)); + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, + THREAD_TRACE_BUF_FULL), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, + REG_TIMESTAMP), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, + CMD_TIMESTAMP), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, + HOST_CMD_OVERFLOW), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, + HOST_REG_OVERFLOW), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, + IMMED_OVERFLOW), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, + THREAD_TRACE_UTC_ERROR)); } -static void print_sq_intr_info_inst(uint32_t context_id0, uint32_t context_id1) +static void print_sq_intr_info_inst(struct kfd_node *dev, uint32_t context_id0, + uint32_t context_id1) { - pr_debug_ratelimited( + dev_dbg_ratelimited( + dev->adev->dev, "sq_intr: inst, data 0x%08x, sh %d, priv %d, wave_id %d, simd_id %d, wgp_id %d\n", REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, DATA), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, SH_ID), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, + SH_ID), REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, PRIV), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, WAVE_ID), - REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1, SIMD_ID), - REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1, WGP_ID)); + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, + WAVE_ID), + REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1, + SIMD_ID), + REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1, + WGP_ID)); } -static void print_sq_intr_info_error(uint32_t context_id0, uint32_t context_id1) +static void print_sq_intr_info_error(struct kfd_node *dev, uint32_t context_id0, + uint32_t context_id1) { - pr_warn_ratelimited( + dev_warn_ratelimited( + dev->adev->dev, "sq_intr: error, detail 0x%08x, type %d, sh %d, priv %d, wave_id %d, simd_id %d, wgp_id %d\n", - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, DETAIL), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, TYPE), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, SH_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, PRIV), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, WAVE_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID1, SIMD_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID1, WGP_ID)); + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, + DETAIL), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, + TYPE), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, + SH_ID), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, + PRIV), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, + WAVE_ID), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID1, + SIMD_ID), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID1, + WGP_ID)); } static void event_interrupt_poison_consumption_v11(struct kfd_node *dev, @@ -193,7 +218,8 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev, { enum amdgpu_ras_block block = 0; int ret = -EINVAL; - struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + uint32_t reset = 0; + struct kfd_process *p = 
kfd_lookup_process_by_pasid(pasid, NULL); if (!p) return; @@ -212,10 +238,13 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev, if (dev->dqm->ops.reset_queues) ret = dev->dqm->ops.reset_queues(dev->dqm, pasid); block = AMDGPU_RAS_BLOCK__GFX; + if (ret) + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; break; case SOC21_INTSRC_SDMA_ECC: default: block = AMDGPU_RAS_BLOCK__GFX; + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; break; } @@ -223,10 +252,7 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev, /* resetting queue passes, do page retirement without gpu reset resetting queue fails, fallback to gpu reset solution */ - if (!ret) - amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false); - else - amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true); + amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset); } static bool event_interrupt_isr_v11(struct kfd_node *dev, @@ -254,14 +280,14 @@ static bool event_interrupt_isr_v11(struct kfd_node *dev, (context_id0 & AMDGPU_FENCE_MES_QUEUE_FLAG)) return false; - pr_debug("client id 0x%x, source id %d, vmid %d, pasid 0x%x. raw data:\n", - client_id, source_id, vmid, pasid); - pr_debug("%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n", - data[0], data[1], data[2], data[3], - data[4], data[5], data[6], data[7]); + dev_dbg(dev->adev->dev, + "client id 0x%x, source id %d, vmid %d, pasid 0x%x. raw data:\n", + client_id, source_id, vmid, pasid); + dev_dbg(dev->adev->dev, "%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n", + data[0], data[1], data[2], data[3], data[4], data[5], data[6], + data[7]); - /* If there is no valid PASID, it's likely a bug */ - if (WARN_ONCE(pasid == 0, "Bug: No PASID in KFD interrupt")) + if (pasid == 0) return false; /* Interrupt types we care about: various signals and faults. 
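The isr/wq pairing above is the standard top-half/bottom-half split: event_interrupt_isr_*() runs in interrupt context and only filters and copies the raw ring entry, while event_interrupt_wq_*() decodes it later in process context, where process lookups and event signalling may sleep. A minimal, self-contained sketch of that pattern, assuming a hypothetical fixed entry size and demo_* names that are not part of this driver:

#include <linux/kfifo.h>
#include <linux/workqueue.h>

#define DEMO_ENTRY_SIZE 32                      /* assumed bytes per IH entry */

struct demo_node {
        DECLARE_KFIFO(ih_fifo, u8, 16384 * DEMO_ENTRY_SIZE);
        struct work_struct work;
        struct workqueue_struct *wq;
};

/* Top half: interrupt context, so only filter, copy and defer. */
static bool demo_isr(struct demo_node *n, const u8 *entry)
{
        if (kfifo_is_full(&n->ih_fifo))
                return false;                   /* overflow: drop the entry */
        kfifo_in(&n->ih_fifo, entry, DEMO_ENTRY_SIZE);
        queue_work(n->wq, &n->work);
        return true;
}

/* Bottom half: process context, sleeping is allowed here. */
static void demo_wq_fn(struct work_struct *work)
{
        struct demo_node *n = container_of(work, struct demo_node, work);
        u8 entry[DEMO_ENTRY_SIZE];

        while (kfifo_out(&n->ih_fifo, entry, DEMO_ENTRY_SIZE) == DEMO_ENTRY_SIZE)
                ;                               /* decode and signal events here */
}
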
@@ -328,11 +354,15 @@ static void event_interrupt_wq_v11(struct kfd_node *dev, /* CP */ if (source_id == SOC15_INTSRC_CP_END_OF_PIPE) kfd_signal_event_interrupt(pasid, context_id0, 32); - else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) - kfd_set_dbg_ev_from_interrupt(dev, pasid, - KFD_CTXID0_DOORBELL_ID(context_id0), + else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE && + KFD_DBG_EC_TYPE_IS_PACKET(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0))) { + u32 doorbell_id = KFD_CTXID0_DOORBELL_ID(context_id0); + + kfd_set_dbg_ev_from_interrupt(dev, pasid, doorbell_id, KFD_EC_MASK(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0)), NULL, 0); + kfd_dqm_suspend_bad_queue_mes(dev, pasid, doorbell_id); + } /* SDMA */ else if (source_id == SOC21_INTSRC_SDMA_TRAP) @@ -348,10 +378,10 @@ static void event_interrupt_wq_v11(struct kfd_node *dev, SQ_INTERRUPT_WORD_WAVE_CTXID1, ENCODING); switch (sq_int_enc) { case SQ_INTERRUPT_WORD_ENCODING_AUTO: - print_sq_intr_info_auto(context_id0, context_id1); + print_sq_intr_info_auto(dev, context_id0, context_id1); break; case SQ_INTERRUPT_WORD_ENCODING_INST: - print_sq_intr_info_inst(context_id0, context_id1); + print_sq_intr_info_inst(dev, context_id0, context_id1); sq_int_priv = REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, PRIV); if (sq_int_priv && (kfd_set_dbg_ev_from_interrupt(dev, pasid, @@ -361,7 +391,7 @@ static void event_interrupt_wq_v11(struct kfd_node *dev, return; break; case SQ_INTERRUPT_WORD_ENCODING_ERROR: - print_sq_intr_info_error(context_id0, context_id1); + print_sq_intr_info_error(dev, context_id0, context_id1); sq_int_errtype = REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, TYPE); if (sq_int_errtype != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST && diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c index 91dd5e045b51..4ceb251312a6 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c @@ -27,6 +27,7 @@ #include "soc15_int.h" #include "kfd_device_queue_manager.h" #include "kfd_smi_events.h" +#include "amdgpu_ras.h" /* * GFX9 SQ Interrupts @@ -144,8 +145,11 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev, uint16_t pasid, uint16_t client_id) { enum amdgpu_ras_block block = 0; - int old_poison, ret = -EINVAL; - struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + uint32_t reset = 0; + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL); + enum ras_event_type type = RAS_EVENT_TYPE_POISON_CONSUMPTION; + u64 event_id; + int old_poison, ret; if (!p) return; @@ -162,8 +166,30 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev, case SOC15_IH_CLIENTID_SE2SH: case SOC15_IH_CLIENTID_SE3SH: case SOC15_IH_CLIENTID_UTCL2: - ret = kfd_dqm_evict_pasid(dev->dqm, pasid); block = AMDGPU_RAS_BLOCK__GFX; + if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3)) { + /* driver mode-2 for gfx poison is only supported by + * pmfw 0x00557300 and onwards */ + if (dev->adev->pm.fw_version < 0x00557300) + reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET; + else + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; + } else if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) { + /* driver mode-2 for gfx poison is only supported by + * pmfw 0x05550C00 and onwards */ + if (dev->adev->pm.fw_version < 0x05550C00) + reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET; + else + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; + } else { + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; + } + 
amdgpu_ras_set_err_poison(dev->adev, AMDGPU_RAS_BLOCK__GFX); + break; + case SOC15_IH_CLIENTID_VMC: + case SOC15_IH_CLIENTID_VMC1: + block = AMDGPU_RAS_BLOCK__MMHUB; + reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET; break; case SOC15_IH_CLIENTID_SDMA0: case SOC15_IH_CLIENTID_SDMA1: @@ -171,27 +197,44 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev, case SOC15_IH_CLIENTID_SDMA3: case SOC15_IH_CLIENTID_SDMA4: block = AMDGPU_RAS_BLOCK__SDMA; + if (amdgpu_ip_version(dev->adev, SDMA0_HWIP, 0) == IP_VERSION(4, 4, 2)) { + /* driver mode-2 for gfx poison is only supported by + * pmfw 0x00557300 and onwards */ + if (dev->adev->pm.fw_version < 0x00557300) + reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET; + else + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; + } else if (amdgpu_ip_version(dev->adev, SDMA0_HWIP, 0) == IP_VERSION(4, 4, 5)) { + /* driver mode-2 for gfx poison is only supported by + * pmfw 0x05550C00 and onwards */ + if (dev->adev->pm.fw_version < 0x05550C00) + reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET; + else + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; + } else { + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; + } + amdgpu_ras_set_err_poison(dev->adev, AMDGPU_RAS_BLOCK__SDMA); break; default: - break; + dev_warn(dev->adev->dev, + "client %d does not support poison consumption\n", client_id); + return; } + ret = amdgpu_ras_mark_ras_event(dev->adev, type); + if (ret) + return; + kfd_signal_poison_consumed_event(dev, pasid); - /* resetting queue passes, do page retirement without gpu reset - * resetting queue fails, fallback to gpu reset solution - */ - if (!ret) { - dev_warn(dev->adev->dev, - "RAS poison consumption, unmap queue flow succeeded: client id %d\n", - client_id); - amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false); - } else { - dev_warn(dev->adev->dev, - "RAS poison consumption, fall back to gpu reset flow: client id %d\n", - client_id); - amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true); - } + event_id = amdgpu_ras_acquire_event_id(dev->adev, type); + + RAS_EVENT_LOG(dev->adev, event_id, + "poison is consumed by client %d, kick off gpu reset flow\n", client_id); + + amdgpu_amdkfd_ras_pasid_poison_consumption_handler(dev->adev, + block, pasid, NULL, NULL, reset); } static bool context_id_expected(struct kfd_dev *dev) @@ -271,11 +314,12 @@ static bool event_interrupt_isr_v9(struct kfd_node *dev, & ~pasid_mask) | pasid); } - pr_debug("client id 0x%x, source id %d, vmid %d, pasid 0x%x. raw data:\n", - client_id, source_id, vmid, pasid); - pr_debug("%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n", - data[0], data[1], data[2], data[3], - data[4], data[5], data[6], data[7]); + dev_dbg(dev->adev->dev, + "client id 0x%x, source id %d, vmid %d, pasid 0x%x. 
raw data:\n", + client_id, source_id, vmid, pasid); + dev_dbg(dev->adev->dev, "%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n", + data[0], data[1], data[2], data[3], data[4], data[5], data[6], + data[7]); /* If there is no valid PASID, it's likely a bug */ if (WARN_ONCE(pasid == 0, "Bug: No PASID in KFD interrupt")) @@ -336,28 +380,82 @@ static void event_interrupt_wq_v9(struct kfd_node *dev, encoding = REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, ENCODING); switch (encoding) { case SQ_INTERRUPT_WORD_ENCODING_AUTO: - pr_debug_ratelimited( + dev_dbg_ratelimited( + dev->adev->dev, "sq_intr: auto, se %d, ttrace %d, wlt %d, ttrac_buf_full %d, reg_tms %d, cmd_tms %d, host_cmd_ovf %d, host_reg_ovf %d, immed_ovf %d, ttrace_utc_err %d\n", - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, SE_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, THREAD_TRACE), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, WLT), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, THREAD_TRACE_BUF_FULL), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, REG_TIMESTAMP), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, CMD_TIMESTAMP), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, HOST_CMD_OVERFLOW), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, HOST_REG_OVERFLOW), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, IMMED_OVERFLOW), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, THREAD_TRACE_UTC_ERROR)); + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_AUTO_CTXID, + SE_ID), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_AUTO_CTXID, + THREAD_TRACE), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_AUTO_CTXID, + WLT), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_AUTO_CTXID, + THREAD_TRACE_BUF_FULL), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_AUTO_CTXID, + REG_TIMESTAMP), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_AUTO_CTXID, + CMD_TIMESTAMP), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_AUTO_CTXID, + HOST_CMD_OVERFLOW), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_AUTO_CTXID, + HOST_REG_OVERFLOW), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_AUTO_CTXID, + IMMED_OVERFLOW), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_AUTO_CTXID, + THREAD_TRACE_UTC_ERROR)); break; case SQ_INTERRUPT_WORD_ENCODING_INST: - pr_debug_ratelimited("sq_intr: inst, se %d, data 0x%x, sh %d, priv %d, wave_id %d, simd_id %d, cu_id %d, intr_data 0x%x\n", - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SE_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, DATA), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SH_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, PRIV), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, WAVE_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SIMD_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, CU_ID), + dev_dbg_ratelimited( + dev->adev->dev, + "sq_intr: inst, se %d, data 0x%x, sh %d, priv %d, wave_id %d, simd_id %d, cu_id %d, intr_data 0x%x\n", + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID, + SE_ID), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID, + DATA), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID, + SH_ID), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID, + PRIV), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID, + WAVE_ID), + REG_GET_FIELD( + context_id0, + 
SQ_INTERRUPT_WORD_WAVE_CTXID, + SIMD_ID), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID, + CU_ID), sq_int_data); if (context_id0 & SQ_INTERRUPT_WORD_WAVE_CTXID__PRIV_MASK) { if (kfd_set_dbg_ev_from_interrupt(dev, pasid, @@ -369,14 +467,37 @@ static void event_interrupt_wq_v9(struct kfd_node *dev, break; case SQ_INTERRUPT_WORD_ENCODING_ERROR: sq_intr_err = REG_GET_FIELD(sq_int_data, KFD_SQ_INT_DATA, ERR_TYPE); - pr_warn_ratelimited("sq_intr: error, se %d, data 0x%x, sh %d, priv %d, wave_id %d, simd_id %d, cu_id %d, err_type %d\n", - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SE_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, DATA), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SH_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, PRIV), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, WAVE_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SIMD_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, CU_ID), + dev_warn_ratelimited( + dev->adev->dev, + "sq_intr: error, se %d, data 0x%x, sh %d, priv %d, wave_id %d, simd_id %d, cu_id %d, err_type %d\n", + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID, + SE_ID), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID, + DATA), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID, + SH_ID), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID, + PRIV), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID, + WAVE_ID), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID, + SIMD_ID), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID, + CU_ID), sq_intr_err); if (sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST && sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) { @@ -388,7 +509,8 @@ static void event_interrupt_wq_v9(struct kfd_node *dev, break; } kfd_signal_event_interrupt(pasid, sq_int_data, 24); - } else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) { + } else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE && + KFD_DBG_EC_TYPE_IS_PACKET(KFD_DEBUG_CP_BAD_OP_ECODE(context_id0))) { kfd_set_dbg_ev_from_interrupt(dev, pasid, KFD_DEBUG_DOORBELL_ID(context_id0), KFD_EC_MASK(KFD_DEBUG_CP_BAD_OP_ECODE(context_id0)), @@ -415,8 +537,7 @@ static void event_interrupt_wq_v9(struct kfd_node *dev, uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry); struct kfd_hsa_memory_exception_data exception_data; - if (client_id == SOC15_IH_CLIENTID_UTCL2 && - amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev)) { + if (source_id == SOC15_INTSRC_VMC_UTCL2_POISON) { event_interrupt_poison_consumption_v9(dev, pasid, client_id); return; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c index dd3c43c1ad70..783c2f5a04e4 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c @@ -46,7 +46,7 @@ #include <linux/kfifo.h> #include "kfd_priv.h" -#define KFD_IH_NUM_ENTRIES 8192 +#define KFD_IH_NUM_ENTRIES 16384 static void interrupt_wq(struct work_struct *); @@ -62,11 +62,14 @@ int kfd_interrupt_init(struct kfd_node *node) return r; } - node->ih_wq = alloc_workqueue("KFD IH", WQ_HIGHPRI, 1); - if (unlikely(!node->ih_wq)) { - kfifo_free(&node->ih_fifo); - dev_err(node->adev->dev, "Failed to allocate KFD IH workqueue\n"); - return -ENOMEM; + if (!node->kfd->ih_wq) { + node->kfd->ih_wq = alloc_workqueue("KFD IH", WQ_HIGHPRI | WQ_UNBOUND, + node->kfd->num_nodes); + if (unlikely(!node->kfd->ih_wq)) { + 
kfifo_free(&node->ih_fifo); + dev_err(node->adev->dev, "Failed to allocate KFD IH workqueue\n"); + return -ENOMEM; + } } spin_lock_init(&node->interrupt_lock); @@ -96,14 +99,6 @@ void kfd_interrupt_exit(struct kfd_node *node) spin_lock_irqsave(&node->interrupt_lock, flags); node->interrupts_active = false; spin_unlock_irqrestore(&node->interrupt_lock, flags); - - /* - * flush_work ensures that there are no outstanding - * work-queue items that will access interrupt_ring. New work items - * can't be created because we stopped interrupt handling above. - */ - flush_workqueue(node->ih_wq); - kfifo_free(&node->ih_fifo); } @@ -112,55 +107,48 @@ void kfd_interrupt_exit(struct kfd_node *node) */ bool enqueue_ih_ring_entry(struct kfd_node *node, const void *ih_ring_entry) { - int count; - - count = kfifo_in(&node->ih_fifo, ih_ring_entry, - node->kfd->device_info.ih_ring_entry_size); - if (count != node->kfd->device_info.ih_ring_entry_size) { - dev_dbg_ratelimited(node->adev->dev, - "Interrupt ring overflow, dropping interrupt %d\n", - count); + if (kfifo_is_full(&node->ih_fifo)) { + dev_warn_ratelimited(node->adev->dev, "KFD node %d ih_fifo overflow\n", + node->node_id); return false; } + kfifo_in(&node->ih_fifo, ih_ring_entry, node->kfd->device_info.ih_ring_entry_size); return true; } /* * Assumption: single reader/writer. This function is not re-entrant */ -static bool dequeue_ih_ring_entry(struct kfd_node *node, void *ih_ring_entry) +static bool dequeue_ih_ring_entry(struct kfd_node *node, u32 **ih_ring_entry) { int count; - count = kfifo_out(&node->ih_fifo, ih_ring_entry, - node->kfd->device_info.ih_ring_entry_size); - - WARN_ON(count && count != node->kfd->device_info.ih_ring_entry_size); + if (kfifo_is_empty(&node->ih_fifo)) + return false; + count = kfifo_out_linear_ptr(&node->ih_fifo, ih_ring_entry, + node->kfd->device_info.ih_ring_entry_size); + WARN_ON(count != node->kfd->device_info.ih_ring_entry_size); return count == node->kfd->device_info.ih_ring_entry_size; } static void interrupt_wq(struct work_struct *work) { - struct kfd_node *dev = container_of(work, struct kfd_node, - interrupt_work); - uint32_t ih_ring_entry[KFD_MAX_RING_ENTRY_SIZE]; + struct kfd_node *dev = container_of(work, struct kfd_node, interrupt_work); + uint32_t *ih_ring_entry; unsigned long start_jiffies = jiffies; - if (dev->kfd->device_info.ih_ring_entry_size > sizeof(ih_ring_entry)) { - dev_err_once(dev->adev->dev, "Ring entry too small\n"); - return; - } - - while (dequeue_ih_ring_entry(dev, ih_ring_entry)) { + while (dequeue_ih_ring_entry(dev, &ih_ring_entry)) { dev->kfd->device_info.event_interrupt_class->interrupt_wq(dev, ih_ring_entry); + kfifo_skip_count(&dev->ih_fifo, dev->kfd->device_info.ih_ring_entry_size); + if (time_is_before_jiffies(start_jiffies + HZ)) { /* If we spent more than a second processing signals, * reschedule the worker to avoid soft-lockup warnings */ - queue_work(dev->ih_wq, &dev->interrupt_work); + queue_work(dev->kfd->ih_wq, &dev->interrupt_work); break; } } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c index 32c926986dbb..2b0a830f5b29 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c @@ -32,6 +32,7 @@ #include "kfd_device_queue_manager.h" #include "kfd_pm4_headers.h" #include "kfd_pm4_opcodes.h" +#include "amdgpu_reset.h" #define PM4_COUNT_ZERO (((1 << 15) - 1) << 16) @@ -68,7 +69,7 @@ static bool kq_initialize(struct kernel_queue *kq, struct kfd_node *dev, 
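The one-second cutoff added to interrupt_wq() just above is a general defence against soft-lockup and workqueue-stall reports: a work item facing unbounded input runs for a bounded time, then re-queues itself so the scheduler and watchdog get the CPU back. A sketch of just that pattern; the demo_* helpers are placeholders for the real per-entry processing:

#include <linux/jiffies.h>
#include <linux/workqueue.h>

static bool demo_have_work(void);       /* placeholder: more entries queued? */
static void demo_process_one(void);     /* placeholder: handle one entry */

static void demo_worker(struct work_struct *work)
{
        unsigned long start = jiffies;

        while (demo_have_work()) {
                demo_process_one();
                if (time_is_before_jiffies(start + HZ)) {
                        /* Over a second in: re-queue instead of looping on,
                         * so the watchdog never sees a runaway work item. */
                        queue_work(system_wq, work);
                        break;
                }
        }
}
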
kq->mqd_mgr = dev->dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]; break; default: - pr_err("Invalid queue type %d\n", type); + dev_err(dev->adev->dev, "Invalid queue type %d\n", type); return false; } @@ -78,13 +79,14 @@ static bool kq_initialize(struct kernel_queue *kq, struct kfd_node *dev, prop.doorbell_ptr = kfd_get_kernel_doorbell(dev->kfd, &prop.doorbell_off); if (!prop.doorbell_ptr) { - pr_err("Failed to initialize doorbell"); + dev_err(dev->adev->dev, "Failed to initialize doorbell"); goto err_get_kernel_doorbell; } retval = kfd_gtt_sa_allocate(dev, queue_size, &kq->pq); if (retval != 0) { - pr_err("Failed to init pq queues size %d\n", queue_size); + dev_err(dev->adev->dev, "Failed to init pq queues size %d\n", + queue_size); goto err_pq_allocate_vidmem; } @@ -123,7 +125,7 @@ static bool kq_initialize(struct kernel_queue *kq, struct kfd_node *dev, memset(kq->pq_kernel_addr, 0, queue_size); memset(kq->rptr_kernel, 0, sizeof(*kq->rptr_kernel)); - memset(kq->wptr_kernel, 0, sizeof(*kq->wptr_kernel)); + memset(kq->wptr_kernel, 0, dev->kfd->device_info.doorbell_size); prop.queue_size = queue_size; prop.is_interop = false; @@ -196,15 +198,17 @@ err_get_kernel_doorbell: } /* Uninitialize a kernel queue and free all its memory usages. */ -static void kq_uninitialize(struct kernel_queue *kq, bool hanging) +static void kq_uninitialize(struct kernel_queue *kq) { - if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ && !hanging) + if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ && down_read_trylock(&kq->dev->adev->reset_domain->sem)) { kq->mqd_mgr->destroy_mqd(kq->mqd_mgr, kq->queue->mqd, KFD_PREEMPT_TYPE_WAVEFRONT_RESET, KFD_UNMAP_LATENCY_MS, kq->queue->pipe, kq->queue->queue); + up_read(&kq->dev->adev->reset_domain->sem); + } else if (kq->queue->properties.type == KFD_QUEUE_TYPE_DIQ) kfd_gtt_sa_free(kq->dev, kq->fence_mem_obj); @@ -302,12 +306,17 @@ int kq_submit_packet(struct kernel_queue *kq) if (amdgpu_amdkfd_is_fed(kq->dev->adev)) return -EIO; + /* Make sure ring buffer is updated before wptr updated */ + mb(); + if (kq->dev->kfd->device_info.doorbell_size == 8) { *kq->wptr64_kernel = kq->pending_wptr64; + mb(); /* Make sure wptr updated before ring doorbell */ write_kernel_doorbell64(kq->queue->properties.doorbell_ptr, kq->pending_wptr64); } else { *kq->wptr_kernel = kq->pending_wptr; + mb(); /* Make sure wptr updated before ring doorbell */ write_kernel_doorbell(kq->queue->properties.doorbell_ptr, kq->pending_wptr); } @@ -338,15 +347,15 @@ struct kernel_queue *kernel_queue_init(struct kfd_node *dev, if (kq_initialize(kq, dev, type, KFD_KERNEL_QUEUE_SIZE)) return kq; - pr_err("Failed to init kernel queue\n"); + dev_err(dev->adev->dev, "Failed to init kernel queue\n"); kfree(kq); return NULL; } -void kernel_queue_uninit(struct kernel_queue *kq, bool hanging) +void kernel_queue_uninit(struct kernel_queue *kq) { - kq_uninitialize(kq, hanging); + kq_uninitialize(kq); kfree(kq); } @@ -357,26 +366,26 @@ static __attribute__((unused)) void test_kq(struct kfd_node *dev) uint32_t *buffer, i; int retval; - pr_err("Starting kernel queue test\n"); + dev_err(dev->adev->dev, "Starting kernel queue test\n"); kq = kernel_queue_init(dev, KFD_QUEUE_TYPE_HIQ); if (unlikely(!kq)) { - pr_err(" Failed to initialize HIQ\n"); - pr_err("Kernel queue test failed\n"); + dev_err(dev->adev->dev, " Failed to initialize HIQ\n"); + dev_err(dev->adev->dev, "Kernel queue test failed\n"); return; } retval = kq_acquire_packet_buffer(kq, 5, &buffer); if (unlikely(retval != 0)) { - pr_err(" Failed to acquire packet buffer\n"); - 
pr_err("Kernel queue test failed\n"); + dev_err(dev->adev->dev, " Failed to acquire packet buffer\n"); + dev_err(dev->adev->dev, "Kernel queue test failed\n"); return; } for (i = 0; i < 5; i++) buffer[i] = kq->nop_packet; kq_submit_packet(kq); - pr_err("Ending kernel queue test\n"); + dev_err(dev->adev->dev, "Ending kernel queue test\n"); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index bdc01ca9609a..79251f22b702 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -77,7 +77,7 @@ svm_migrate_gart_map(struct amdgpu_ring *ring, uint64_t npages, dst_addr = amdgpu_bo_gpu_offset(adev->gart.bo); amdgpu_emit_copy_buffer(adev, &job->ibs[0], src_addr, - dst_addr, num_bytes, false); + dst_addr, num_bytes, 0); amdgpu_ring_pad_ib(ring, &job->ibs[0]); WARN_ON(job->ibs[0].length_dw > num_dw); @@ -153,7 +153,7 @@ svm_migrate_copy_memory_gart(struct amdgpu_device *adev, dma_addr_t *sys, } r = amdgpu_copy_buffer(ring, gart_s, gart_d, size * PAGE_SIZE, - NULL, &next, false, true, false); + NULL, &next, false, true, 0); if (r) { dev_err(adev->dev, "fail %d to copy memory\n", r); goto out_unlock; @@ -278,10 +278,11 @@ svm_migrate_copy_to_vram(struct kfd_node *node, struct svm_range *prange, struct migrate_vma *migrate, struct dma_fence **mfence, dma_addr_t *scratch, uint64_t ttm_res_offset) { - uint64_t npages = migrate->cpages; + uint64_t npages = migrate->npages; struct amdgpu_device *adev = node->adev; struct device *dev = adev->dev; struct amdgpu_res_cursor cursor; + uint64_t mpages = 0; dma_addr_t *src; uint64_t *dst; uint64_t i, j; @@ -295,18 +296,20 @@ svm_migrate_copy_to_vram(struct kfd_node *node, struct svm_range *prange, amdgpu_res_first(prange->ttm_res, ttm_res_offset, npages << PAGE_SHIFT, &cursor); - for (i = j = 0; i < npages; i++) { + for (i = j = 0; (i < npages) && (mpages < migrate->cpages); i++) { struct page *spage; - dst[i] = cursor.start + (j << PAGE_SHIFT); - migrate->dst[i] = svm_migrate_addr_to_pfn(adev, dst[i]); - svm_migrate_get_vram_page(prange, migrate->dst[i]); - migrate->dst[i] = migrate_pfn(migrate->dst[i]); - + if (migrate->src[i] & MIGRATE_PFN_MIGRATE) { + dst[i] = cursor.start + (j << PAGE_SHIFT); + migrate->dst[i] = svm_migrate_addr_to_pfn(adev, dst[i]); + svm_migrate_get_vram_page(prange, migrate->dst[i]); + migrate->dst[i] = migrate_pfn(migrate->dst[i]); + mpages++; + } spage = migrate_pfn_to_page(migrate->src[i]); if (spage && !is_zone_device_page(spage)) { src[i] = dma_map_page(dev, spage, 0, PAGE_SIZE, - DMA_TO_DEVICE); + DMA_BIDIRECTIONAL); r = dma_mapping_error(dev, src[i]); if (r) { dev_err(dev, "%s: fail %d dma_map_page\n", @@ -353,9 +356,12 @@ svm_migrate_copy_to_vram(struct kfd_node *node, struct svm_range *prange, out_free_vram_pages: if (r) { pr_debug("failed %d to copy memory to vram\n", r); - while (i--) { + for (i = 0; i < npages && mpages; i++) { + if (!dst[i]) + continue; svm_migrate_put_vram_page(adev, dst[i]); migrate->dst[i] = 0; + mpages--; } } @@ -445,14 +451,13 @@ svm_migrate_vma_to_vram(struct kfd_node *node, struct svm_range *prange, pr_debug("successful/cpages/npages 0x%lx/0x%lx/0x%lx\n", mpages, cpages, migrate.npages); - kfd_smi_event_migration_end(node, p->lead_thread->pid, - start >> PAGE_SHIFT, end >> PAGE_SHIFT, - 0, node->id, trigger); - svm_range_dma_unmap_dev(adev->dev, scratch, 0, npages); out_free: kvfree(buf); + kfd_smi_event_migration_end(node, p->lead_thread->pid, + start >> PAGE_SHIFT, end >> PAGE_SHIFT, + 0, node->id, trigger, 
r); out: if (!r && mpages) { pdd = svm_range_get_pdd_by_node(prange, node); @@ -509,10 +514,19 @@ svm_migrate_ram_to_vram(struct svm_range *prange, uint32_t best_loc, start = start_mgr << PAGE_SHIFT; end = (last_mgr + 1) << PAGE_SHIFT; + r = amdgpu_amdkfd_reserve_mem_limit(node->adev, + prange->npages * PAGE_SIZE, + KFD_IOC_ALLOC_MEM_FLAGS_VRAM, + node->xcp ? node->xcp->id : 0); + if (r) { + dev_dbg(node->adev->dev, "failed to reserve VRAM, r: %ld\n", r); + return -ENOSPC; + } + r = svm_range_vram_node_new(node, prange, true); if (r) { dev_dbg(node->adev->dev, "fail %ld to alloc vram\n", r); - return r; + goto out; } ttm_res_offset = (start_mgr - prange->start + prange->offset) << PAGE_SHIFT; @@ -545,6 +559,11 @@ svm_migrate_ram_to_vram(struct svm_range *prange, uint32_t best_loc, svm_range_vram_node_free(prange); } +out: + amdgpu_amdkfd_unreserve_mem_limit(node->adev, + prange->npages * PAGE_SIZE, + KFD_IOC_ALLOC_MEM_FLAGS_VRAM, + node->xcp ? node->xcp->id : 0); return r < 0 ? r : 0; } @@ -616,7 +635,7 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct svm_range *prange, goto out_oom; } - dst[i] = dma_map_page(dev, dpage, 0, PAGE_SIZE, DMA_FROM_DEVICE); + dst[i] = dma_map_page(dev, dpage, 0, PAGE_SIZE, DMA_BIDIRECTIONAL); r = dma_mapping_error(dev, dst[i]); if (r) { dev_err(adev->dev, "%s: fail %d dma_map_page\n", __func__, r); @@ -737,14 +756,13 @@ svm_migrate_vma_to_ram(struct kfd_node *node, struct svm_range *prange, svm_migrate_copy_done(adev, mfence); migrate_vma_finalize(&migrate); - kfd_smi_event_migration_end(node, p->lead_thread->pid, - start >> PAGE_SHIFT, end >> PAGE_SHIFT, - node->id, 0, trigger); - svm_range_dma_unmap_dev(adev->dev, scratch, 0, npages); out_free: kvfree(buf); + kfd_smi_event_migration_end(node, p->lead_thread->pid, + start >> PAGE_SHIFT, end >> PAGE_SHIFT, + node->id, 0, trigger, r); out: if (!r && cpages) { mpages = cpages - upages; @@ -1009,7 +1027,7 @@ int kgd2kfd_init_zone_device(struct amdgpu_device *adev) if (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(9, 0, 1)) return -EINVAL; - if (adev->gmc.is_app_apu) + if (adev->apu_prefer_gtt) return 0; pgmap = &kfddev->pgmap; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c index 050a6936ff84..d9ae854b6908 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c @@ -118,18 +118,20 @@ void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm, * attention grabbing. */ if (gfx_info->max_shader_engines > KFD_MAX_NUM_SE) { - pr_err("Exceeded KFD_MAX_NUM_SE, chip reports %d\n", - gfx_info->max_shader_engines); + dev_err(mm->dev->adev->dev, + "Exceeded KFD_MAX_NUM_SE, chip reports %d\n", + gfx_info->max_shader_engines); return; } if (gfx_info->max_sh_per_se > KFD_MAX_NUM_SH_PER_SE) { - pr_err("Exceeded KFD_MAX_NUM_SH, chip reports %d\n", + dev_err(mm->dev->adev->dev, + "Exceeded KFD_MAX_NUM_SH, chip reports %d\n", gfx_info->max_sh_per_se * gfx_info->max_shader_engines); return; } cu_bitmap_sh_mul = (KFD_GC_VERSION(mm->dev) >= IP_VERSION(11, 0, 0) && - KFD_GC_VERSION(mm->dev) < IP_VERSION(12, 0, 0)) ? 2 : 1; + KFD_GC_VERSION(mm->dev) < IP_VERSION(13, 0, 0)) ? 2 : 1; /* Count active CUs per SH. 
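A recurring detail in the mqd_manager hunks below: CP_HQD_PQ_CONTROL.QUEUE_SIZE takes log2(ring size in dwords) minus 1, and for power-of-two sizes the two spellings this patch uses, ffs(q->queue_size / sizeof(unsigned int)) - 1 - 1 and order_base_2(q->queue_size / 4) - 1, compute the same value. A stand-alone check of that arithmetic (userspace C, hypothetical helper name):

#include <assert.h>

/* log2(bytes / 4) - 1; valid for power-of-two queue sizes >= 8 bytes */
static unsigned int pq_queue_size_field(unsigned int queue_size_bytes)
{
        return __builtin_ctz(queue_size_bytes / 4) - 1;
}

int main(void)
{
        assert(pq_queue_size_field(4096) == 9);         /* 1024 dwords = 2^10 */
        assert(pq_queue_size_field(1 << 20) == 17);     /* 256K dwords = 2^18 */
        return 0;
}
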
* @@ -223,7 +225,7 @@ void kfd_free_mqd_cp(struct mqd_manager *mm, void *mqd, struct kfd_mem_obj *mqd_mem_obj) { if (mqd_mem_obj->gtt_mem) { - amdgpu_amdkfd_free_gtt_mem(mm->dev->adev, mqd_mem_obj->gtt_mem); + amdgpu_amdkfd_free_gtt_mem(mm->dev->adev, &mqd_mem_obj->gtt_mem); kfree(mqd_mem_obj); } else { kfd_gtt_sa_free(mm->dev, mqd_mem_obj); @@ -290,3 +292,21 @@ uint64_t kfd_mqd_stride(struct mqd_manager *mm, { return mm->mqd_size; } + +bool kfd_check_hiq_mqd_doorbell_id(struct kfd_node *node, uint32_t doorbell_id, + uint32_t inst) +{ + if (doorbell_id) { + struct device *dev = node->adev->dev; + + if (node->adev->xcp_mgr && node->adev->xcp_mgr->num_xcps > 0) + dev_err(dev, "XCC %d: Queue preemption failed for queue with doorbell_id: %x\n", + inst, doorbell_id); + else + dev_err(dev, "Queue preemption failed for queue with doorbell_id: %x\n", + doorbell_id); + return true; + } + + return false; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h index e5cc697a3ca8..17cc1f25c8d0 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h @@ -119,7 +119,7 @@ struct mqd_manager { #if defined(CONFIG_DEBUG_FS) int (*debugfs_show_mqd)(struct seq_file *m, void *data); #endif - uint32_t (*read_doorbell_id)(void *mqd); + bool (*check_preemption_failed)(struct mqd_manager *mm, void *mqd); uint64_t (*mqd_stride)(struct mqd_manager *mm, struct queue_properties *p); @@ -198,4 +198,6 @@ void kfd_get_hiq_xcc_mqd(struct kfd_node *dev, uint64_t kfd_hiq_mqd_stride(struct kfd_node *dev); uint64_t kfd_mqd_stride(struct mqd_manager *mm, struct queue_properties *q); +bool kfd_check_hiq_mqd_doorbell_id(struct kfd_node *node, uint32_t doorbell_id, + uint32_t inst); #endif /* KFD_MQD_MANAGER_H_ */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c index 1a4a69943c71..05f3ac2eaef9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c @@ -206,11 +206,11 @@ static void __update_mqd(struct mqd_manager *mm, void *mqd, q->is_active = QUEUE_IS_ACTIVE(*q); } -static uint32_t read_doorbell_id(void *mqd) +static bool check_preemption_failed(struct mqd_manager *mm, void *mqd) { struct cik_mqd *m = (struct cik_mqd *)mqd; - return m->queue_doorbell_id0; + return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0); } static void update_mqd(struct mqd_manager *mm, void *mqd, @@ -423,7 +423,7 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd; #endif - mqd->read_doorbell_id = read_doorbell_id; + mqd->check_preemption_failed = check_preemption_failed; break; case KFD_MQD_TYPE_DIQ: mqd->allocate_mqd = allocate_mqd; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c index 22cbfa1bdadd..1695dd78ede8 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c @@ -107,6 +107,8 @@ static void init_mqd(struct mqd_manager *mm, void **mqd, m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK | 0x53 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT; + m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; + m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__UNORD_DISPATCH_MASK; m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT; m->cp_mqd_base_addr_lo = 
lower_32_bits(addr); @@ -167,10 +169,10 @@ static void update_mqd(struct mqd_manager *mm, void *mqd, m = get_mqd(mqd); - m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; + m->cp_hqd_pq_control &= ~CP_HQD_PQ_CONTROL__QUEUE_SIZE_MASK; m->cp_hqd_pq_control |= ffs(q->queue_size / sizeof(unsigned int)) - 1 - 1; - m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__UNORD_DISPATCH_MASK; + pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); @@ -224,11 +226,11 @@ static void update_mqd(struct mqd_manager *mm, void *mqd, q->is_active = QUEUE_IS_ACTIVE(*q); } -static uint32_t read_doorbell_id(void *mqd) +static bool check_preemption_failed(struct mqd_manager *mm, void *mqd) { struct v10_compute_mqd *m = (struct v10_compute_mqd *)mqd; - return m->queue_doorbell_id0; + return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0); } static int get_wave_state(struct mqd_manager *mm, void *mqd, @@ -488,7 +490,7 @@ struct mqd_manager *mqd_manager_init_v10(enum KFD_MQD_TYPE type, #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd; #endif - mqd->read_doorbell_id = read_doorbell_id; + mqd->check_preemption_failed = check_preemption_failed; pr_debug("%s@%i\n", __func__, __LINE__); break; case KFD_MQD_TYPE_DIQ: diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c index 826bc4f6c8a7..3c0ae28c5923 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c @@ -154,6 +154,8 @@ static void init_mqd(struct mqd_manager *mm, void **mqd, m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK | 0x55 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT; + m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; + m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__UNORD_DISPATCH_MASK; m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT; m->cp_mqd_base_addr_lo = lower_32_bits(addr); @@ -221,10 +223,9 @@ static void update_mqd(struct mqd_manager *mm, void *mqd, m = get_mqd(mqd); - m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; + m->cp_hqd_pq_control &= ~CP_HQD_PQ_CONTROL__QUEUE_SIZE_MASK; m->cp_hqd_pq_control |= ffs(q->queue_size / sizeof(unsigned int)) - 1 - 1; - m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__UNORD_DISPATCH_MASK; pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); @@ -278,11 +279,11 @@ static void update_mqd(struct mqd_manager *mm, void *mqd, q->is_active = QUEUE_IS_ACTIVE(*q); } -static uint32_t read_doorbell_id(void *mqd) +static bool check_preemption_failed(struct mqd_manager *mm, void *mqd) { struct v11_compute_mqd *m = (struct v11_compute_mqd *)mqd; - return m->queue_doorbell_id0; + return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0); } static int get_wave_state(struct mqd_manager *mm, void *mqd, @@ -517,7 +518,7 @@ struct mqd_manager *mqd_manager_init_v11(enum KFD_MQD_TYPE type, #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd; #endif - mqd->read_doorbell_id = read_doorbell_id; + mqd->check_preemption_failed = check_preemption_failed; pr_debug("%s@%i\n", __func__, __LINE__); break; case KFD_MQD_TYPE_DIQ: diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12.c new file mode 100644 index 000000000000..565858b9044d --- /dev/null +++ 
b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12.c @@ -0,0 +1,459 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT +/* + * Copyright 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include <linux/printk.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include "kfd_priv.h" +#include "kfd_mqd_manager.h" +#include "v12_structs.h" +#include "gc/gc_12_0_0_sh_mask.h" +#include "amdgpu_amdkfd.h" + +static inline struct v12_compute_mqd *get_mqd(void *mqd) +{ + return (struct v12_compute_mqd *)mqd; +} + +static inline struct v12_sdma_mqd *get_sdma_mqd(void *mqd) +{ + return (struct v12_sdma_mqd *)mqd; +} + +static void update_cu_mask(struct mqd_manager *mm, void *mqd, + struct mqd_update_info *minfo) +{ + struct v12_compute_mqd *m; + uint32_t se_mask[KFD_MAX_NUM_SE] = {0}; + + if (!minfo || !minfo->cu_mask.ptr) + return; + + mqd_symmetrically_map_cu_mask(mm, + minfo->cu_mask.ptr, minfo->cu_mask.count, se_mask, 0); + + m = get_mqd(mqd); + m->compute_static_thread_mgmt_se0 = se_mask[0]; + m->compute_static_thread_mgmt_se1 = se_mask[1]; + m->compute_static_thread_mgmt_se2 = se_mask[2]; + m->compute_static_thread_mgmt_se3 = se_mask[3]; + m->compute_static_thread_mgmt_se4 = se_mask[4]; + m->compute_static_thread_mgmt_se5 = se_mask[5]; + m->compute_static_thread_mgmt_se6 = se_mask[6]; + m->compute_static_thread_mgmt_se7 = se_mask[7]; + + pr_debug("update cu mask to %#x %#x %#x %#x %#x %#x %#x %#x\n", + m->compute_static_thread_mgmt_se0, + m->compute_static_thread_mgmt_se1, + m->compute_static_thread_mgmt_se2, + m->compute_static_thread_mgmt_se3, + m->compute_static_thread_mgmt_se4, + m->compute_static_thread_mgmt_se5, + m->compute_static_thread_mgmt_se6, + m->compute_static_thread_mgmt_se7); +} + +static void set_priority(struct v12_compute_mqd *m, struct queue_properties *q) +{ + m->cp_hqd_pipe_priority = pipe_priority_map[q->priority]; + m->cp_hqd_queue_priority = q->priority; +} + +static struct kfd_mem_obj *allocate_mqd(struct kfd_node *node, + struct queue_properties *q) +{ + struct kfd_mem_obj *mqd_mem_obj; + + /* + * Allocate one PAGE_SIZE memory for MQD as MES writes to areas beyond + * struct MQD size. 
+ */ + if (kfd_gtt_sa_allocate(node, PAGE_SIZE, &mqd_mem_obj)) + return NULL; + + return mqd_mem_obj; +} + +static void init_mqd(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + uint64_t addr; + struct v12_compute_mqd *m; + + m = (struct v12_compute_mqd *) mqd_mem_obj->cpu_ptr; + addr = mqd_mem_obj->gpu_addr; + + memset(m, 0, PAGE_SIZE); + + m->header = 0xC0310800; + m->compute_pipelinestat_enable = 1; + m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se4 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se5 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se6 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se7 = 0xFFFFFFFF; + + m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK | + 0x55 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT; + + m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; + m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__UNORD_DISPATCH_MASK; + m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT; + + m->cp_mqd_base_addr_lo = lower_32_bits(addr); + m->cp_mqd_base_addr_hi = upper_32_bits(addr); + + m->cp_hqd_quantum = 1 << CP_HQD_QUANTUM__QUANTUM_EN__SHIFT | + 1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT | + 1 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT; + + /* Set cp_hqd_hq_status0.c_queue_debug_en to 1 to have the CP set up the + * DISPATCH_PTR. This is required for the kfd debugger + */ + m->cp_hqd_hq_status0 = 1 << 14; + + if (amdgpu_amdkfd_have_atomics_support(mm->dev->adev)) + m->cp_hqd_hq_status0 |= 1 << 29; + + if (q->format == KFD_QUEUE_FORMAT_AQL) { + m->cp_hqd_aql_control = + 1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT; + } + + if (mm->dev->kfd->cwsr_enabled) { + m->cp_hqd_persistent_state |= + (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT); + m->cp_hqd_ctx_save_base_addr_lo = + lower_32_bits(q->ctx_save_restore_area_address); + m->cp_hqd_ctx_save_base_addr_hi = + upper_32_bits(q->ctx_save_restore_area_address); + m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size; + m->cp_hqd_cntl_stack_size = q->ctl_stack_size; + m->cp_hqd_cntl_stack_offset = q->ctl_stack_size; + m->cp_hqd_wg_state_offset = q->ctl_stack_size; + } + + *mqd = m; + if (gart_addr) + *gart_addr = addr; + mm->update_mqd(mm, m, q, NULL); +} + +static int load_mqd(struct mqd_manager *mm, void *mqd, + uint32_t pipe_id, uint32_t queue_id, + struct queue_properties *p, struct mm_struct *mms) +{ + int r = 0; + /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */ + uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 
4 : 0); + + r = mm->dev->kfd2kgd->hqd_load(mm->dev->adev, mqd, pipe_id, queue_id, + (uint32_t __user *)p->write_ptr, + wptr_shift, 0, mms, 0); + return r; +} + +static void update_mqd(struct mqd_manager *mm, void *mqd, + struct queue_properties *q, + struct mqd_update_info *minfo) +{ + struct v12_compute_mqd *m; + + m = get_mqd(mqd); + + m->cp_hqd_pq_control &= ~CP_HQD_PQ_CONTROL__QUEUE_SIZE_MASK; + m->cp_hqd_pq_control |= + ffs(q->queue_size / sizeof(unsigned int)) - 1 - 1; + pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); + + m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); + m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); + + m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr); + m->cp_hqd_pq_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr); + + m->cp_hqd_pq_doorbell_control = + q->doorbell_off << + CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT; + pr_debug("cp_hqd_pq_doorbell_control 0x%x\n", + m->cp_hqd_pq_doorbell_control); + + m->cp_hqd_ib_control = 3 << CP_HQD_IB_CONTROL__MIN_IB_AVAIL_SIZE__SHIFT; + + /* + * HW does not clamp this field correctly. Maximum EOP queue size + * is constrained by per-SE EOP done signal count, which is 8-bit. + * Limit is 0xFF EOP entries (= 0x7F8 dwords). CP will not submit + * more than (EOP entry count - 1) so a queue size of 0x800 dwords + * is safe, giving a maximum field value of 0xA. + */ + m->cp_hqd_eop_control = min(0xA, + ffs(q->eop_ring_buffer_size / sizeof(unsigned int)) - 1 - 1); + m->cp_hqd_eop_base_addr_lo = + lower_32_bits(q->eop_ring_buffer_address >> 8); + m->cp_hqd_eop_base_addr_hi = + upper_32_bits(q->eop_ring_buffer_address >> 8); + + m->cp_hqd_iq_timer = 0; + + m->cp_hqd_vmid = q->vmid; + + if (q->format == KFD_QUEUE_FORMAT_AQL) { + /* GC 10 removed WPP_CLAMP from PQ Control */ + m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK | + 2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT | + 1 << CP_HQD_PQ_CONTROL__QUEUE_FULL_EN__SHIFT; + m->cp_hqd_pq_doorbell_control |= + 1 << CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_BIF_DROP__SHIFT; + } + if (mm->dev->kfd->cwsr_enabled) + m->cp_hqd_ctx_save_control = 0; + + update_cu_mask(mm, mqd, minfo); + set_priority(m, q); + + q->is_active = QUEUE_IS_ACTIVE(*q); +} + +static bool check_preemption_failed(struct mqd_manager *mm, void *mqd) +{ + struct v12_compute_mqd *m = (struct v12_compute_mqd *)mqd; + + return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0); +} + +static int get_wave_state(struct mqd_manager *mm, void *mqd, + struct queue_properties *q, + void __user *ctl_stack, + u32 *ctl_stack_used_size, + u32 *save_area_used_size) +{ + struct v12_compute_mqd *m; + struct mqd_user_context_save_area_header header; + + m = get_mqd(mqd); + + /* Control stack is written backwards, while workgroup context data + * is written forwards. Both starts from m->cp_hqd_cntl_stack_size. + * Current position is at m->cp_hqd_cntl_stack_offset and + * m->cp_hqd_wg_state_offset, respectively. 
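The two counters in get_wave_state() grow toward each other from cp_hqd_cntl_stack_size: the control stack is consumed downward to cp_hqd_cntl_stack_offset, the workgroup state upward to cp_hqd_wg_state_offset. A worked check of that layout with made-up values (userspace C, hypothetical struct):

#include <assert.h>

struct demo_hqd {
        unsigned int cntl_stack_size;   /* shared base of both regions */
        unsigned int cntl_stack_offset; /* moves down as the stack fills */
        unsigned int wg_state_offset;   /* moves up as wave state is saved */
};

int main(void)
{
        struct demo_hqd m = {
                .cntl_stack_size = 0x2000,      /* 8 KiB control stack */
                .cntl_stack_offset = 0x1a00,    /* 0x600 bytes used so far */
                .wg_state_offset = 0x3400,      /* 0x1400 bytes of wave state */
        };

        assert(m.cntl_stack_size - m.cntl_stack_offset == 0x600);
        assert(m.wg_state_offset - m.cntl_stack_size == 0x1400);
        return 0;
}
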
+ */ + *ctl_stack_used_size = m->cp_hqd_cntl_stack_size - + m->cp_hqd_cntl_stack_offset; + *save_area_used_size = m->cp_hqd_wg_state_offset - + m->cp_hqd_cntl_stack_size; + + /* Control stack is not copied to user mode for GFXv12 because + * it's part of the context save area that is already + * accessible to user mode + */ + header.control_stack_size = *ctl_stack_used_size; + header.wave_state_size = *save_area_used_size; + + header.wave_state_offset = m->cp_hqd_wg_state_offset; + header.control_stack_offset = m->cp_hqd_cntl_stack_offset; + + if (copy_to_user(ctl_stack, &header, sizeof(header))) + return -EFAULT; + + return 0; +} + +static void init_mqd_hiq(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + struct v12_compute_mqd *m; + + init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q); + + m = get_mqd(*mqd); + + m->cp_hqd_pq_control |= 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT | + 1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT; +} + +static void init_mqd_sdma(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + struct v12_sdma_mqd *m; + + m = (struct v12_sdma_mqd *) mqd_mem_obj->cpu_ptr; + + memset(m, 0, sizeof(struct v12_sdma_mqd)); + + *mqd = m; + if (gart_addr) + *gart_addr = mqd_mem_obj->gpu_addr; + + mm->update_mqd(mm, m, q, NULL); +} + +#define SDMA_RLC_DUMMY_DEFAULT 0xf + +static void update_mqd_sdma(struct mqd_manager *mm, void *mqd, + struct queue_properties *q, + struct mqd_update_info *minfo) +{ + struct v12_sdma_mqd *m; + + m = get_sdma_mqd(mqd); + m->sdmax_rlcx_rb_cntl = (ffs(q->queue_size / sizeof(unsigned int)) - 1) + << SDMA0_QUEUE0_RB_CNTL__RB_SIZE__SHIFT | + q->vmid << SDMA0_QUEUE0_RB_CNTL__RB_VMID__SHIFT | + 1 << SDMA0_QUEUE0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | + 6 << SDMA0_QUEUE0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT | + 1 << SDMA0_QUEUE0_RB_CNTL__MCU_WPTR_POLL_ENABLE__SHIFT; + + m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8); + m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8); + m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr); + m->sdmax_rlcx_rb_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr); + m->sdmax_rlcx_rb_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr); + m->sdmax_rlcx_doorbell_offset = + q->doorbell_off << SDMA0_QUEUE0_DOORBELL_OFFSET__OFFSET__SHIFT; + + m->sdmax_rlcx_sched_cntl = (amdgpu_sdma_phase_quantum + << SDMA0_QUEUE0_SCHEDULE_CNTL__CONTEXT_QUANTUM__SHIFT) + & SDMA0_QUEUE0_SCHEDULE_CNTL__CONTEXT_QUANTUM_MASK; + + m->sdma_engine_id = q->sdma_engine_id; + m->sdma_queue_id = q->sdma_queue_id; + + m->sdmax_rlcx_dummy_reg = SDMA_RLC_DUMMY_DEFAULT; + + q->is_active = QUEUE_IS_ACTIVE(*q); +} + +#if defined(CONFIG_DEBUG_FS) + +static int debugfs_show_mqd(struct seq_file *m, void *data) +{ + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + data, sizeof(struct v12_compute_mqd), false); + return 0; +} + +static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) +{ + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + data, sizeof(struct v12_sdma_mqd), false); + return 0; +} + +#endif + +struct mqd_manager *mqd_manager_init_v12(enum KFD_MQD_TYPE type, + struct kfd_node *dev) +{ + struct mqd_manager *mqd; + + if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) + return NULL; + + mqd = kzalloc(sizeof(*mqd), GFP_KERNEL); + if (!mqd) + return NULL; + + mqd->dev = dev; + + switch (type) { + case 
KFD_MQD_TYPE_CP: + pr_debug("%s@%i\n", __func__, __LINE__); + mqd->allocate_mqd = allocate_mqd; + mqd->init_mqd = init_mqd; + mqd->free_mqd = kfd_free_mqd_cp; + mqd->load_mqd = load_mqd; + mqd->update_mqd = update_mqd; + mqd->destroy_mqd = kfd_destroy_mqd_cp; + mqd->is_occupied = kfd_is_occupied_cp; + mqd->mqd_size = sizeof(struct v12_compute_mqd); + mqd->get_wave_state = get_wave_state; + mqd->mqd_stride = kfd_mqd_stride; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif + pr_debug("%s@%i\n", __func__, __LINE__); + break; + case KFD_MQD_TYPE_HIQ: + pr_debug("%s@%i\n", __func__, __LINE__); + mqd->allocate_mqd = allocate_hiq_mqd; + mqd->init_mqd = init_mqd_hiq; + mqd->free_mqd = free_mqd_hiq_sdma; + mqd->load_mqd = kfd_hiq_load_mqd_kiq; + mqd->update_mqd = update_mqd; + mqd->destroy_mqd = kfd_destroy_mqd_cp; + mqd->is_occupied = kfd_is_occupied_cp; + mqd->mqd_size = sizeof(struct v12_compute_mqd); + mqd->mqd_stride = kfd_mqd_stride; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif + mqd->check_preemption_failed = check_preemption_failed; + pr_debug("%s@%i\n", __func__, __LINE__); + break; + case KFD_MQD_TYPE_DIQ: + mqd->allocate_mqd = allocate_mqd; + mqd->init_mqd = init_mqd_hiq; + mqd->free_mqd = kfd_free_mqd_cp; + mqd->load_mqd = load_mqd; + mqd->update_mqd = update_mqd; + mqd->destroy_mqd = kfd_destroy_mqd_cp; + mqd->is_occupied = kfd_is_occupied_cp; + mqd->mqd_size = sizeof(struct v12_compute_mqd); +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif + break; + case KFD_MQD_TYPE_SDMA: + pr_debug("%s@%i\n", __func__, __LINE__); + mqd->allocate_mqd = allocate_mqd; + mqd->init_mqd = init_mqd_sdma; + mqd->free_mqd = kfd_free_mqd_cp; + mqd->load_mqd = kfd_load_mqd_sdma; + mqd->update_mqd = update_mqd_sdma; + mqd->destroy_mqd = kfd_destroy_mqd_sdma; + mqd->is_occupied = kfd_is_occupied_sdma; + mqd->mqd_size = sizeof(struct v12_sdma_mqd); + mqd->mqd_stride = kfd_mqd_stride; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; +#endif + pr_debug("%s@%i\n", __func__, __LINE__); + break; + default: + kfree(mqd); + return NULL; + } + + return mqd; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c index 697b6d530d12..97933d2a3803 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c @@ -77,7 +77,9 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd, m->compute_static_thread_mgmt_se1 = se_mask[1]; m->compute_static_thread_mgmt_se2 = se_mask[2]; m->compute_static_thread_mgmt_se3 = se_mask[3]; - if (KFD_GC_VERSION(mm->dev) != IP_VERSION(9, 4, 3)) { + if (KFD_GC_VERSION(mm->dev) != IP_VERSION(9, 4, 3) && + KFD_GC_VERSION(mm->dev) != IP_VERSION(9, 4, 4) && + KFD_GC_VERSION(mm->dev) != IP_VERSION(9, 5, 0)) { m->compute_static_thread_mgmt_se4 = se_mask[4]; m->compute_static_thread_mgmt_se5 = se_mask[5]; m->compute_static_thread_mgmt_se6 = se_mask[6]; @@ -181,6 +183,9 @@ static void init_mqd(struct mqd_manager *mm, void **mqd, m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK | 0x53 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT; + m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; + m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__UNORD_DISPATCH_MASK; + m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT; m->cp_mqd_base_addr_lo = lower_32_bits(addr); @@ -243,7 +248,7 @@ static void update_mqd(struct mqd_manager 
*mm, void *mqd, m = get_mqd(mqd); - m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; + m->cp_hqd_pq_control &= ~CP_HQD_PQ_CONTROL__QUEUE_SIZE_MASK; m->cp_hqd_pq_control |= order_base_2(q->queue_size / 4) - 1; pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); @@ -299,7 +304,9 @@ static void update_mqd(struct mqd_manager *mm, void *mqd, if (mm->dev->kfd->cwsr_enabled && q->ctx_save_restore_area_address) m->cp_hqd_ctx_save_control = 0; - if (KFD_GC_VERSION(mm->dev) != IP_VERSION(9, 4, 3)) + if (KFD_GC_VERSION(mm->dev) != IP_VERSION(9, 4, 3) && + KFD_GC_VERSION(mm->dev) != IP_VERSION(9, 4, 4) && + KFD_GC_VERSION(mm->dev) != IP_VERSION(9, 5, 0)) update_cu_mask(mm, mqd, minfo, 0); set_priority(m, q); @@ -316,11 +323,14 @@ static void update_mqd(struct mqd_manager *mm, void *mqd, } -static uint32_t read_doorbell_id(void *mqd) +static bool check_preemption_failed(struct mqd_manager *mm, void *mqd) { struct v9_mqd *m = (struct v9_mqd *)mqd; + uint32_t doorbell_id = m->queue_doorbell_id0; + + m->queue_doorbell_id0 = 0; - return m->queue_doorbell_id0; + return kfd_check_hiq_mqd_doorbell_id(mm->dev, doorbell_id, 0); } static int get_wave_state(struct mqd_manager *mm, void *mqd, @@ -485,6 +495,10 @@ static void update_mqd_sdma(struct mqd_manager *mm, void *mqd, m->sdma_engine_id = q->sdma_engine_id; m->sdma_queue_id = q->sdma_queue_id; m->sdmax_rlcx_dummy_reg = SDMA_RLC_DUMMY_DEFAULT; + /* Allow context switch so we don't cross-process starve with a massive + * command buffer of long-running SDMA commands + */ + m->sdmax_rlcx_ib_cntl |= SDMA0_GFX_IB_CNTL__SWITCH_INSIDE_IB_MASK; q->is_active = QUEUE_IS_ACTIVE(*q); } @@ -544,6 +558,9 @@ static void init_mqd_hiq_v9_4_3(struct mqd_manager *mm, void **mqd, m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK | 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT | 1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT; + if (amdgpu_sriov_multi_vf_mode(mm->dev->adev)) + m->cp_hqd_pq_doorbell_control |= 1 << + CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_MODE__SHIFT; m->cp_mqd_stride_size = kfd_hiq_mqd_stride(mm->dev); if (xcc == 0) { /* Set no_update_rptr = 0 in Master XCC */ @@ -607,6 +624,25 @@ static int destroy_hiq_mqd_v9_4_3(struct mqd_manager *mm, void *mqd, return err; } +static bool check_preemption_failed_v9_4_3(struct mqd_manager *mm, void *mqd) +{ + uint64_t hiq_mqd_size = kfd_hiq_mqd_stride(mm->dev); + uint32_t xcc_mask = mm->dev->xcc_mask; + int inst = 0, xcc_id; + struct v9_mqd *m; + bool ret = false; + + for_each_inst(xcc_id, xcc_mask) { + m = get_mqd(mqd + hiq_mqd_size * inst); + ret |= kfd_check_hiq_mqd_doorbell_id(mm->dev, + m->queue_doorbell_id0, inst); + m->queue_doorbell_id0 = 0; + ++inst; + } + + return ret; +} + static void get_xcc_mqd(struct kfd_mem_obj *mqd_mem_obj, struct kfd_mem_obj *xcc_mqd_mem_obj, uint64_t offset) @@ -635,7 +671,9 @@ static void init_mqd_v9_4_3(struct mqd_manager *mm, void **mqd, get_xcc_mqd(mqd_mem_obj, &xcc_mqd_mem_obj, offset*xcc); init_mqd(mm, (void **)&m, &xcc_mqd_mem_obj, &xcc_gart_addr, q); - + if (amdgpu_sriov_multi_vf_mode(mm->dev->adev)) + m->cp_hqd_pq_doorbell_control |= 1 << + CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_MODE__SHIFT; m->cp_mqd_stride_size = offset; /* @@ -695,7 +733,10 @@ static void update_mqd_v9_4_3(struct mqd_manager *mm, void *mqd, m = get_mqd(mqd + size * xcc); update_mqd(mm, m, q, minfo); - update_cu_mask(mm, mqd, minfo, xcc); + if (amdgpu_sriov_multi_vf_mode(mm->dev->adev)) + m->cp_hqd_pq_doorbell_control |= 1 << + CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_MODE__SHIFT; + 
update_cu_mask(mm, m, minfo, xcc); if (q->format == KFD_QUEUE_FORMAT_AQL) { switch (xcc) { @@ -717,6 +758,21 @@ static void update_mqd_v9_4_3(struct mqd_manager *mm, void *mqd, } } +static void restore_mqd_v9_4_3(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *qp, + const void *mqd_src, + const void *ctl_stack_src, u32 ctl_stack_size) +{ + restore_mqd(mm, mqd, mqd_mem_obj, gart_addr, qp, mqd_src, ctl_stack_src, ctl_stack_size); + if (amdgpu_sriov_multi_vf_mode(mm->dev->adev)) { + struct v9_mqd *m; + + m = (struct v9_mqd *) mqd_mem_obj->cpu_ptr; + m->cp_hqd_pq_doorbell_control |= 1 << + CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_MODE__SHIFT; + } +} static int destroy_mqd_v9_4_3(struct mqd_manager *mm, void *mqd, enum kfd_preempt_type type, unsigned int timeout, uint32_t pipe_id, uint32_t queue_id) @@ -851,22 +907,25 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type, mqd->is_occupied = kfd_is_occupied_cp; mqd->get_checkpoint_info = get_checkpoint_info; mqd->checkpoint_mqd = checkpoint_mqd; - mqd->restore_mqd = restore_mqd; mqd->mqd_size = sizeof(struct v9_mqd); mqd->mqd_stride = mqd_stride_v9; #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd; #endif - if (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 3)) { + if (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 3) || + KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 4) || + KFD_GC_VERSION(dev) == IP_VERSION(9, 5, 0)) { mqd->init_mqd = init_mqd_v9_4_3; mqd->load_mqd = load_mqd_v9_4_3; mqd->update_mqd = update_mqd_v9_4_3; + mqd->restore_mqd = restore_mqd_v9_4_3; mqd->destroy_mqd = destroy_mqd_v9_4_3; mqd->get_wave_state = get_wave_state_v9_4_3; } else { mqd->init_mqd = init_mqd; mqd->load_mqd = load_mqd; mqd->update_mqd = update_mqd; + mqd->restore_mqd = restore_mqd; mqd->destroy_mqd = kfd_destroy_mqd_cp; mqd->get_wave_state = get_wave_state; } @@ -881,15 +940,19 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type, #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd; #endif - mqd->read_doorbell_id = read_doorbell_id; - if (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 3)) { + mqd->check_preemption_failed = check_preemption_failed; + if (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 3) || + KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 4) || + KFD_GC_VERSION(dev) == IP_VERSION(9, 5, 0)) { mqd->init_mqd = init_mqd_hiq_v9_4_3; mqd->load_mqd = hiq_load_mqd_kiq_v9_4_3; mqd->destroy_mqd = destroy_hiq_mqd_v9_4_3; + mqd->check_preemption_failed = check_preemption_failed_v9_4_3; } else { mqd->init_mqd = init_mqd_hiq; mqd->load_mqd = kfd_hiq_load_mqd_kiq; mqd->destroy_mqd = destroy_hiq_mqd; + mqd->check_preemption_failed = check_preemption_failed; } break; case KFD_MQD_TYPE_DIQ: diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c index 3e1a574d4ea6..c1fafc502515 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c @@ -237,11 +237,11 @@ static void __update_mqd(struct mqd_manager *mm, void *mqd, q->is_active = QUEUE_IS_ACTIVE(*q); } -static uint32_t read_doorbell_id(void *mqd) +static bool check_preemption_failed(struct mqd_manager *mm, void *mqd) { struct vi_mqd *m = (struct vi_mqd *)mqd; - return m->queue_doorbell_id0; + return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0); } static void update_mqd(struct mqd_manager *mm, void *mqd, @@ -482,7 +482,7 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, #if 
defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd; #endif - mqd->read_doorbell_id = read_doorbell_id; + mqd->check_preemption_failed = check_preemption_failed; break; case KFD_MQD_TYPE_DIQ: mqd->allocate_mqd = allocate_mqd; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c index d6f65f39072b..b1a6eb349bb3 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c @@ -28,6 +28,11 @@ #include "kfd_kernel_queue.h" #include "kfd_priv.h" +#define OVER_SUBSCRIPTION_PROCESS_COUNT (1 << 0) +#define OVER_SUBSCRIPTION_COMPUTE_QUEUE_COUNT (1 << 1) +#define OVER_SUBSCRIPTION_GWS_QUEUE_COUNT (1 << 2) +#define OVER_SUBSCRIPTION_XNACK_CONFLICT (1 << 3) + static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes, unsigned int buffer_size_bytes) { @@ -40,12 +45,14 @@ static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes, static void pm_calc_rlib_size(struct packet_manager *pm, unsigned int *rlib_size, - bool *over_subscription) + int *over_subscription, + int xnack_conflict) { unsigned int process_count, queue_count, compute_queue_count, gws_queue_count; unsigned int map_queue_size; unsigned int max_proc_per_quantum = 1; - struct kfd_node *dev = pm->dqm->dev; + struct kfd_node *node = pm->dqm->dev; + struct device *dev = node->adev->dev; process_count = pm->dqm->processes_count; queue_count = pm->dqm->active_queue_count; @@ -57,17 +64,22 @@ static void pm_calc_rlib_size(struct packet_manager *pm, * hws_max_conc_proc has been done in * kgd2kfd_device_init(). */ - *over_subscription = false; + *over_subscription = 0; - if (dev->max_proc_per_quantum > 1) - max_proc_per_quantum = dev->max_proc_per_quantum; + if (node->max_proc_per_quantum > 1) + max_proc_per_quantum = node->max_proc_per_quantum; - if ((process_count > max_proc_per_quantum) || - compute_queue_count > get_cp_queues_num(pm->dqm) || - gws_queue_count > 1) { - *over_subscription = true; - pr_debug("Over subscribed runlist\n"); - } + if (process_count > max_proc_per_quantum) + *over_subscription |= OVER_SUBSCRIPTION_PROCESS_COUNT; + if (compute_queue_count > get_cp_queues_num(pm->dqm)) + *over_subscription |= OVER_SUBSCRIPTION_COMPUTE_QUEUE_COUNT; + if (gws_queue_count > 1) + *over_subscription |= OVER_SUBSCRIPTION_GWS_QUEUE_COUNT; + if (xnack_conflict && (node->adev->gmc.xnack_flags & AMDGPU_GMC_XNACK_FLAG_CHAIN)) + *over_subscription |= OVER_SUBSCRIPTION_XNACK_CONFLICT; + + if (*over_subscription) + dev_dbg(dev, "Over subscribed runlist\n"); map_queue_size = pm->pmf->map_queues_size; /* calculate run list ib allocation size */ @@ -81,29 +93,32 @@ static void pm_calc_rlib_size(struct packet_manager *pm, if (*over_subscription) *rlib_size += pm->pmf->runlist_size; - pr_debug("runlist ib size %d\n", *rlib_size); + dev_dbg(dev, "runlist ib size %d\n", *rlib_size); } static int pm_allocate_runlist_ib(struct packet_manager *pm, unsigned int **rl_buffer, uint64_t *rl_gpu_buffer, unsigned int *rl_buffer_size, - bool *is_over_subscription) + int *is_over_subscription, + int xnack_conflict) { + struct kfd_node *node = pm->dqm->dev; + struct device *dev = node->adev->dev; int retval; if (WARN_ON(pm->allocated)) return -EINVAL; - pm_calc_rlib_size(pm, rl_buffer_size, is_over_subscription); + pm_calc_rlib_size(pm, rl_buffer_size, is_over_subscription, + xnack_conflict); mutex_lock(&pm->lock); - retval = kfd_gtt_sa_allocate(pm->dqm->dev, *rl_buffer_size, - &pm->ib_buffer_obj); + retval = 
kfd_gtt_sa_allocate(node, *rl_buffer_size, &pm->ib_buffer_obj); if (retval) { - pr_err("Failed to allocate runlist IB\n"); + dev_err(dev, "Failed to allocate runlist IB\n"); goto out; } @@ -125,32 +140,54 @@ static int pm_create_runlist_ib(struct packet_manager *pm, { unsigned int alloc_size_bytes; unsigned int *rl_buffer, rl_wptr, i; + struct kfd_node *node = pm->dqm->dev; + struct device *dev = node->adev->dev; int retval, processes_mapped; struct device_process_node *cur; struct qcm_process_device *qpd; struct queue *q; struct kernel_queue *kq; - bool is_over_subscription; + int is_over_subscription; + int xnack_enabled = -1; + bool xnack_conflict = 0; rl_wptr = retval = processes_mapped = 0; + /* Check if processes set different xnack modes */ + list_for_each_entry(cur, queues, list) { + qpd = cur->qpd; + if (xnack_enabled < 0) + /* First process */ + xnack_enabled = qpd->pqm->process->xnack_enabled; + else if (qpd->pqm->process->xnack_enabled != xnack_enabled) { + /* Found a process with a different xnack mode */ + xnack_conflict = 1; + break; + } + } + retval = pm_allocate_runlist_ib(pm, &rl_buffer, rl_gpu_addr, - &alloc_size_bytes, &is_over_subscription); + &alloc_size_bytes, &is_over_subscription, + xnack_conflict); if (retval) return retval; *rl_size_bytes = alloc_size_bytes; pm->ib_size_bytes = alloc_size_bytes; - pr_debug("Building runlist ib process count: %d queues count %d\n", + dev_dbg(dev, "Building runlist ib process count: %d queues count %d\n", pm->dqm->processes_count, pm->dqm->active_queue_count); +build_runlist_ib: /* build the run list ib packet */ list_for_each_entry(cur, queues, list) { qpd = cur->qpd; + /* group processes with the same xnack mode together */ + if (qpd->pqm->process->xnack_enabled != xnack_enabled) + continue; /* build map process packet */ if (processes_mapped >= pm->dqm->processes_count) { - pr_debug("Not enough space left in runlist IB\n"); + dev_dbg(dev, "Not enough space left in runlist IB\n"); pm_release_ib(pm); return -ENOMEM; } @@ -167,7 +204,8 @@ static int pm_create_runlist_ib(struct packet_manager *pm, if (!kq->queue->properties.is_active) continue; - pr_debug("static_queue, mapping kernel q %d, is debug status %d\n", + dev_dbg(dev, + "static_queue, mapping kernel q %d, is debug status %d\n", kq->queue->queue, qpd->is_debug); retval = pm->pmf->map_queues(pm, @@ -186,7 +224,8 @@ static int pm_create_runlist_ib(struct packet_manager *pm, if (!q->properties.is_active) continue; - pr_debug("static_queue, mapping user queue %d, is debug status %d\n", + dev_dbg(dev, + "static_queue, mapping user queue %d, is debug status %d\n", q->queue, qpd->is_debug); retval = pm->pmf->map_queues(pm, @@ -202,18 +241,33 @@ static int pm_create_runlist_ib(struct packet_manager *pm, alloc_size_bytes); } } + if (xnack_conflict) { + /* pick up processes with the other xnack mode */ + xnack_enabled = !xnack_enabled; + xnack_conflict = 0; + goto build_runlist_ib; + } - pr_debug("Finished map process and queues to runlist\n"); + dev_dbg(dev, "Finished map process and queues to runlist\n"); if (is_over_subscription) { if (!pm->is_over_subscription) - pr_warn("Runlist is getting oversubscribed. Expect reduced ROCm performance.\n"); + dev_warn(dev, "Runlist is getting oversubscribed due to%s%s%s%s. Expect reduced ROCm performance.\n", + is_over_subscription & OVER_SUBSCRIPTION_PROCESS_COUNT ? + " too many processes" : "", + is_over_subscription & OVER_SUBSCRIPTION_COMPUTE_QUEUE_COUNT ? 
+ " too many queues" : "", + is_over_subscription & OVER_SUBSCRIPTION_GWS_QUEUE_COUNT ? + " multiple processes using cooperative launch" : "", + is_over_subscription & OVER_SUBSCRIPTION_XNACK_CONFLICT ? + " xnack on/off processes mixed on gfx9" : ""); + retval = pm->pmf->runlist(pm, &rl_buffer[rl_wptr], *rl_gpu_addr, alloc_size_bytes / sizeof(uint32_t), true); } - pm->is_over_subscription = is_over_subscription; + pm->is_over_subscription = !!is_over_subscription; for (i = 0; i < alloc_size_bytes / sizeof(uint32_t); i++) pr_debug("0x%2X ", rl_buffer[i]); @@ -239,7 +293,9 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm) break; default: if (KFD_GC_VERSION(dqm->dev) == IP_VERSION(9, 4, 2) || - KFD_GC_VERSION(dqm->dev) == IP_VERSION(9, 4, 3)) + KFD_GC_VERSION(dqm->dev) == IP_VERSION(9, 4, 3) || + KFD_GC_VERSION(dqm->dev) == IP_VERSION(9, 4, 4) || + KFD_GC_VERSION(dqm->dev) == IP_VERSION(9, 5, 0)) pm->pmf = &kfd_aldebaran_pm_funcs; else if (KFD_GC_VERSION(dqm->dev) >= IP_VERSION(9, 0, 1)) pm->pmf = &kfd_v9_pm_funcs; @@ -262,16 +318,18 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm) return 0; } -void pm_uninit(struct packet_manager *pm, bool hanging) +void pm_uninit(struct packet_manager *pm) { mutex_destroy(&pm->lock); - kernel_queue_uninit(pm->priv_queue, hanging); + kernel_queue_uninit(pm->priv_queue); pm->priv_queue = NULL; } int pm_send_set_resources(struct packet_manager *pm, struct scheduling_resources *res) { + struct kfd_node *node = pm->dqm->dev; + struct device *dev = node->adev->dev; uint32_t *buffer, size; int retval = 0; @@ -281,7 +339,7 @@ int pm_send_set_resources(struct packet_manager *pm, size / sizeof(uint32_t), (unsigned int **)&buffer); if (!buffer) { - pr_err("Failed to allocate buffer on kernel queue\n"); + dev_err(dev, "Failed to allocate buffer on kernel queue\n"); retval = -ENOMEM; goto out; } @@ -343,6 +401,8 @@ fail_create_runlist_ib: int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address, uint64_t fence_value) { + struct kfd_node *node = pm->dqm->dev; + struct device *dev = node->adev->dev; uint32_t *buffer, size; int retval = 0; @@ -354,7 +414,7 @@ int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address, kq_acquire_packet_buffer(pm->priv_queue, size / sizeof(uint32_t), (unsigned int **)&buffer); if (!buffer) { - pr_err("Failed to allocate buffer on kernel queue\n"); + dev_err(dev, "Failed to allocate buffer on kernel queue\n"); retval = -ENOMEM; goto out; } @@ -370,12 +430,33 @@ out: return retval; } -int pm_update_grace_period(struct packet_manager *pm, uint32_t grace_period) +/* pm_config_dequeue_wait_counts: Configure dequeue timer Wait Counts + * by writing to CP_IQ_WAIT_TIME2 registers. + * + * @cmd: See emum kfd_config_dequeue_wait_counts_cmd definition + * @value: Depends on the cmd. This parameter is unused for + * KFD_DEQUEUE_WAIT_INIT and KFD_DEQUEUE_WAIT_RESET. 
For + * KFD_DEQUEUE_WAIT_SET_SCH_WAVE it holds value to be set + * + */ +int pm_config_dequeue_wait_counts(struct packet_manager *pm, + enum kfd_config_dequeue_wait_counts_cmd cmd, + uint32_t value) { + struct kfd_node *node = pm->dqm->dev; + struct device *dev = node->adev->dev; int retval = 0; uint32_t *buffer, size; - size = pm->pmf->set_grace_period_size; + if (!pm->pmf->config_dequeue_wait_counts || + !pm->pmf->config_dequeue_wait_counts_size) + return 0; + + if (cmd == KFD_DEQUEUE_WAIT_INIT && (KFD_GC_VERSION(pm->dqm->dev) < IP_VERSION(9, 4, 1) || + KFD_GC_VERSION(pm->dqm->dev) >= IP_VERSION(10, 0, 0))) + return 0; + + size = pm->pmf->config_dequeue_wait_counts_size; mutex_lock(&pm->lock); @@ -385,18 +466,24 @@ int pm_update_grace_period(struct packet_manager *pm, uint32_t grace_period) (unsigned int **)&buffer); if (!buffer) { - pr_err("Failed to allocate buffer on kernel queue\n"); + dev_err(dev, + "Failed to allocate buffer on kernel queue\n"); retval = -ENOMEM; goto out; } - retval = pm->pmf->set_grace_period(pm, buffer, grace_period); - if (!retval) + retval = pm->pmf->config_dequeue_wait_counts(pm, buffer, + cmd, value); + if (!retval) { retval = kq_submit_packet(pm->priv_queue); - else + + /* If default value is modified, cache that in dqm->wait_times */ + if (!retval && cmd == KFD_DEQUEUE_WAIT_INIT) + update_dqm_wait_times(pm->dqm); + } else { kq_rollback_packet(pm->priv_queue); + } } - out: mutex_unlock(&pm->lock); return retval; @@ -406,6 +493,8 @@ int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_unmap_queues_filter filter, uint32_t filter_param, bool reset) { + struct kfd_node *node = pm->dqm->dev; + struct device *dev = node->adev->dev; uint32_t *buffer, size; int retval = 0; @@ -414,7 +503,7 @@ int pm_send_unmap_queue(struct packet_manager *pm, kq_acquire_packet_buffer(pm->priv_queue, size / sizeof(uint32_t), (unsigned int **)&buffer); if (!buffer) { - pr_err("Failed to allocate buffer on kernel queue\n"); + dev_err(dev, "Failed to allocate buffer on kernel queue\n"); retval = -ENOMEM; goto out; } @@ -463,6 +552,8 @@ out: int pm_debugfs_hang_hws(struct packet_manager *pm) { + struct kfd_node *node = pm->dqm->dev; + struct device *dev = node->adev->dev; uint32_t *buffer, size; int r = 0; @@ -474,16 +565,16 @@ int pm_debugfs_hang_hws(struct packet_manager *pm) kq_acquire_packet_buffer(pm->priv_queue, size / sizeof(uint32_t), (unsigned int **)&buffer); if (!buffer) { - pr_err("Failed to allocate buffer on kernel queue\n"); + dev_err(dev, "Failed to allocate buffer on kernel queue\n"); r = -ENOMEM; goto out; } memset(buffer, 0x55, size); kq_submit_packet(pm->priv_queue); - pr_info("Submitting %x %x %x %x %x %x %x to HIQ to hang the HWS.", - buffer[0], buffer[1], buffer[2], buffer[3], - buffer[4], buffer[5], buffer[6]); + dev_info(dev, "Submitting %x %x %x %x %x %x %x to HIQ to hang the HWS.", + buffer[0], buffer[1], buffer[2], buffer[3], buffer[4], + buffer[5], buffer[6]); out: mutex_unlock(&pm->lock); return r; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c index 8ee2bedd301a..505036968a77 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c @@ -37,14 +37,17 @@ static int pm_map_process_v9(struct packet_manager *pm, struct kfd_node *kfd = pm->dqm->dev; struct kfd_process_device *pdd = container_of(qpd, struct kfd_process_device, qpd); + struct amdgpu_device *adev = kfd->adev; packet = (struct pm4_mes_map_process *)buffer; 
memset(buffer, 0, sizeof(struct pm4_mes_map_process)); packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS, sizeof(struct pm4_mes_map_process)); + if (adev->enforce_isolation[kfd->node_id] == AMDGPU_ENFORCE_ISOLATION_ENABLE) + packet->bitfields2.exec_cleaner_shader = 1; packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; packet->bitfields2.process_quantum = 10; - packet->bitfields2.pasid = qpd->pqm->process->pasid; + packet->bitfields2.pasid = pdd->pasid; packet->bitfields14.gds_size = qpd->gds_size & 0x3F; packet->bitfields14.gds_size_hi = (qpd->gds_size >> 6) & 0xF; packet->bitfields14.num_gws = (qpd->mapped_gws_queue) ? qpd->num_gws : 0; @@ -89,17 +92,22 @@ static int pm_map_process_aldebaran(struct packet_manager *pm, struct pm4_mes_map_process_aldebaran *packet; uint64_t vm_page_table_base_addr = qpd->page_table_base; struct kfd_dev *kfd = pm->dqm->dev->kfd; + struct kfd_node *knode = pm->dqm->dev; struct kfd_process_device *pdd = container_of(qpd, struct kfd_process_device, qpd); int i; + struct amdgpu_device *adev = kfd->adev; packet = (struct pm4_mes_map_process_aldebaran *)buffer; memset(buffer, 0, sizeof(struct pm4_mes_map_process_aldebaran)); packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS, sizeof(struct pm4_mes_map_process_aldebaran)); + if (adev->enforce_isolation[knode->node_id] == + AMDGPU_ENFORCE_ISOLATION_ENABLE) + packet->bitfields2.exec_cleaner_shader = 1; packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; packet->bitfields2.process_quantum = 10; - packet->bitfields2.pasid = qpd->pqm->process->pasid; + packet->bitfields2.pasid = pdd->pasid; packet->bitfields14.gds_size = qpd->gds_size & 0x3F; packet->bitfields14.gds_size_hi = (qpd->gds_size >> 6) & 0xF; packet->bitfields14.num_gws = (qpd->mapped_gws_queue) ? qpd->num_gws : 0; @@ -144,18 +152,23 @@ static int pm_runlist_v9(struct packet_manager *pm, uint32_t *buffer, int concurrent_proc_cnt = 0; struct kfd_node *kfd = pm->dqm->dev; + struct amdgpu_device *adev = kfd->adev; /* Determine the number of processes to map together to HW: * it can not exceed the number of VMIDs available to the * scheduler, and it is determined by the smaller of the number * of processes in the runlist and kfd module parameter * hws_max_conc_proc. + * However, if enforce_isolation is set (toggle LDS/VGPRs/SGPRs + * cleaner between process switch), enable single-process mode + * in HWS. * Note: the arbitration between the number of VMIDs and * hws_max_conc_proc has been done in * kgd2kfd_device_init(). */ - concurrent_proc_cnt = min(pm->dqm->processes_count, - kfd->max_proc_per_quantum); + concurrent_proc_cnt = (adev->enforce_isolation[kfd->node_id] == + AMDGPU_ENFORCE_ISOLATION_ENABLE) ? 
+ 1 : min(pm->dqm->processes_count, kfd->max_proc_per_quantum); packet = (struct pm4_mes_runlist *)buffer; @@ -190,6 +203,8 @@ static int pm_set_resources_v9(struct packet_manager *pm, uint32_t *buffer, queue_type__mes_set_resources__hsa_interface_queue_hiq; packet->bitfields2.vmid_mask = res->vmid_mask; packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY_MS / 100; + if (pm->dqm->dev->adev->gmc.xnack_flags & AMDGPU_GMC_XNACK_FLAG_CHAIN) + packet->bitfields2.enb_xnack_retry_disable_check = 1; packet->bitfields7.oac_mask = res->oac_mask; packet->bitfields8.gds_heap_base = res->gds_heap_base; packet->bitfields8.gds_heap_size = res->gds_heap_size; @@ -213,7 +228,6 @@ static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer, struct queue *q, bool is_static) { struct pm4_mes_map_queues *packet; - bool use_static = is_static; packet = (struct pm4_mes_map_queues *)buffer; memset(buffer, 0, sizeof(struct pm4_mes_map_queues)); @@ -226,7 +240,7 @@ static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer, packet->bitfields2.engine_sel = engine_sel__mes_map_queues__compute_vi; - packet->bitfields2.gws_control_queue = q->gws ? 1 : 0; + packet->bitfields2.gws_control_queue = q->properties.is_gws ? 1 : 0; packet->bitfields2.extended_engine_sel = extended_engine_sel__mes_map_queues__legacy_engine_sel; packet->bitfields2.queue_type = @@ -234,7 +248,7 @@ static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer, switch (q->properties.type) { case KFD_QUEUE_TYPE_COMPUTE: - if (use_static) + if (is_static) packet->bitfields2.queue_type = queue_type__mes_map_queues__normal_latency_static_queue_vi; break; @@ -244,7 +258,6 @@ static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer, break; case KFD_QUEUE_TYPE_SDMA: case KFD_QUEUE_TYPE_SDMA_XGMI: - use_static = false; /* no static queues under SDMA */ if (q->properties.sdma_engine_id < 2 && !pm_use_ext_eng(q->device->kfd)) packet->bitfields2.engine_sel = q->properties.sdma_engine_id + @@ -287,23 +300,79 @@ static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer, return 0; } -static int pm_set_grace_period_v9(struct packet_manager *pm, +static inline void pm_build_dequeue_wait_counts_packet_info(struct packet_manager *pm, + uint32_t sch_value, uint32_t que_sleep, uint32_t *reg_offset, + uint32_t *reg_data) +{ + pm->dqm->dev->kfd2kgd->build_dequeue_wait_counts_packet_info( + pm->dqm->dev->adev, + pm->dqm->wait_times, + sch_value, + que_sleep, + reg_offset, + reg_data); +} + +/* pm_config_dequeue_wait_counts_v9: Builds WRITE_DATA packet with + * register/value for configuring dequeue wait counts + * + * @return: -ve for failure and 0 for success and buffer is + * filled in with packet + * + **/ +static int pm_config_dequeue_wait_counts_v9(struct packet_manager *pm, uint32_t *buffer, - uint32_t grace_period) + enum kfd_config_dequeue_wait_counts_cmd cmd, + uint32_t value) { struct pm4_mec_write_data_mmio *packet; uint32_t reg_offset = 0; uint32_t reg_data = 0; - pm->dqm->dev->kfd2kgd->build_grace_period_packet_info( - pm->dqm->dev->adev, - pm->dqm->wait_times, - grace_period, - ®_offset, - ®_data); + switch (cmd) { + case KFD_DEQUEUE_WAIT_INIT: { + uint32_t sch_wave = 0, que_sleep = 1; + + /* For all gfx9 ASICs > gfx941, + * Reduce CP_IQ_WAIT_TIME2.QUE_SLEEP to 0x1 from default 0x40. + * On a 1GHz machine this is roughly 1 microsecond, which is + * about how long it takes to load data out of memory during + * queue connect + * QUE_SLEEP: Wait Count for Dequeue Retry. 
+ * + * Set CWSR grace period to 1x1000 cycle for GFX9.4.3 APU + */ + if (KFD_GC_VERSION(pm->dqm->dev) < IP_VERSION(9, 4, 1) || + KFD_GC_VERSION(pm->dqm->dev) >= IP_VERSION(10, 0, 0)) + return -EPERM; + + if (amdgpu_emu_mode == 0 && pm->dqm->dev->adev->gmc.is_app_apu && + (KFD_GC_VERSION(pm->dqm->dev) == IP_VERSION(9, 4, 3))) + sch_wave = 1; - if (grace_period == USE_DEFAULT_GRACE_PERIOD) - reg_data = pm->dqm->wait_times; + pm_build_dequeue_wait_counts_packet_info(pm, sch_wave, que_sleep, + ®_offset, ®_data); + + break; + } + case KFD_DEQUEUE_WAIT_RESET: + /* reg_data would be set to dqm->wait_times */ + pm_build_dequeue_wait_counts_packet_info(pm, 0, 0, ®_offset, ®_data); + break; + + case KFD_DEQUEUE_WAIT_SET_SCH_WAVE: + /* The CP cannot handle value 0 and it will result in + * an infinite grace period being set so set to 1 to prevent this. Also + * avoid debugger API breakage as it sets 0 and expects a low value. + */ + if (!value) + value = 1; + pm_build_dequeue_wait_counts_packet_info(pm, value, 0, ®_offset, ®_data); + break; + default: + pr_err("Invalid dequeue wait cmd\n"); + return -EINVAL; + } packet = (struct pm4_mec_write_data_mmio *)buffer; memset(buffer, 0, sizeof(struct pm4_mec_write_data_mmio)); @@ -405,7 +474,7 @@ const struct packet_manager_funcs kfd_v9_pm_funcs = { .set_resources = pm_set_resources_v9, .map_queues = pm_map_queues_v9, .unmap_queues = pm_unmap_queues_v9, - .set_grace_period = pm_set_grace_period_v9, + .config_dequeue_wait_counts = pm_config_dequeue_wait_counts_v9, .query_status = pm_query_status_v9, .release_mem = NULL, .map_process_size = sizeof(struct pm4_mes_map_process), @@ -413,7 +482,7 @@ const struct packet_manager_funcs kfd_v9_pm_funcs = { .set_resources_size = sizeof(struct pm4_mes_set_resources), .map_queues_size = sizeof(struct pm4_mes_map_queues), .unmap_queues_size = sizeof(struct pm4_mes_unmap_queues), - .set_grace_period_size = sizeof(struct pm4_mec_write_data_mmio), + .config_dequeue_wait_counts_size = sizeof(struct pm4_mec_write_data_mmio), .query_status_size = sizeof(struct pm4_mes_query_status), .release_mem_size = 0, }; @@ -424,7 +493,7 @@ const struct packet_manager_funcs kfd_aldebaran_pm_funcs = { .set_resources = pm_set_resources_v9, .map_queues = pm_map_queues_v9, .unmap_queues = pm_unmap_queues_v9, - .set_grace_period = pm_set_grace_period_v9, + .config_dequeue_wait_counts = pm_config_dequeue_wait_counts_v9, .query_status = pm_query_status_v9, .release_mem = NULL, .map_process_size = sizeof(struct pm4_mes_map_process_aldebaran), @@ -432,7 +501,7 @@ const struct packet_manager_funcs kfd_aldebaran_pm_funcs = { .set_resources_size = sizeof(struct pm4_mes_set_resources), .map_queues_size = sizeof(struct pm4_mes_map_queues), .unmap_queues_size = sizeof(struct pm4_mes_unmap_queues), - .set_grace_period_size = sizeof(struct pm4_mec_write_data_mmio), + .config_dequeue_wait_counts_size = sizeof(struct pm4_mec_write_data_mmio), .query_status_size = sizeof(struct pm4_mes_query_status), .release_mem_size = 0, }; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_vi.c index c1199d06d131..a1de5d7e173a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_vi.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_vi.c @@ -42,6 +42,7 @@ unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size) static int pm_map_process_vi(struct packet_manager *pm, uint32_t *buffer, struct qcm_process_device *qpd) { + struct kfd_process_device *pdd = qpd_to_pdd(qpd); struct 
pm4_mes_map_process *packet; packet = (struct pm4_mes_map_process *)buffer; @@ -52,7 +53,7 @@ static int pm_map_process_vi(struct packet_manager *pm, uint32_t *buffer, sizeof(struct pm4_mes_map_process)); packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; packet->bitfields2.process_quantum = 10; - packet->bitfields2.pasid = qpd->pqm->process->pasid; + packet->bitfields2.pasid = pdd->pasid; packet->bitfields3.page_table_base = qpd->page_table_base; packet->bitfields10.gds_size = qpd->gds_size; packet->bitfields10.num_gws = qpd->num_gws; @@ -303,7 +304,7 @@ const struct packet_manager_funcs kfd_vi_pm_funcs = { .set_resources = pm_set_resources_vi, .map_queues = pm_map_queues_vi, .unmap_queues = pm_unmap_queues_vi, - .set_grace_period = NULL, + .config_dequeue_wait_counts = NULL, .query_status = pm_query_status_vi, .release_mem = pm_release_mem_vi, .map_process_size = sizeof(struct pm4_mes_map_process), @@ -311,7 +312,7 @@ const struct packet_manager_funcs kfd_vi_pm_funcs = { .set_resources_size = sizeof(struct pm4_mes_set_resources), .map_queues_size = sizeof(struct pm4_mes_map_queues), .unmap_queues_size = sizeof(struct pm4_mes_unmap_queues), - .set_grace_period_size = 0, + .config_dequeue_wait_counts_size = 0, .query_status_size = sizeof(struct pm4_mes_query_status), .release_mem_size = sizeof(struct pm4_mec_release_mem) }; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c b/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c deleted file mode 100644 index e3b250918f39..000000000000 --- a/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c +++ /dev/null @@ -1,70 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 OR MIT -/* - * Copyright 2014-2022 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -#include <linux/types.h> -#include "kfd_priv.h" -#include "amdgpu_ids.h" - -static unsigned int pasid_bits = 16; -static bool pasids_allocated; /* = false */ - -bool kfd_set_pasid_limit(unsigned int new_limit) -{ - if (new_limit < 2) - return false; - - if (new_limit < (1U << pasid_bits)) { - if (pasids_allocated) - /* We've already allocated user PASIDs, too late to - * change the limit - */ - return false; - - while (new_limit < (1U << pasid_bits)) - pasid_bits--; - } - - return true; -} - -unsigned int kfd_get_pasid_limit(void) -{ - return 1U << pasid_bits; -} - -u32 kfd_pasid_alloc(void) -{ - int r = amdgpu_pasid_alloc(pasid_bits); - - if (r > 0) { - pasids_allocated = true; - return r; - } - - return 0; -} - -void kfd_pasid_free(u32 pasid) -{ - amdgpu_pasid_free(pasid); -} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h index 8b6b2bd5c148..e356a207d03c 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h @@ -63,7 +63,8 @@ struct pm4_mes_set_resources { struct { uint32_t vmid_mask:16; uint32_t unmap_latency:8; - uint32_t reserved1:5; + uint32_t reserved1:4; + uint32_t enb_xnack_retry_disable_check:1; enum mes_set_resources_queue_type_enum queue_type:3; } bitfields2; uint32_t ordinal2; @@ -145,8 +146,9 @@ struct pm4_mes_map_process { union { struct { - uint32_t pasid:16; - uint32_t reserved1:2; + uint32_t pasid:16; /* 0 - 15 */ + uint32_t reserved1:1; /* 16 */ + uint32_t exec_cleaner_shader:1; /* 17 */ uint32_t debug_vmid:4; uint32_t new_debug:1; uint32_t reserved2:1; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_aldebaran.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_aldebaran.h index 38f5cb6a222a..e0ed62c4ade0 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_aldebaran.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_aldebaran.h @@ -37,7 +37,7 @@ struct pm4_mes_map_process_aldebaran { struct { uint32_t pasid:16; /* 0 - 15 */ uint32_t single_memops:1; /* 16 */ - uint32_t reserved1:1; /* 17 */ + uint32_t exec_cleaner_shader:1; /* 17 */ uint32_t debug_vmid:4; /* 18 - 21 */ uint32_t new_debug:1; /* 22 */ uint32_t tmz:1; /* 23 */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 42d40560cd30..d221c58dccc3 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -32,7 +32,7 @@ #include <linux/atomic.h> #include <linux/workqueue.h> #include <linux/spinlock.h> -#include <linux/kfd_ioctl.h> +#include <uapi/linux/kfd_ioctl.h> #include <linux/idr.h> #include <linux/kfifo.h> #include <linux/seq_file.h> @@ -206,7 +206,9 @@ enum cache_policy { #define KFD_IS_SOC15(dev) ((KFD_GC_VERSION(dev)) >= (IP_VERSION(9, 0, 1))) #define KFD_SUPPORT_XNACK_PER_PROCESS(dev)\ ((KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 2)) || \ - (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 3))) + (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 3)) || \ + (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 4)) || \ + (KFD_GC_VERSION(dev) == IP_VERSION(9, 5, 0))) struct kfd_node; @@ -272,7 +274,6 @@ struct kfd_node { /* Interrupts */ struct kfifo ih_fifo; - struct workqueue_struct *ih_wq; struct work_struct interrupt_work; spinlock_t interrupt_lock; @@ -288,7 +289,6 @@ struct kfd_node { /* Global GWS resource shared between processes */ void *gws; - bool gws_debug_workaround; /* Clients watching SMI events */ struct list_head smi_clients; @@ -309,6 +309,10 @@ struct kfd_node { struct kfd_local_mem_info local_mem_info; 
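Both PM4 header edits earlier pay for a new bit by shrinking the adjacent reserved field, so each 32-bit ordinal keeps its width. A build-time guard of the kind that catches such layout slips, using the SET_RESOURCES field widths shown above; the assertion itself is illustrative and not part of this patch:

#include <linux/build_bug.h>

/* bitfields2 of pm4_mes_set_resources after this change:
 * vmid_mask:16 + unmap_latency:8 + reserved1:4 +
 * enb_xnack_retry_disable_check:1 + queue_type:3 == 32,
 * with reserved1 shrunk from 5 to 4 bits to fund the new flag.
 */
static_assert(16 + 8 + 4 + 1 + 3 == 32);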
struct kfd_dev *kfd; + + /* Track per device allocated watch points */ + uint32_t alloc_watch_ids; + spinlock_t watch_points_lock; }; struct kfd_dev { @@ -361,9 +365,7 @@ struct kfd_dev { struct kfd_node *nodes[MAX_KFD_NODES]; unsigned int num_nodes; - /* Track per device allocated watch points */ - uint32_t alloc_watch_ids; - spinlock_t watch_points_lock; + struct workqueue_struct *ih_wq; /* Kernel doorbells for KFD device */ struct amdgpu_bo *doorbells; @@ -413,13 +415,16 @@ enum kfd_unmap_queues_filter { * @KFD_QUEUE_TYPE_DIQ: DIQ queue type. * * @KFD_QUEUE_TYPE_SDMA_XGMI: Special SDMA queue for XGMI interface. + * + * @KFD_QUEUE_TYPE_SDMA_BY_ENG_ID: SDMA user mode queue with target SDMA engine ID. */ enum kfd_queue_type { KFD_QUEUE_TYPE_COMPUTE, KFD_QUEUE_TYPE_SDMA, KFD_QUEUE_TYPE_HIQ, KFD_QUEUE_TYPE_DIQ, - KFD_QUEUE_TYPE_SDMA_XGMI + KFD_QUEUE_TYPE_SDMA_XGMI, + KFD_QUEUE_TYPE_SDMA_BY_ENG_ID }; enum kfd_queue_format { @@ -493,8 +498,8 @@ struct queue_properties { uint64_t queue_size; uint32_t priority; uint32_t queue_percent; - uint32_t *read_ptr; - uint32_t *write_ptr; + void __user *read_ptr; + void __user *write_ptr; void __iomem *doorbell_ptr; uint32_t doorbell_off; bool is_interop; @@ -521,6 +526,12 @@ struct queue_properties { uint64_t tba_addr; uint64_t tma_addr; uint64_t exception_status; + + struct amdgpu_bo *wptr_bo; + struct amdgpu_bo *rptr_bo; + struct amdgpu_bo *ring_bo; + struct amdgpu_bo *eop_buf_bo; + struct amdgpu_bo *cwsr_bo; }; #define QUEUE_IS_ACTIVE(q) ((q).queue_size > 0 && \ @@ -603,7 +614,7 @@ struct queue { uint64_t gang_ctx_gpu_addr; void *gang_ctx_cpu_ptr; - struct amdgpu_bo *wptr_bo; + struct amdgpu_bo *wptr_bo_gart; }; enum KFD_MQD_TYPE { @@ -765,7 +776,7 @@ struct kfd_process_device { enum kfd_pdd_bound bound; /* VRAM usage */ - uint64_t vram_usage; + atomic64_t vram_usage; struct attribute attr_vram; char vram_filename[MAX_SYSFS_FILENAME_LEN]; @@ -836,6 +847,11 @@ struct kfd_process_device { void *proc_ctx_bo; uint64_t proc_ctx_gpu_addr; void *proc_ctx_cpu_ptr; + + /* Tracks queue reset status */ + bool has_reset_queue; + + u32 pasid; }; #define qpd_to_pdd(x) container_of(x, struct kfd_process_device, qpd) @@ -853,6 +869,14 @@ struct svm_range_list { struct delayed_work restore_work; DECLARE_BITMAP(bitmap_supported, MAX_GPU_INSTANCE); struct task_struct *faulting_task; + /* check point ts decides if page fault recovery need be dropped */ + uint64_t checkpoint_ts[MAX_GPU_INSTANCE]; + + /* Default granularity to use in buffer migration + * and restoration of backing memory while handling + * recoverable page faults + */ + uint8_t default_granularity; }; /* Process data */ @@ -887,8 +911,6 @@ struct kfd_process { /* We want to receive a notification when the mm_struct is destroyed */ struct mmu_notifier mmu_notifier; - u32 pasid; - /* * Array of kfd_process_device pointers, * one for each device the process is using. 
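With kfd_pasid.c deleted and the u32 pasid member moved from kfd_process into kfd_process_device, a PASID now names a (process, device) pair, and the lookup returns both. A minimal sketch of the consumer pattern the rest of this patch converts callers to; handle_fault() is hypothetical, the callees are declared in this series:

static void handle_fault(u32 pasid)
{
	struct kfd_process_device *pdd;
	struct kfd_process *p;

	/* Takes a reference on the process and reports the pdd that
	 * owns this PASID; NULL if no live process matches.
	 */
	p = kfd_lookup_process_by_pasid(pasid, &pdd);
	if (!p)
		return;

	kfd_evict_process_device(pdd);	/* act on the faulting device only */
	kfd_unref_process(p);		/* drop the lookup reference */
}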
@@ -981,6 +1003,9 @@ struct kfd_process { struct semaphore runtime_enable_sema; bool is_runtime_retry; struct kfd_runtime_info runtime_info; + + /* if gpu page fault sent to KFD */ + bool gpu_page_fault; }; #define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */ @@ -1013,7 +1038,8 @@ void kfd_process_destroy_wq(void); void kfd_cleanup_processes(void); struct kfd_process *kfd_create_process(struct task_struct *thread); struct kfd_process *kfd_get_process(const struct task_struct *task); -struct kfd_process *kfd_lookup_process_by_pasid(u32 pasid); +struct kfd_process *kfd_lookup_process_by_pasid(u32 pasid, + struct kfd_process_device **pdd); struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm); int kfd_process_gpuidx_from_gpuid(struct kfd_process *p, uint32_t gpu_id); @@ -1065,8 +1091,6 @@ struct kfd_process *kfd_lookup_process_by_pid(struct pid *pid); /* PASIDs */ int kfd_pasid_init(void); void kfd_pasid_exit(void); -bool kfd_set_pasid_limit(unsigned int new_limit); -unsigned int kfd_get_pasid_limit(void); u32 kfd_pasid_alloc(void); void kfd_pasid_free(u32 pasid); @@ -1116,7 +1140,6 @@ struct kfd_topology_device *kfd_topology_device_by_proximity_domain_no_lock( uint32_t proximity_domain); struct kfd_topology_device *kfd_topology_device_by_id(uint32_t gpu_id); struct kfd_node *kfd_device_by_id(uint32_t gpu_id); -struct kfd_node *kfd_device_by_pci_dev(const struct pci_dev *pdev); static inline bool kfd_irq_is_from_node(struct kfd_node *node, uint32_t node_id, uint32_t vmid) { @@ -1128,7 +1151,9 @@ static inline struct kfd_node *kfd_node_by_irq_ids(struct amdgpu_device *adev, struct kfd_dev *dev = adev->kfd.dev; uint32_t i; - if (KFD_GC_VERSION(dev) != IP_VERSION(9, 4, 3)) + if (KFD_GC_VERSION(dev) != IP_VERSION(9, 4, 3) && + KFD_GC_VERSION(dev) != IP_VERSION(9, 4, 4) && + KFD_GC_VERSION(dev) != IP_VERSION(9, 5, 0)) return dev->nodes[0]; for (i = 0; i < dev->num_nodes; i++) @@ -1282,6 +1307,15 @@ int init_queue(struct queue **q, const struct queue_properties *properties); void uninit_queue(struct queue *q); void print_queue_properties(struct queue_properties *q); void print_queue(struct queue *q); +int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct amdgpu_bo **pbo, + u64 expected_size); +void kfd_queue_buffer_put(struct amdgpu_bo **bo); +int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_properties *properties); +int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_properties *properties); +void kfd_queue_unref_bo_va(struct amdgpu_vm *vm, struct amdgpu_bo **bo); +int kfd_queue_unref_bo_vas(struct kfd_process_device *pdd, + struct queue_properties *properties); +void kfd_queue_ctx_save_restore_size(struct kfd_topology_device *dev); struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, struct kfd_node *dev); @@ -1293,12 +1327,15 @@ struct mqd_manager *mqd_manager_init_v10(enum KFD_MQD_TYPE type, struct kfd_node *dev); struct mqd_manager *mqd_manager_init_v11(enum KFD_MQD_TYPE type, struct kfd_node *dev); +struct mqd_manager *mqd_manager_init_v12(enum KFD_MQD_TYPE type, + struct kfd_node *dev); struct device_queue_manager *device_queue_manager_init(struct kfd_node *dev); void device_queue_manager_uninit(struct device_queue_manager *dqm); struct kernel_queue *kernel_queue_init(struct kfd_node *dev, enum kfd_queue_type type); -void kernel_queue_uninit(struct kernel_queue *kq, bool hanging); -int kfd_dqm_evict_pasid(struct device_queue_manager *dqm, u32 pasid); +void kernel_queue_uninit(struct 
kernel_queue *kq); +int kfd_evict_process_device(struct kfd_process_device *pdd); +int kfd_dqm_suspend_bad_queue_mes(struct kfd_node *knode, u32 pasid, u32 doorbell_id); /* Process Queue Manager */ struct process_queue_node { @@ -1313,10 +1350,8 @@ int pqm_init(struct process_queue_manager *pqm, struct kfd_process *p); void pqm_uninit(struct process_queue_manager *pqm); int pqm_create_queue(struct process_queue_manager *pqm, struct kfd_node *dev, - struct file *f, struct queue_properties *properties, unsigned int *qid, - struct amdgpu_bo *wptr_bo, const struct kfd_criu_queue_priv_data *q_data, const void *restore_mqd, const void *restore_ctl_stack, @@ -1328,8 +1363,6 @@ int pqm_update_mqd(struct process_queue_manager *pqm, unsigned int qid, struct mqd_update_info *minfo); int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid, void *gws); -struct kernel_queue *pqm_get_kernel_queue(struct process_queue_manager *pqm, - unsigned int qid); struct queue *pqm_get_user_queue(struct process_queue_manager *pqm, unsigned int qid); int pqm_get_wave_state(struct process_queue_manager *pqm, @@ -1356,6 +1389,24 @@ int pqm_get_queue_checkpoint_info(struct process_queue_manager *pqm, #define KFD_FENCE_COMPLETED (100) #define KFD_FENCE_INIT (10) +/** + * enum kfd_config_dequeue_wait_counts_cmd - Command for configuring + * dequeue wait counts. + * + * @KFD_DEQUEUE_WAIT_INIT: Set optimized dequeue wait counts for + * certain ASICs. For these ASICs, this is the default value used by RESET + * @KFD_DEQUEUE_WAIT_RESET: Reset dequeue wait counts to the optimized value + * for certain ASICs. For others, set it to the default hardware reset value + * @KFD_DEQUEUE_WAIT_SET_SCH_WAVE: Set the context-switch latency wait count + * + */ +enum kfd_config_dequeue_wait_counts_cmd { + KFD_DEQUEUE_WAIT_INIT = 1, + KFD_DEQUEUE_WAIT_RESET = 2, + KFD_DEQUEUE_WAIT_SET_SCH_WAVE = 3 +}; + + struct packet_manager { struct device_queue_manager *dqm; struct kernel_queue *priv_queue; @@ -1381,8 +1432,8 @@ struct packet_manager_funcs { int (*unmap_queues)(struct packet_manager *pm, uint32_t *buffer, enum kfd_unmap_queues_filter mode, uint32_t filter_param, bool reset); - int (*set_grace_period)(struct packet_manager *pm, uint32_t *buffer, - uint32_t grace_period); + int (*config_dequeue_wait_counts)(struct packet_manager *pm, uint32_t *buffer, + enum kfd_config_dequeue_wait_counts_cmd cmd, uint32_t value); int (*query_status)(struct packet_manager *pm, uint32_t *buffer, uint64_t fence_address, uint64_t fence_value); int (*release_mem)(uint64_t gpu_addr, uint32_t *buffer); @@ -1393,7 +1444,7 @@ struct packet_manager_funcs { int set_resources_size; int map_queues_size; int unmap_queues_size; - int set_grace_period_size; + int config_dequeue_wait_counts_size; int query_status_size; int release_mem_size; }; @@ -1403,7 +1454,7 @@ extern const struct packet_manager_funcs kfd_v9_pm_funcs; extern const struct packet_manager_funcs kfd_aldebaran_pm_funcs; int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm); -void pm_uninit(struct packet_manager *pm, bool hanging); +void pm_uninit(struct packet_manager *pm); int pm_send_set_resources(struct packet_manager *pm, struct scheduling_resources *res); int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues); @@ -1416,7 +1467,9 @@ int pm_send_unmap_queue(struct packet_manager *pm, void pm_release_ib(struct packet_manager *pm); -int pm_update_grace_period(struct packet_manager *pm, uint32_t grace_period); +int pm_config_dequeue_wait_counts(struct packet_manager *pm,
+ enum kfd_config_dequeue_wait_counts_cmd cmd, + uint32_t wait_counts_config); /* Following PM funcs can be shared among VI and AI */ unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size); @@ -1454,7 +1507,9 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p, int kfd_get_num_events(struct kfd_process *p); int kfd_event_destroy(struct kfd_process *p, uint32_t event_id); -void kfd_signal_vm_fault_event(struct kfd_node *dev, u32 pasid, +void kfd_signal_vm_fault_event_with_userptr(struct kfd_process *p, uint64_t gpu_va); + +void kfd_signal_vm_fault_event(struct kfd_process_device *pdd, struct kfd_vm_fault_info *info, struct kfd_hsa_memory_exception_data *data); @@ -1473,7 +1528,7 @@ static inline void kfd_flush_tlb(struct kfd_process_device *pdd, static inline bool kfd_flush_tlb_after_unmap(struct kfd_dev *dev) { - return KFD_GC_VERSION(dev) > IP_VERSION(9, 4, 2) || + return KFD_GC_VERSION(dev) >= IP_VERSION(9, 4, 2) || (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 1) && dev->sdma_fw_version >= 18) || KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 0); } @@ -1528,10 +1583,15 @@ int kfd_debugfs_hang_hws(struct kfd_node *dev); int pm_debugfs_hang_hws(struct packet_manager *pm); int dqm_debugfs_hang_hws(struct device_queue_manager *dqm); +void kfd_debugfs_add_process(struct kfd_process *p); +void kfd_debugfs_remove_process(struct kfd_process *p); + #else static inline void kfd_debugfs_init(void) {} static inline void kfd_debugfs_fini(void) {} +static inline void kfd_debugfs_add_process(struct kfd_process *p) {} +static inline void kfd_debugfs_remove_process(struct kfd_process *p) {} #endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 717a60d7a4ea..722ac1662bdc 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -35,6 +35,7 @@ #include <linux/pm_runtime.h> #include "amdgpu_amdkfd.h" #include "amdgpu.h" +#include "amdgpu_reset.h" struct mm_struct; @@ -270,6 +271,9 @@ static int kfd_get_cu_occupancy(struct attribute *attr, char *buffer) struct kfd_node *dev = NULL; struct kfd_process *proc = NULL; struct kfd_process_device *pdd = NULL; + int i; + struct kfd_cu_occupancy *cu_occupancy; + u32 queue_format; pdd = container_of(attr, struct kfd_process_device, attr_cu_occupancy); dev = pdd->dev; @@ -279,41 +283,64 @@ static int kfd_get_cu_occupancy(struct attribute *attr, char *buffer) cu_cnt = 0; proc = pdd->process; if (pdd->qpd.queue_count == 0) { - pr_debug("Gpu-Id: %d has no active queues for process %d\n", - dev->id, proc->pasid); + pr_debug("Gpu-Id: %d has no active queues for process pid %d\n", + dev->id, (int)proc->lead_thread->pid); return snprintf(buffer, PAGE_SIZE, "%d\n", cu_cnt); } /* Collect wave count from device if it supports */ wave_cnt = 0; max_waves_per_cu = 0; - dev->kfd2kgd->get_cu_occupancy(dev->adev, proc->pasid, &wave_cnt, - &max_waves_per_cu, 0); + + cu_occupancy = kcalloc(AMDGPU_MAX_QUEUES, sizeof(*cu_occupancy), GFP_KERNEL); + if (!cu_occupancy) + return -ENOMEM; + + /* + * For GFX 9.4.3, fetch the CU occupancy from the first XCC in the partition. + * For AQL queues, because of cooperative dispatch we multiply the wave count + * by number of XCCs in the partition to get the total wave counts across all + * XCCs in the partition. + * For PM4 queues, there is no cooperative dispatch so wave_cnt stay as it is. 
+ */ + dev->kfd2kgd->get_cu_occupancy(dev->adev, cu_occupancy, + &max_waves_per_cu, ffs(dev->xcc_mask) - 1); + + for (i = 0; i < AMDGPU_MAX_QUEUES; i++) { + if (cu_occupancy[i].wave_cnt != 0 && + kfd_dqm_is_queue_in_process(dev->dqm, &pdd->qpd, + cu_occupancy[i].doorbell_off, + &queue_format)) { + if (unlikely(queue_format == KFD_QUEUE_FORMAT_PM4)) + wave_cnt += cu_occupancy[i].wave_cnt; + else + wave_cnt += (NUM_XCC(dev->xcc_mask) * + cu_occupancy[i].wave_cnt); + } + } /* Translate wave count to number of compute units */ cu_cnt = (wave_cnt + (max_waves_per_cu - 1)) / max_waves_per_cu; + kfree(cu_occupancy); return snprintf(buffer, PAGE_SIZE, "%d\n", cu_cnt); } static ssize_t kfd_procfs_show(struct kobject *kobj, struct attribute *attr, char *buffer) { - if (strcmp(attr->name, "pasid") == 0) { - struct kfd_process *p = container_of(attr, struct kfd_process, - attr_pasid); - - return snprintf(buffer, PAGE_SIZE, "%d\n", p->pasid); - } else if (strncmp(attr->name, "vram_", 5) == 0) { + if (strcmp(attr->name, "pasid") == 0) + return snprintf(buffer, PAGE_SIZE, "%d\n", 0); + else if (strncmp(attr->name, "vram_", 5) == 0) { struct kfd_process_device *pdd = container_of(attr, struct kfd_process_device, attr_vram); - return snprintf(buffer, PAGE_SIZE, "%llu\n", READ_ONCE(pdd->vram_usage)); + return snprintf(buffer, PAGE_SIZE, "%llu\n", atomic64_read(&pdd->vram_usage)); } else if (strncmp(attr->name, "sdma_", 5) == 0) { struct kfd_process_device *pdd = container_of(attr, struct kfd_process_device, attr_sdma); struct kfd_sdma_activity_handler_workarea sdma_activity_work_handler; - INIT_WORK(&sdma_activity_work_handler.sdma_activity_work, - kfd_sdma_activity_worker); + INIT_WORK_ONSTACK(&sdma_activity_work_handler.sdma_activity_work, + kfd_sdma_activity_worker); sdma_activity_work_handler.pdd = pdd; sdma_activity_work_handler.sdma_activity_counter = 0; @@ -321,6 +348,7 @@ static ssize_t kfd_procfs_show(struct kobject *kobj, struct attribute *attr, schedule_work(&sdma_activity_work_handler.sdma_activity_work); flush_work(&sdma_activity_work_handler.sdma_activity_work); + destroy_work_on_stack(&sdma_activity_work_handler.sdma_activity_work); return snprintf(buffer, PAGE_SIZE, "%llu\n", (sdma_activity_work_handler.sdma_activity_counter)/ @@ -811,6 +839,14 @@ struct kfd_process *kfd_create_process(struct task_struct *thread) return ERR_PTR(-EINVAL); } + /* If the process just called exec(3), it is possible that the + * cleanup of the kfd_process (following the release of the mm + * of the old process image) is still in the cleanup work queue. + * Make sure to drain any job before trying to recreate any + * resource for this process. + */ + flush_workqueue(kfd_process_wq); + /* * take kfd processes mutex before starting of process creation * so there won't be a case where two threads of the same process @@ -819,13 +855,15 @@ struct kfd_process *kfd_create_process(struct task_struct *thread) mutex_lock(&kfd_processes_mutex); if (kfd_is_locked()) { - mutex_unlock(&kfd_processes_mutex); pr_debug("KFD is locked! Cannot create process"); - return ERR_PTR(-EINVAL); + process = ERR_PTR(-EINVAL); + goto out; } - /* A prior open of /dev/kfd could have already created the process. */ - process = find_process(thread, false); + /* A prior open of /dev/kfd could have already created the process. 
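kfd_get_cu_occupancy() above finishes by converting the summed wave count into occupied compute units with a ceiling division. A worked restatement with illustrative numbers; waves_to_cus() is a hypothetical helper name:

/* Round up so a partially filled CU still counts as occupied:
 * wave_cnt = 100, max_waves_per_cu = 32 gives (100 + 31) / 32 = 4,
 * where plain truncation would report 3.
 */
static u32 waves_to_cus(u32 wave_cnt, u32 max_waves_per_cu)
{
	return (wave_cnt + (max_waves_per_cu - 1)) / max_waves_per_cu;
}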
+ * find_process will increase process kref in this case + */ + process = find_process(thread, true); if (process) { pr_debug("Process already found\n"); } else { @@ -862,11 +900,11 @@ struct kfd_process *kfd_create_process(struct task_struct *thread) kfd_procfs_add_sysfs_files(process); kfd_procfs_add_sysfs_counters(process); + kfd_debugfs_add_process(process); + init_waitqueue_head(&process->wait_irq_drain); } out: - if (!IS_ERR(process)) - kref_get(&process->ref); mutex_unlock(&kfd_processes_mutex); mmput(thread->mm); @@ -1018,17 +1056,15 @@ static void kfd_process_destroy_pdds(struct kfd_process *p) for (i = 0; i < p->n_pdds; i++) { struct kfd_process_device *pdd = p->pdds[i]; - pr_debug("Releasing pdd (topology id %d) for process (pasid 0x%x)\n", - pdd->dev->id, p->pasid); + kfd_smi_event_process(pdd, false); + pr_debug("Releasing pdd (topology id %d, for pid %d)\n", + pdd->dev->id, p->lead_thread->pid); kfd_process_device_destroy_cwsr_dgpu(pdd); kfd_process_device_destroy_ib_mem(pdd); - if (pdd->drm_file) { - amdgpu_amdkfd_gpuvm_release_process_vm( - pdd->dev->adev, pdd->drm_priv); + if (pdd->drm_file) fput(pdd->drm_file); - } if (pdd->qpd.cwsr_kaddr && !pdd->qpd.cwsr_base) free_pages((unsigned long)pdd->qpd.cwsr_kaddr, @@ -1038,9 +1074,10 @@ static void kfd_process_destroy_pdds(struct kfd_process *p) kfd_free_process_doorbells(pdd->dev->kfd, pdd); - if (pdd->dev->kfd->shared_resources.enable_mes) + if (pdd->dev->kfd->shared_resources.enable_mes && + pdd->proc_ctx_cpu_ptr) amdgpu_amdkfd_free_gtt_mem(pdd->dev->adev, - pdd->proc_ctx_bo); + &pdd->proc_ctx_bo); /* * before destroying pdd, make sure to report availability * for auto suspend @@ -1101,6 +1138,17 @@ static void kfd_process_remove_sysfs(struct kfd_process *p) p->kobj = NULL; } +/* + * If any GPU is undergoing a reset, wait for the reset to complete. + */ +static void kfd_process_wait_gpu_reset_complete(struct kfd_process *p) +{ + int i; + + for (i = 0; i < p->n_pdds; i++) + flush_workqueue(p->pdds[i]->dev->adev->reset_domain->wq); +} + /* No process locking is needed in this function, because the process * is not findable any more. We must assume that no other thread is * using it any more, otherwise we couldn't safely free the process @@ -1115,15 +1163,22 @@ static void kfd_process_wq_release(struct work_struct *work) kfd_process_dequeue_from_all_devices(p); pqm_uninit(&p->pqm); + /* + * If a GPU is in reset, user queues may still be running; wait for the reset to complete. + */ + kfd_process_wait_gpu_reset_complete(p); + /* Signal the eviction fence after user mode queues are * destroyed. This allows any BOs to be freed without * triggering pointless evictions or waiting for fences. */ synchronize_rcu(); ef = rcu_access_pointer(p->ef); - dma_fence_signal(ef); + if (ef) + dma_fence_signal(ef); kfd_process_remove_sysfs(p); + kfd_debugfs_remove_process(p); kfd_process_kunmap_signal_bo(p); kfd_process_free_outstanding_kfd_bos(p); @@ -1134,7 +1189,6 @@ static void kfd_process_wq_release(struct work_struct *work) kfd_event_free_process(p); - kfd_pasid_free(p->pasid); mutex_destroy(&p->mutex); put_task_struct(p->lead_thread); @@ -1152,10 +1206,8 @@ static void kfd_process_ref_release(struct kref *ref) static struct mmu_notifier *kfd_process_alloc_notifier(struct mm_struct *mm) { - int idx = srcu_read_lock(&kfd_processes_srcu); - struct kfd_process *p = find_process_by_mm(mm); - - srcu_read_unlock(&kfd_processes_srcu, idx); + /* This increments p->ref counter if kfd process p exists */ + struct kfd_process *p = kfd_lookup_process_by_mm(mm); return p ?
&p->mmu_notifier : ERR_PTR(-ESRCH); } @@ -1303,7 +1355,8 @@ int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep) if (IS_ERR_VALUE(qpd->tba_addr)) { int err = qpd->tba_addr; - pr_err("Failure to set tba address. error %d.\n", err); + dev_err(dev->adev->dev, + "Failure to set tba address. error %d.\n", err); qpd->tba_addr = 0; qpd->cwsr_kaddr = NULL; return err; @@ -1486,12 +1539,6 @@ static struct kfd_process *create_process(const struct task_struct *thread) atomic_set(&process->debugged_process_count, 0); sema_init(&process->runtime_enable_sema, 0); - process->pasid = kfd_pasid_alloc(); - if (process->pasid == 0) { - err = -ENOSPC; - goto err_alloc_pasid; - } - err = pqm_init(&process->pqm, process); if (err != 0) goto err_process_pqm_init; @@ -1545,8 +1592,6 @@ err_init_svm_range_list: err_init_apertures: pqm_uninit(&process->pqm); err_process_pqm_init: - kfd_pasid_free(process->pasid); -err_alloc_pasid: kfd_event_free_process(process); err_event_init: mutex_destroy(&process->mutex); @@ -1571,7 +1616,6 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_node *dev, struct kfd_process *p) { struct kfd_process_device *pdd = NULL; - int retval = 0; if (WARN_ON_ONCE(p->n_pdds >= MAX_GPU_INSTANCE)) return NULL; @@ -1590,25 +1634,11 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_node *dev, pdd->bound = PDD_UNBOUND; pdd->already_dequeued = false; pdd->runtime_inuse = false; - pdd->vram_usage = 0; + atomic64_set(&pdd->vram_usage, 0); pdd->sdma_past_activity_counter = 0; pdd->user_gpu_id = dev->id; atomic64_set(&pdd->evict_duration_counter, 0); - if (dev->kfd->shared_resources.enable_mes) { - retval = amdgpu_amdkfd_alloc_gtt_mem(dev->adev, - AMDGPU_MES_PROC_CTX_SIZE, - &pdd->proc_ctx_bo, - &pdd->proc_ctx_gpu_addr, - &pdd->proc_ctx_cpu_ptr, - false); - if (retval) { - pr_err("failed to allocate process context bo\n"); - goto err_free_pdd; - } - memset(pdd->proc_ctx_cpu_ptr, 0, AMDGPU_MES_PROC_CTX_SIZE); - } - p->pdds[p->n_pdds++] = pdd; if (kfd_dbg_is_per_vmid_supported(pdd->dev)) pdd->spi_dbg_override = pdd->dev->kfd2kgd->disable_debug_trap( @@ -1620,10 +1650,6 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_node *dev, idr_init(&pdd->alloc_idr); return pdd; - -err_free_pdd: - kfree(pdd); - return NULL; } /** @@ -1666,12 +1692,15 @@ int kfd_process_device_init_vm(struct kfd_process_device *pdd, ret = amdgpu_amdkfd_gpuvm_acquire_process_vm(dev->adev, avm, &p->kgd_process_info, - &ef); + p->ef ? 
NULL : &ef); if (ret) { - pr_err("Failed to create process VM object\n"); + dev_err(dev->adev->dev, "Failed to create process VM object\n"); return ret; } - RCU_INIT_POINTER(p->ef, ef); + + if (!p->ef) + RCU_INIT_POINTER(p->ef, ef); + pdd->drm_priv = drm_file->private_data; ret = kfd_process_device_reserve_ib_mem(pdd); @@ -1681,15 +1710,21 @@ int kfd_process_device_init_vm(struct kfd_process_device *pdd, if (ret) goto err_init_cwsr; - ret = amdgpu_amdkfd_gpuvm_set_vm_pasid(dev->adev, avm, p->pasid); - if (ret) - goto err_set_pasid; + if (unlikely(!avm->pasid)) { + dev_warn(pdd->dev->adev->dev, "WARN: vm %p has no pasid associated", + avm); + ret = -EINVAL; + goto err_get_pasid; + } + pdd->pasid = avm->pasid; pdd->drm_file = drm_file; + kfd_smi_event_process(pdd, true); + return 0; -err_set_pasid: +err_get_pasid: kfd_process_device_destroy_cwsr_dgpu(pdd); err_init_cwsr: kfd_process_device_destroy_ib_mem(pdd); @@ -1715,7 +1750,7 @@ struct kfd_process_device *kfd_bind_process_to_device(struct kfd_node *dev, pdd = kfd_get_process_device_data(dev, p); if (!pdd) { - pr_err("Process device data doesn't exist\n"); + dev_err(dev->adev->dev, "Process device data doesn't exist\n"); return ERR_PTR(-ENOMEM); } @@ -1775,25 +1810,50 @@ void kfd_process_device_remove_obj_handle(struct kfd_process_device *pdd, idr_remove(&pdd->alloc_idr, handle); } -/* This increments the process->ref counter. */ -struct kfd_process *kfd_lookup_process_by_pasid(u32 pasid) +static struct kfd_process_device *kfd_lookup_process_device_by_pasid(u32 pasid) { - struct kfd_process *p, *ret_p = NULL; + struct kfd_process_device *ret_p = NULL; + struct kfd_process *p; unsigned int temp; - - int idx = srcu_read_lock(&kfd_processes_srcu); + int i; hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { - if (p->pasid == pasid) { - kref_get(&p->ref); - ret_p = p; - break; + for (i = 0; i < p->n_pdds; i++) { + if (p->pdds[i]->pasid == pasid) { + ret_p = p->pdds[i]; + break; + } } + if (ret_p) + break; + } + return ret_p; +} + +/* This increments the process->ref counter. */ +struct kfd_process *kfd_lookup_process_by_pasid(u32 pasid, + struct kfd_process_device **pdd) +{ + struct kfd_process_device *ret_p; + + int idx = srcu_read_lock(&kfd_processes_srcu); + + ret_p = kfd_lookup_process_device_by_pasid(pasid); + if (ret_p) { + if (pdd) + *pdd = ret_p; + kref_get(&ret_p->process->ref); + + srcu_read_unlock(&kfd_processes_srcu, idx); + return ret_p->process; } srcu_read_unlock(&kfd_processes_srcu, idx); - return ret_p; + if (pdd) + *pdd = NULL; + + return NULL; } /* This increments the process->ref counter. */ @@ -1825,6 +1885,7 @@ int kfd_process_evict_queues(struct kfd_process *p, uint32_t trigger) for (i = 0; i < p->n_pdds; i++) { struct kfd_process_device *pdd = p->pdds[i]; + struct device *dev = pdd->dev->adev->dev; kfd_smi_event_queue_eviction(pdd->dev, p->lead_thread->pid, trigger); @@ -1836,10 +1897,12 @@ int kfd_process_evict_queues(struct kfd_process *p, uint32_t trigger) * them been add back since they actually not be saved right now. 
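The rewritten lookup above also shows the read-side discipline for kfd_processes_table: every traversal runs inside an SRCU read section, and anything that must outlive it takes a kref first. The generic shape of such a reader, with match() as a hypothetical predicate:

	struct kfd_process *p;
	unsigned int tmp;
	int idx = srcu_read_lock(&kfd_processes_srcu);

	hash_for_each_rcu(kfd_processes_table, tmp, p, kfd_processes) {
		if (match(p)) {
			kref_get(&p->ref);	/* keep p valid past unlock */
			break;
		}
	}
	srcu_read_unlock(&kfd_processes_srcu, idx);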
*/ if (r && r != -EIO) { - pr_err("Failed to evict process queues\n"); + dev_err(dev, "Failed to evict process queues\n"); goto fail; } n_evicted++; + + pdd->dev->dqm->is_hws_hang = false; } return r; @@ -1858,7 +1921,8 @@ fail: if (pdd->dev->dqm->ops.restore_process_queues(pdd->dev->dqm, &pdd->qpd)) - pr_err("Failed to restore queues\n"); + dev_err(pdd->dev->adev->dev, + "Failed to restore queues\n"); n_evicted--; } @@ -1874,13 +1938,14 @@ int kfd_process_restore_queues(struct kfd_process *p) for (i = 0; i < p->n_pdds; i++) { struct kfd_process_device *pdd = p->pdds[i]; + struct device *dev = pdd->dev->adev->dev; kfd_smi_event_queue_restore(pdd->dev, p->lead_thread->pid); r = pdd->dev->dqm->ops.restore_process_queues(pdd->dev->dqm, &pdd->qpd); if (r) { - pr_err("Failed to restore process queues\n"); + dev_err(dev, "Failed to restore process queues\n"); if (!ret) ret = r; } @@ -1922,6 +1987,8 @@ static int signal_eviction_fence(struct kfd_process *p) rcu_read_lock(); ef = dma_fence_get_rcu_safe(&p->ef); rcu_read_unlock(); + if (!ef) + return -EINVAL; ret = dma_fence_signal(ef); dma_fence_put(ef); @@ -1942,22 +2009,21 @@ static void evict_process_worker(struct work_struct *work) */ p = container_of(dwork, struct kfd_process, eviction_work); - pr_debug("Started evicting pasid 0x%x\n", p->pasid); + pr_debug("Started evicting process pid %d\n", p->lead_thread->pid); ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_TTM); if (!ret) { /* If another thread already signaled the eviction fence, * they are responsible stopping the queues and scheduling * the restore work. */ - if (!signal_eviction_fence(p)) - queue_delayed_work(kfd_restore_wq, &p->restore_work, - msecs_to_jiffies(PROCESS_RESTORE_TIME_MS)); - else + if (signal_eviction_fence(p) || + mod_delayed_work(kfd_restore_wq, &p->restore_work, + msecs_to_jiffies(PROCESS_RESTORE_TIME_MS))) kfd_process_restore_queues(p); - pr_debug("Finished evicting pasid 0x%x\n", p->pasid); + pr_debug("Finished evicting process pid %d\n", p->lead_thread->pid); } else - pr_err("Failed to evict queues of pasid 0x%x\n", p->pasid); + pr_err("Failed to evict queues of process pid %d\n", p->lead_thread->pid); } static int restore_process_helper(struct kfd_process *p) @@ -1974,9 +2040,11 @@ static int restore_process_helper(struct kfd_process *p) ret = kfd_process_restore_queues(p); if (!ret) - pr_debug("Finished restoring pasid 0x%x\n", p->pasid); + pr_debug("Finished restoring process pid %d\n", + p->lead_thread->pid); else - pr_err("Failed to restore queues of pasid 0x%x\n", p->pasid); + pr_err("Failed to restore queues of process pid %d\n", + p->lead_thread->pid); return ret; } @@ -1993,7 +2061,7 @@ static void restore_process_worker(struct work_struct *work) * lifetime of this thread, kfd_process p will be valid */ p = container_of(dwork, struct kfd_process, restore_work); - pr_debug("Started restoring pasid 0x%x\n", p->pasid); + pr_debug("Started restoring process pid %d\n", p->lead_thread->pid); /* Setting last_restore_timestamp before successful restoration. 
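The switch from queue_delayed_work() to mod_delayed_work() in evict_process_worker() leans on the latter's return value. A small sketch of the semantics, with a hypothetical work item (demo_work and demo_fn are not part of the patch):

	#include <linux/workqueue.h>

	static void demo_fn(struct work_struct *work) { }
	static DECLARE_DELAYED_WORK(demo_work, demo_fn);

	static void demo(void)
	{
		/* false: the work was idle and has now been queued fresh;
		 * true: the work was already pending, only its timer moved.
		 */
		if (mod_delayed_work(system_wq, &demo_work, msecs_to_jiffies(100)))
			pr_debug("another path had already scheduled the work\n");
	}

In the eviction path above, a true return (or a fence that someone else already signaled) means the restore is owned elsewhere, so the worker restores its queues immediately instead of leaving them stopped.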
* Otherwise this would have to be set by KGD (restore_process_bos) @@ -2009,11 +2077,11 @@ static void restore_process_worker(struct work_struct *work) ret = restore_process_helper(p); if (ret) { - pr_debug("Failed to restore BOs of pasid 0x%x, retry after %d ms\n", - p->pasid, PROCESS_BACK_OFF_TIME_MS); - ret = queue_delayed_work(kfd_restore_wq, &p->restore_work, - msecs_to_jiffies(PROCESS_BACK_OFF_TIME_MS)); - WARN(!ret, "reschedule restore work failed\n"); + pr_debug("Failed to restore BOs of process pid %d, retry after %d ms\n", + p->lead_thread->pid, PROCESS_BACK_OFF_TIME_MS); + if (mod_delayed_work(kfd_restore_wq, &p->restore_work, + msecs_to_jiffies(PROCESS_BACK_OFF_TIME_MS))) + kfd_process_restore_queues(p); } } @@ -2026,7 +2094,7 @@ void kfd_suspend_all_processes(void) WARN(debug_evictions, "Evicting all processes"); hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { if (kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_SUSPEND)) - pr_err("Failed to suspend process 0x%x\n", p->pasid); + pr_err("Failed to suspend process pid %d\n", p->lead_thread->pid); signal_eviction_fence(p); } srcu_read_unlock(&kfd_processes_srcu, idx); @@ -2040,8 +2108,8 @@ int kfd_resume_all_processes(void) hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { if (restore_process_helper(p)) { - pr_err("Restore process %d failed during resume\n", - p->pasid); + pr_err("Restore process pid %d failed during resume\n", + p->lead_thread->pid); ret = -EFAULT; } } @@ -2056,7 +2124,7 @@ int kfd_reserved_mem_mmap(struct kfd_node *dev, struct kfd_process *process, struct qcm_process_device *qpd; if ((vma->vm_end - vma->vm_start) != KFD_CWSR_TBA_TMA_SIZE) { - pr_err("Incorrect CWSR mapping size.\n"); + dev_err(dev->adev->dev, "Incorrect CWSR mapping size.\n"); return -EINVAL; } @@ -2068,7 +2136,8 @@ int kfd_reserved_mem_mmap(struct kfd_node *dev, struct kfd_process *process, qpd->cwsr_kaddr = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(KFD_CWSR_TBA_TMA_SIZE)); if (!qpd->cwsr_kaddr) { - pr_err("Error allocating per process CWSR buffer.\n"); + dev_err(dev->adev->dev, + "Error allocating per process CWSR buffer.\n"); return -ENOMEM; } @@ -2095,12 +2164,14 @@ int kfd_process_drain_interrupts(struct kfd_process_device *pdd) memset(irq_drain_fence, 0, sizeof(irq_drain_fence)); irq_drain_fence[0] = (KFD_IRQ_FENCE_SOURCEID << 8) | KFD_IRQ_FENCE_CLIENTID; - irq_drain_fence[3] = pdd->process->pasid; + irq_drain_fence[3] = pdd->pasid; /* - * For GFX 9.4.3, send the NodeId also in IH cookie DW[3] + * For GFX 9.4.3/9.4.4/9.5.0, send the NodeId also in IH cookie DW[3] */ - if (KFD_GC_VERSION(pdd->dev->kfd) == IP_VERSION(9, 4, 3)) { + if (KFD_GC_VERSION(pdd->dev->kfd) == IP_VERSION(9, 4, 3) || + KFD_GC_VERSION(pdd->dev->kfd) == IP_VERSION(9, 4, 4) || + KFD_GC_VERSION(pdd->dev->kfd) == IP_VERSION(9, 5, 0)) { node_id = ffs(pdd->dev->interrupt_bitmap) - 1; irq_drain_fence[3] |= node_id << 16; } @@ -2124,7 +2195,7 @@ void kfd_process_close_interrupt_drain(unsigned int pasid) { struct kfd_process *p; - p = kfd_lookup_process_by_pasid(pasid); + p = kfd_lookup_process_by_pasid(pasid, NULL); if (!p) return; @@ -2245,8 +2316,8 @@ int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data) int idx = srcu_read_lock(&kfd_processes_srcu); hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { - seq_printf(m, "Process %d PASID 0x%x:\n", - p->lead_thread->tgid, p->pasid); + seq_printf(m, "Process %d PASID %d:\n", + p->lead_thread->tgid, p->lead_thread->pid); mutex_lock(&p->mutex); r = 
pqm_debugfs_mqds(m, &p->pqm); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index 4858112f9a53..c643e0ccec52 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -28,6 +28,7 @@ #include "kfd_priv.h" #include "kfd_kernel_queue.h" #include "amdgpu_amdkfd.h" +#include "amdgpu_reset.h" static inline struct process_queue_node *get_queue_by_qid( struct process_queue_manager *pqm, unsigned int qid) @@ -68,8 +69,8 @@ static int find_available_queue_slot(struct process_queue_manager *pqm, pr_debug("The new slot id %lu\n", found); if (found >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) { - pr_info("Cannot open more queues for process with pasid 0x%x\n", - pqm->process->pasid); + pr_info("Cannot open more queues for process with pid %d\n", + pqm->process->lead_thread->pid); return -ENOMEM; } @@ -85,10 +86,17 @@ void kfd_process_dequeue_from_device(struct kfd_process_device *pdd) if (pdd->already_dequeued) return; - + /* The MES context flush needs to filter out the case in which the + * KFD process was created without setting up the MES context and + * queue for creating a compute queue. + */ dev->dqm->ops.process_termination(dev->dqm, &pdd->qpd); - if (dev->kfd->shared_resources.enable_mes) - amdgpu_mes_flush_shader_debugger(dev->adev, pdd->proc_ctx_gpu_addr); + if (dev->kfd->shared_resources.enable_mes && !!pdd->proc_ctx_gpu_addr && + down_read_trylock(&dev->adev->reset_domain->sem)) { + amdgpu_mes_flush_shader_debugger(dev->adev, + pdd->proc_ctx_gpu_addr); + up_read(&dev->adev->reset_domain->sem); + } pdd->already_dequeued = true; } @@ -126,7 +134,10 @@ int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid, if (!gws && pdd->qpd.num_gws == 0) return -EINVAL; - if (KFD_GC_VERSION(dev) != IP_VERSION(9, 4, 3) && !dev->kfd->shared_resources.enable_mes) { + if ((KFD_GC_VERSION(dev) != IP_VERSION(9, 4, 3) && + KFD_GC_VERSION(dev) != IP_VERSION(9, 4, 4) && + KFD_GC_VERSION(dev) != IP_VERSION(9, 5, 0)) && + !dev->kfd->shared_resources.enable_mes) { if (gws) ret = amdgpu_amdkfd_add_gws_to_process(pdd->process->kgd_process_info, gws, &mem); @@ -189,6 +200,8 @@ static void pqm_clean_queue_resource(struct process_queue_manager *pqm, if (pqn->q->gws) { if (KFD_GC_VERSION(pqn->q->device) != IP_VERSION(9, 4, 3) && + KFD_GC_VERSION(pqn->q->device) != IP_VERSION(9, 4, 4) && + KFD_GC_VERSION(pqn->q->device) != IP_VERSION(9, 5, 0) && !dev->kfd->shared_resources.enable_mes) amdgpu_amdkfd_remove_gws_from_process( pqm->process->kgd_process_info, pqn->q->gws); @@ -196,9 +209,8 @@ static void pqm_clean_queue_resource(struct process_queue_manager *pqm, } if (dev->kfd->shared_resources.enable_mes) { - amdgpu_amdkfd_free_gtt_mem(dev->adev, pqn->q->gang_ctx_bo); - if (pqn->q->wptr_bo) - amdgpu_amdkfd_free_gtt_mem(dev->adev, pqn->q->wptr_bo); + amdgpu_amdkfd_free_gtt_mem(dev->adev, &pqn->q->gang_ctx_bo); + amdgpu_amdkfd_free_gtt_mem(dev->adev, (void **)&pqn->q->wptr_bo_gart); } } @@ -207,8 +219,17 @@ void pqm_uninit(struct process_queue_manager *pqm) { struct process_queue_node *pqn, *next; list_for_each_entry_safe(pqn, next, &pqm->queues, process_queue_list) { - if (pqn->q) + if (pqn->q) { + struct kfd_process_device *pdd = kfd_get_process_device_data(pqn->q->device, + pqm->process); + if (pdd) { + kfd_queue_unref_bo_vas(pdd, &pqn->q->properties); + kfd_queue_release_buffers(pdd, &pqn->q->properties); + } else { + WARN_ON(!pdd); + } pqm_clean_queue_resource(pqm, pqn); + 
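The dequeue path above guards the MES flush with the reset-domain semaphore. A condensed sketch of that guard, assuming only the calls visible in the hunk (the helper name is made up):

	static void demo_flush_debugger(struct amdgpu_device *adev, u64 proc_ctx_gpu_addr)
	{
		/* A GPU reset holds reset_domain->sem for write, so a failed
		 * trylock means a reset is in flight and the hardware state
		 * being flushed is about to be reinitialized anyway.
		 */
		if (!down_read_trylock(&adev->reset_domain->sem))
			return;

		amdgpu_mes_flush_shader_debugger(adev, proc_ctx_gpu_addr);
		up_read(&adev->reset_domain->sem);
	}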
} kfd_procfs_del_queue(pqn->q); uninit_queue(pqn->q); @@ -223,7 +244,6 @@ void pqm_uninit(struct process_queue_manager *pqm) static int init_user_queue(struct process_queue_manager *pqm, struct kfd_node *dev, struct queue **q, struct queue_properties *q_properties, - struct file *f, struct amdgpu_bo *wptr_bo, unsigned int qid) { int retval; @@ -255,12 +275,29 @@ static int init_user_queue(struct process_queue_manager *pqm, goto cleanup; } memset((*q)->gang_ctx_cpu_ptr, 0, AMDGPU_MES_GANG_CTX_SIZE); - (*q)->wptr_bo = wptr_bo; + + /* Starting with GFX11, wptr BOs must be mapped to GART for MES to determine work + * on unmapped queues for usermode queue oversubscription (no aggregated doorbell) + */ + if (dev->adev != amdgpu_ttm_adev(q_properties->wptr_bo->tbo.bdev)) { + pr_err("Queue memory allocated to wrong device\n"); + retval = -EINVAL; + goto free_gang_ctx_bo; + } + + retval = amdgpu_amdkfd_map_gtt_bo_to_gart(q_properties->wptr_bo, + &(*q)->wptr_bo_gart); + if (retval) { + pr_err("Failed to map wptr bo to GART\n"); + goto free_gang_ctx_bo; + } } pr_debug("PQM After init queue"); return 0; +free_gang_ctx_bo: + amdgpu_amdkfd_free_gtt_mem(dev->adev, &(*q)->gang_ctx_bo); cleanup: uninit_queue(*q); *q = NULL; @@ -269,10 +306,8 @@ cleanup: int pqm_create_queue(struct process_queue_manager *pqm, struct kfd_node *dev, - struct file *f, struct queue_properties *properties, unsigned int *qid, - struct amdgpu_bo *wptr_bo, const struct kfd_criu_queue_priv_data *q_data, const void *restore_mqd, const void *restore_ctl_stack, @@ -287,10 +322,12 @@ int pqm_create_queue(struct process_queue_manager *pqm, unsigned int max_queues = 127; /* HWS limit */ /* - * On GFX 9.4.3, increase the number of queues that - * can be created to 255. No HWS limit on GFX 9.4.3. + * On GFX 9.4.3/9.4.4/9.5.0, increase the number of queues that + * can be created to 255. No HWS limit on GFX 9.4.3/9.4.4/9.5.0. */ - if (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 3)) + if (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 3) || + KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 4) || + KFD_GC_VERSION(dev) == IP_VERSION(9, 5, 0)) max_queues = 255; q = NULL; @@ -323,10 +360,26 @@ int pqm_create_queue(struct process_queue_manager *pqm, if (retval != 0) return retval; + /* Register process if this is the first queue */ if (list_empty(&pdd->qpd.queues_list) && list_empty(&pdd->qpd.priv_queue_list)) dev->dqm->ops.register_process(dev->dqm, &pdd->qpd); + /* Allocate proc_ctx_bo only if MES is enabled and this is the first queue */ + if (!pdd->proc_ctx_cpu_ptr && dev->kfd->shared_resources.enable_mes) { + retval = amdgpu_amdkfd_alloc_gtt_mem(dev->adev, + AMDGPU_MES_PROC_CTX_SIZE, + &pdd->proc_ctx_bo, + &pdd->proc_ctx_gpu_addr, + &pdd->proc_ctx_cpu_ptr, + false); + if (retval) { + dev_err(dev->adev->dev, "failed to allocate process context bo\n"); + return retval; + } + memset(pdd->proc_ctx_cpu_ptr, 0, AMDGPU_MES_PROC_CTX_SIZE); + } + pqn = kzalloc(sizeof(*pqn), GFP_KERNEL); if (!pqn) { retval = -ENOMEM; @@ -336,13 +389,14 @@ int pqm_create_queue(struct process_queue_manager *pqm, switch (type) { case KFD_QUEUE_TYPE_SDMA: case KFD_QUEUE_TYPE_SDMA_XGMI: + case KFD_QUEUE_TYPE_SDMA_BY_ENG_ID: /* SDMA queues are always allocated statically no matter * which scheduler mode is used. We also do not need to * check whether a SDMA queue can be allocated here, because * allocate_sdma_queue() in create_queue() has the * corresponding check logic. 
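The GART mapping taken in init_user_queue() above has its release far away, in pqm_clean_queue_resource(). A condensed view of the pairing, using only names from this patch (error handling elided):

	/* at queue creation: MES reads the wptr through this GART mapping */
	retval = amdgpu_amdkfd_map_gtt_bo_to_gart(q_properties->wptr_bo,
						  &(*q)->wptr_bo_gart);

	/* at queue teardown, in pqm_clean_queue_resource() */
	amdgpu_amdkfd_free_gtt_mem(dev->adev, (void **)&pqn->q->wptr_bo_gart);

The ownership check right before the mapping, comparing dev->adev against amdgpu_ttm_adev() of the BO, rejects wptr buffers that were allocated on a different GPU.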
*/ - retval = init_user_queue(pqm, dev, &q, properties, f, wptr_bo, *qid); + retval = init_user_queue(pqm, dev, &q, properties, *qid); if (retval != 0) goto err_create_queue; pqn->q = q; @@ -363,7 +417,7 @@ int pqm_create_queue(struct process_queue_manager *pqm, goto err_create_queue; } - retval = init_user_queue(pqm, dev, &q, properties, f, wptr_bo, *qid); + retval = init_user_queue(pqm, dev, &q, properties, *qid); if (retval != 0) goto err_create_queue; pqn->q = q; @@ -394,8 +448,15 @@ int pqm_create_queue(struct process_queue_manager *pqm, } if (retval != 0) { - pr_err("Pasid 0x%x DQM create queue type %d failed. ret %d\n", - pqm->process->pasid, type, retval); + if ((type == KFD_QUEUE_TYPE_SDMA || + type == KFD_QUEUE_TYPE_SDMA_XGMI || + type == KFD_QUEUE_TYPE_SDMA_BY_ENG_ID) && + retval == -ENOMEM) + pr_warn("process pid %d DQM create queue type %d failed. ret %d\n", + pqm->process->lead_thread->pid, type, retval); + else + pr_err("process pid %d DQM create queue type %d failed. ret %d\n", + pqm->process->lead_thread->pid, type, retval); goto err_create_queue; } @@ -430,7 +491,7 @@ err_create_queue: uninit_queue(q); if (kq) - kernel_queue_uninit(kq, false); + kernel_queue_uninit(kq); kfree(pqn); err_allocate_pqn: /* check if queues list is empty unregister process from device */ @@ -477,21 +538,25 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) /* destroy kernel queue (DIQ) */ dqm = pqn->kq->dev->dqm; dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd); - kernel_queue_uninit(pqn->kq, false); + kernel_queue_uninit(pqn->kq); } if (pqn->q) { - kfd_procfs_del_queue(pqn->q); + retval = kfd_queue_unref_bo_vas(pdd, &pqn->q->properties); + if (retval) + goto err_destroy_queue; + dqm = pqn->q->device->dqm; retval = dqm->ops.destroy_queue(dqm, &pdd->qpd, pqn->q); if (retval) { pr_err("Pasid 0x%x destroy queue %d failed, ret %d\n", - pqm->process->pasid, + pdd->pasid, pqn->q->properties.queue_id, retval); - if (retval != -ETIME) + if (retval != -ETIME && retval != -EIO) goto err_destroy_queue; } - + kfd_procfs_del_queue(pqn->q); + kfd_queue_release_buffers(pdd, &pqn->q->properties); pqm_clean_queue_resource(pqm, pqn); uninit_queue(pqn->q); } @@ -515,11 +580,42 @@ int pqm_update_queue_properties(struct process_queue_manager *pqm, struct process_queue_node *pqn; pqn = get_queue_by_qid(pqm, qid); - if (!pqn) { + if (!pqn || !pqn->q) { pr_debug("No queue %d exists for update operation\n", qid); return -EFAULT; } + /* + * An update with a NULL ring address is used to disable the queue + */ + if (p->queue_address && p->queue_size) { + struct kfd_process_device *pdd; + struct amdgpu_vm *vm; + struct queue *q = pqn->q; + int err; + + pdd = kfd_get_process_device_data(q->device, q->process); + if (!pdd) + return -ENODEV; + vm = drm_priv_to_vm(pdd->drm_priv); + err = amdgpu_bo_reserve(vm->root.bo, false); + if (err) + return err; + + if (kfd_queue_buffer_get(vm, (void *)p->queue_address, &p->ring_bo, + p->queue_size)) { + pr_debug("ring buf 0x%llx size 0x%llx not mapped on GPU\n", + p->queue_address, p->queue_size); + amdgpu_bo_unreserve(vm->root.bo); + return -EFAULT; + } + + kfd_queue_unref_bo_va(vm, &pqn->q->properties.ring_bo); + kfd_queue_buffer_put(&pqn->q->properties.ring_bo); + amdgpu_bo_unreserve(vm->root.bo); + + pqn->q->properties.ring_bo = p->ring_bo; + } + pqn->q->properties.queue_address = p->queue_address; pqn->q->properties.queue_size = p->queue_size; pqn->q->properties.queue_percent = p->queue_percent; @@ -576,19 +672,6 @@ int 
pqm_update_mqd(struct process_queue_manager *pqm, return 0; } -struct kernel_queue *pqm_get_kernel_queue( - struct process_queue_manager *pqm, - unsigned int qid) -{ - struct process_queue_node *pqn; - - pqn = get_queue_by_qid(pqm, qid); - if (pqn && pqn->kq) - return pqn->kq; - - return NULL; -} - struct queue *pqm_get_user_queue(struct process_queue_manager *pqm, unsigned int qid) { @@ -962,8 +1045,7 @@ int kfd_criu_restore_queue(struct kfd_process *p, print_queue_properties(&qp); - ret = pqm_create_queue(&p->pqm, pdd->dev, NULL, &qp, &queue_id, NULL, q_data, mqd, ctl_stack, - NULL); + ret = pqm_create_queue(&p->pqm, pdd->dev, &qp, &queue_id, q_data, mqd, ctl_stack, NULL); if (ret) { pr_err("Failed to create new queue err:%d\n", ret); goto exit; @@ -979,6 +1061,7 @@ exit: pr_debug("Queue id %d was restored successfully\n", queue_id); kfree(q_data); + kfree(q_extra_data); return ret; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c index 0f6992b1895c..a65c67cf56ff 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c @@ -24,6 +24,8 @@ #include <linux/slab.h> #include "kfd_priv.h" +#include "kfd_topology.h" +#include "kfd_svm.h" void print_queue_properties(struct queue_properties *q) { @@ -82,3 +84,386 @@ void uninit_queue(struct queue *q) { kfree(q); } + +#if IS_ENABLED(CONFIG_HSA_AMD_SVM) + +static int kfd_queue_buffer_svm_get(struct kfd_process_device *pdd, u64 addr, u64 size) +{ + struct kfd_process *p = pdd->process; + struct list_head update_list; + struct svm_range *prange; + int ret = -EINVAL; + + INIT_LIST_HEAD(&update_list); + addr >>= PAGE_SHIFT; + size >>= PAGE_SHIFT; + + mutex_lock(&p->svms.lock); + + /* + * The range may split into multiple svm pranges aligned to the granularity boundary. 
+ */ + while (size) { + uint32_t gpuid, gpuidx; + int r; + + prange = svm_range_from_addr(&p->svms, addr, NULL); + if (!prange) + break; + + if (!prange->mapped_to_gpu) + break; + + r = kfd_process_gpuid_from_node(p, pdd->dev, &gpuid, &gpuidx); + if (r < 0) + break; + if (!test_bit(gpuidx, prange->bitmap_access) && + !test_bit(gpuidx, prange->bitmap_aip)) + break; + + if (!(prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED)) + break; + + list_add(&prange->update_list, &update_list); + + if (prange->last - prange->start + 1 >= size) { + size = 0; + break; + } + + size -= prange->last - prange->start + 1; + addr += prange->last - prange->start + 1; + } + if (size) { + pr_debug("[0x%llx 0x%llx] not registered\n", addr, addr + size - 1); + goto out_unlock; + } + + list_for_each_entry(prange, &update_list, update_list) + atomic_inc(&prange->queue_refcount); + ret = 0; + +out_unlock: + mutex_unlock(&p->svms.lock); + return ret; +} + +static void kfd_queue_buffer_svm_put(struct kfd_process_device *pdd, u64 addr, u64 size) +{ + struct kfd_process *p = pdd->process; + struct svm_range *prange, *pchild; + struct interval_tree_node *node; + unsigned long last; + + addr >>= PAGE_SHIFT; + last = addr + (size >> PAGE_SHIFT) - 1; + + mutex_lock(&p->svms.lock); + + node = interval_tree_iter_first(&p->svms.objects, addr, last); + while (node) { + struct interval_tree_node *next_node; + unsigned long next_start; + + prange = container_of(node, struct svm_range, it_node); + next_node = interval_tree_iter_next(node, addr, last); + next_start = min(node->last, last) + 1; + + if (atomic_add_unless(&prange->queue_refcount, -1, 0)) { + list_for_each_entry(pchild, &prange->child_list, child_list) + atomic_add_unless(&pchild->queue_refcount, -1, 0); + } + + node = next_node; + addr = next_start; + } + + mutex_unlock(&p->svms.lock); +} +#else + +static int kfd_queue_buffer_svm_get(struct kfd_process_device *pdd, u64 addr, u64 size) +{ + return -EINVAL; +} + +static void kfd_queue_buffer_svm_put(struct kfd_process_device *pdd, u64 addr, u64 size) +{ +} + +#endif + +int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct amdgpu_bo **pbo, + u64 expected_size) +{ + struct amdgpu_bo_va_mapping *mapping; + u64 user_addr; + u64 size; + + user_addr = (u64)addr >> AMDGPU_GPU_PAGE_SHIFT; + size = expected_size >> AMDGPU_GPU_PAGE_SHIFT; + + mapping = amdgpu_vm_bo_lookup_mapping(vm, user_addr); + if (!mapping) + goto out_err; + + if (user_addr != mapping->start || + (size != 0 && user_addr + size - 1 != mapping->last)) { + pr_debug("expected size 0x%llx not equal to mapping addr 0x%llx size 0x%llx\n", + expected_size, mapping->start << AMDGPU_GPU_PAGE_SHIFT, + (mapping->last - mapping->start + 1) << AMDGPU_GPU_PAGE_SHIFT); + goto out_err; + } + + *pbo = amdgpu_bo_ref(mapping->bo_va->base.bo); + mapping->bo_va->queue_refcount++; + return 0; + +out_err: + *pbo = NULL; + return -EINVAL; +} + +/* FIXME: remove this function, just call amdgpu_bo_unref directly */ +void kfd_queue_buffer_put(struct amdgpu_bo **bo) +{ + amdgpu_bo_unref(bo); +} + +int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_properties *properties) +{ + struct kfd_topology_device *topo_dev; + u64 expected_queue_size; + struct amdgpu_vm *vm; + u32 total_cwsr_size; + int err; + + topo_dev = kfd_topology_device_by_id(pdd->dev->id); + if (!topo_dev) + return -EINVAL; + + /* AQL queues on GFX7 and GFX8 appear twice their actual size */ + if (properties->type == KFD_QUEUE_TYPE_COMPUTE && + properties->format == 
KFD_QUEUE_FORMAT_AQL && + topo_dev->node_props.gfx_target_version >= 70000 && + topo_dev->node_props.gfx_target_version < 90000) + expected_queue_size = properties->queue_size / 2; + else + expected_queue_size = properties->queue_size; + + vm = drm_priv_to_vm(pdd->drm_priv); + err = amdgpu_bo_reserve(vm->root.bo, false); + if (err) + return err; + + err = kfd_queue_buffer_get(vm, properties->write_ptr, &properties->wptr_bo, PAGE_SIZE); + if (err) + goto out_err_unreserve; + + err = kfd_queue_buffer_get(vm, properties->read_ptr, &properties->rptr_bo, PAGE_SIZE); + if (err) + goto out_err_unreserve; + + err = kfd_queue_buffer_get(vm, (void *)properties->queue_address, + &properties->ring_bo, expected_queue_size); + if (err) + goto out_err_unreserve; + + /* only compute queue requires EOP buffer and CWSR area */ + if (properties->type != KFD_QUEUE_TYPE_COMPUTE) + goto out_unreserve; + + /* EOP buffer is not required for all ASICs */ + if (properties->eop_ring_buffer_address) { + if (properties->eop_ring_buffer_size != topo_dev->node_props.eop_buffer_size) { + pr_debug("queue eop bo size 0x%x not equal to node eop buf size 0x%x\n", + properties->eop_ring_buffer_size, + topo_dev->node_props.eop_buffer_size); + err = -EINVAL; + goto out_err_unreserve; + } + err = kfd_queue_buffer_get(vm, (void *)properties->eop_ring_buffer_address, + &properties->eop_buf_bo, + properties->eop_ring_buffer_size); + if (err) + goto out_err_unreserve; + } + + if (properties->ctl_stack_size != topo_dev->node_props.ctl_stack_size) { + pr_debug("queue ctl stack size 0x%x not equal to node ctl stack size 0x%x\n", + properties->ctl_stack_size, + topo_dev->node_props.ctl_stack_size); + err = -EINVAL; + goto out_err_unreserve; + } + + if (properties->ctx_save_restore_area_size != topo_dev->node_props.cwsr_size) { + pr_debug("queue cwsr size 0x%x not equal to node cwsr size 0x%x\n", + properties->ctx_save_restore_area_size, + topo_dev->node_props.cwsr_size); + err = -EINVAL; + goto out_err_unreserve; + } + + total_cwsr_size = (topo_dev->node_props.cwsr_size + topo_dev->node_props.debug_memory_size) + * NUM_XCC(pdd->dev->xcc_mask); + total_cwsr_size = ALIGN(total_cwsr_size, PAGE_SIZE); + + err = kfd_queue_buffer_get(vm, (void *)properties->ctx_save_restore_area_address, + &properties->cwsr_bo, total_cwsr_size); + if (!err) + goto out_unreserve; + + amdgpu_bo_unreserve(vm->root.bo); + + err = kfd_queue_buffer_svm_get(pdd, properties->ctx_save_restore_area_address, + total_cwsr_size); + if (err) + goto out_err_release; + + return 0; + +out_unreserve: + amdgpu_bo_unreserve(vm->root.bo); + return 0; + +out_err_unreserve: + amdgpu_bo_unreserve(vm->root.bo); +out_err_release: + /* FIXME: make a _locked version of this that can be called before + * dropping the VM reservation. 
+ */ + kfd_queue_unref_bo_vas(pdd, properties); + kfd_queue_release_buffers(pdd, properties); + return err; +} + +int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_properties *properties) +{ + struct kfd_topology_device *topo_dev; + u32 total_cwsr_size; + + kfd_queue_buffer_put(&properties->wptr_bo); + kfd_queue_buffer_put(&properties->rptr_bo); + kfd_queue_buffer_put(&properties->ring_bo); + kfd_queue_buffer_put(&properties->eop_buf_bo); + kfd_queue_buffer_put(&properties->cwsr_bo); + + topo_dev = kfd_topology_device_by_id(pdd->dev->id); + if (!topo_dev) + return -EINVAL; + total_cwsr_size = (topo_dev->node_props.cwsr_size + topo_dev->node_props.debug_memory_size) + * NUM_XCC(pdd->dev->xcc_mask); + total_cwsr_size = ALIGN(total_cwsr_size, PAGE_SIZE); + + kfd_queue_buffer_svm_put(pdd, properties->ctx_save_restore_area_address, total_cwsr_size); + return 0; +} + +void kfd_queue_unref_bo_va(struct amdgpu_vm *vm, struct amdgpu_bo **bo) +{ + if (*bo) { + struct amdgpu_bo_va *bo_va; + + bo_va = amdgpu_vm_bo_find(vm, *bo); + if (bo_va && bo_va->queue_refcount) + bo_va->queue_refcount--; + } +} + +int kfd_queue_unref_bo_vas(struct kfd_process_device *pdd, + struct queue_properties *properties) +{ + struct amdgpu_vm *vm; + int err; + + vm = drm_priv_to_vm(pdd->drm_priv); + err = amdgpu_bo_reserve(vm->root.bo, false); + if (err) + return err; + + kfd_queue_unref_bo_va(vm, &properties->wptr_bo); + kfd_queue_unref_bo_va(vm, &properties->rptr_bo); + kfd_queue_unref_bo_va(vm, &properties->ring_bo); + kfd_queue_unref_bo_va(vm, &properties->eop_buf_bo); + kfd_queue_unref_bo_va(vm, &properties->cwsr_bo); + + amdgpu_bo_unreserve(vm->root.bo); + return 0; +} + +#define SGPR_SIZE_PER_CU 0x4000 +#define LDS_SIZE_PER_CU 0x10000 +#define HWREG_SIZE_PER_CU 0x1000 +#define DEBUGGER_BYTES_ALIGN 64 +#define DEBUGGER_BYTES_PER_WAVE 32 + +static u32 kfd_get_vgpr_size_per_cu(u32 gfxv) +{ + u32 vgpr_size = 0x40000; + + if (gfxv == 90402 || /* GFX_VERSION_AQUA_VANJARAM */ + gfxv == 90010 || /* GFX_VERSION_ALDEBARAN */ + gfxv == 90008 || /* GFX_VERSION_ARCTURUS */ + gfxv == 90500) + vgpr_size = 0x80000; + else if (gfxv == 110000 || /* GFX_VERSION_PLUM_BONITO */ + gfxv == 110001 || /* GFX_VERSION_WHEAT_NAS */ + gfxv == 120000 || /* GFX_VERSION_GFX1200 */ + gfxv == 120001) /* GFX_VERSION_GFX1201 */ + vgpr_size = 0x60000; + + return vgpr_size; +} + +#define WG_CONTEXT_DATA_SIZE_PER_CU(gfxv, props) \ + (kfd_get_vgpr_size_per_cu(gfxv) + SGPR_SIZE_PER_CU +\ + (((gfxv) == 90500) ? (props->lds_size_in_kb << 10) : LDS_SIZE_PER_CU) +\ + HWREG_SIZE_PER_CU) + +#define CNTL_STACK_BYTES_PER_WAVE(gfxv) \ + ((gfxv) >= 100100 ? 12 : 8) /* GFX_VERSION_NAVI10*/ + +#define SIZEOF_HSA_USER_CONTEXT_SAVE_AREA_HEADER 40 + +void kfd_queue_ctx_save_restore_size(struct kfd_topology_device *dev) +{ + struct kfd_node_properties *props = &dev->node_props; + u32 gfxv = props->gfx_target_version; + u32 ctl_stack_size; + u32 wg_data_size; + u32 wave_num; + u32 cu_num; + + if (gfxv < 80001) /* GFX_VERSION_CARRIZO */ + return; + + cu_num = props->simd_count / props->simd_per_cu / NUM_XCC(dev->gpu->xcc_mask); + wave_num = (gfxv < 100100) ? 
/* GFX_VERSION_NAVI10 */ + min(cu_num * 40, props->array_count / props->simd_arrays_per_engine * 512) + : cu_num * 32; + + wg_data_size = ALIGN(cu_num * WG_CONTEXT_DATA_SIZE_PER_CU(gfxv, props), PAGE_SIZE); + ctl_stack_size = wave_num * CNTL_STACK_BYTES_PER_WAVE(gfxv) + 8; + ctl_stack_size = ALIGN(SIZEOF_HSA_USER_CONTEXT_SAVE_AREA_HEADER + ctl_stack_size, + PAGE_SIZE); + + if ((gfxv / 10000 * 10000) == 100000) { + /* HW design limits control stack size to 0x7000. + * This is insufficient for theoretical PM4 cases + * but sufficient for AQL, limited by SPI events. + */ + ctl_stack_size = min(ctl_stack_size, 0x7000); + } + + props->ctl_stack_size = ctl_stack_size; + props->debug_memory_size = ALIGN(wave_num * DEBUGGER_BYTES_PER_WAVE, DEBUGGER_BYTES_ALIGN); + props->cwsr_size = ctl_stack_size + wg_data_size; + + if (gfxv == 80002) /* GFX_VERSION_TONGA */ + props->eop_buffer_size = 0x8000; + else if (gfxv == 90402) /* GFX_VERSION_AQUA_VANJARAM */ + props->eop_buffer_size = 4096; + else if (gfxv >= 80000) + props->eop_buffer_size = 4096; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c index 06ac835190f9..83d9384ac815 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c @@ -29,6 +29,7 @@ #include "amdgpu_vm.h" #include "kfd_priv.h" #include "kfd_smi_events.h" +#include "amdgpu_reset.h" struct kfd_smi_client { struct list_head list; @@ -43,7 +44,7 @@ struct kfd_smi_client { bool suser; }; -#define MAX_KFIFO_SIZE 1024 +#define KFD_MAX_KFIFO_SIZE 8192 static __poll_t kfd_smi_ev_poll(struct file *, struct poll_table_struct *); static ssize_t kfd_smi_ev_read(struct file *, char __user *, size_t, loff_t *); @@ -85,7 +86,7 @@ static ssize_t kfd_smi_ev_read(struct file *filep, char __user *user, struct kfd_smi_client *client = filep->private_data; unsigned char *buf; - size = min_t(size_t, size, MAX_KFIFO_SIZE); + size = min_t(size_t, size, KFD_MAX_KFIFO_SIZE); buf = kmalloc(size, GFP_KERNEL); if (!buf) return -ENOMEM; @@ -162,10 +163,9 @@ static int kfd_smi_ev_release(struct inode *inode, struct file *filep) static bool kfd_smi_ev_enabled(pid_t pid, struct kfd_smi_client *client, unsigned int event) { - uint64_t all = KFD_SMI_EVENT_MASK_FROM_INDEX(KFD_SMI_EVENT_ALL_PROCESS); uint64_t events = READ_ONCE(client->events); - if (pid && client->pid != pid && !(client->suser && (events & all))) + if (pid && client->pid != pid && !client->suser) return false; return events & KFD_SMI_EVENT_MASK_FROM_INDEX(event); @@ -215,9 +215,11 @@ static void kfd_smi_event_add(pid_t pid, struct kfd_node *dev, add_event_to_kfifo(pid, dev, event, fifo_in, len); } -void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset) +void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset, + struct amdgpu_reset_context *reset_context) { unsigned int event; + char reset_cause[64]; if (post_reset) { event = KFD_SMI_EVENT_GPU_POST_RESET; @@ -225,15 +227,23 @@ void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset) event = KFD_SMI_EVENT_GPU_PRE_RESET; ++(dev->reset_seq_num); } - kfd_smi_event_add(0, dev, event, "%x\n", dev->reset_seq_num); + + memset(reset_cause, 0, sizeof(reset_cause)); + + if (reset_context) + amdgpu_reset_get_desc(reset_context, reset_cause, + sizeof(reset_cause)); + + kfd_smi_event_add(0, dev, event, KFD_EVENT_FMT_UPDATE_GPU_RESET( + dev->reset_seq_num, reset_cause)); } void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev, uint64_t 
throttle_bitmask) { - kfd_smi_event_add(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE, "%llx:%llx\n", + kfd_smi_event_add(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE, KFD_EVENT_FMT_THERMAL_THROTTLING( throttle_bitmask, - amdgpu_dpm_get_thermal_throttling_counter(dev->adev)); + amdgpu_dpm_get_thermal_throttling_counter(dev->adev))); } void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid) @@ -244,8 +254,8 @@ void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid) if (task_info) { /* Report VM faults from user applications, not retry from kernel */ if (task_info->pid) - kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, "%x:%s\n", - task_info->pid, task_info->task_name); + kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, KFD_EVENT_FMT_VMFAULT( + task_info->pid, task_info->task_name)); amdgpu_vm_put_task_info(task_info); } } @@ -255,16 +265,16 @@ void kfd_smi_event_page_fault_start(struct kfd_node *node, pid_t pid, ktime_t ts) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_PAGE_FAULT_START, - "%lld -%d @%lx(%x) %c\n", ktime_to_ns(ts), pid, - address, node->id, write_fault ? 'W' : 'R'); + KFD_EVENT_FMT_PAGEFAULT_START(ktime_to_ns(ts), pid, + address, node->id, write_fault ? 'W' : 'R')); } void kfd_smi_event_page_fault_end(struct kfd_node *node, pid_t pid, unsigned long address, bool migration) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_PAGE_FAULT_END, - "%lld -%d @%lx(%x) %c\n", ktime_get_boottime_ns(), - pid, address, node->id, migration ? 'M' : 'U'); + KFD_EVENT_FMT_PAGEFAULT_END(ktime_get_boottime_ns(), + pid, address, node->id, migration ? 'M' : 'U')); } void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid, @@ -274,34 +284,35 @@ void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid, uint32_t trigger) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_START, - "%lld -%d @%lx(%lx) %x->%x %x:%x %d\n", + KFD_EVENT_FMT_MIGRATE_START( ktime_get_boottime_ns(), pid, start, end - start, - from, to, prefetch_loc, preferred_loc, trigger); + from, to, prefetch_loc, preferred_loc, trigger)); } void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid, unsigned long start, unsigned long end, - uint32_t from, uint32_t to, uint32_t trigger) + uint32_t from, uint32_t to, uint32_t trigger, + int error_code) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_END, - "%lld -%d @%lx(%lx) %x->%x %d\n", + KFD_EVENT_FMT_MIGRATE_END( ktime_get_boottime_ns(), pid, start, end - start, - from, to, trigger); + from, to, trigger, error_code)); } void kfd_smi_event_queue_eviction(struct kfd_node *node, pid_t pid, uint32_t trigger) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_QUEUE_EVICTION, - "%lld -%d %x %d\n", ktime_get_boottime_ns(), pid, - node->id, trigger); + KFD_EVENT_FMT_QUEUE_EVICTION(ktime_get_boottime_ns(), pid, + node->id, trigger)); } void kfd_smi_event_queue_restore(struct kfd_node *node, pid_t pid) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_QUEUE_RESTORE, - "%lld -%d %x\n", ktime_get_boottime_ns(), pid, - node->id); + KFD_EVENT_FMT_QUEUE_RESTORE(ktime_get_boottime_ns(), pid, + node->id, 0)); } void kfd_smi_event_queue_restore_rescheduled(struct mm_struct *mm) @@ -318,8 +329,8 @@ void kfd_smi_event_queue_restore_rescheduled(struct mm_struct *mm) kfd_smi_event_add(p->lead_thread->pid, pdd->dev, KFD_SMI_EVENT_QUEUE_RESTORE, - "%lld -%d %x %c\n", ktime_get_boottime_ns(), - p->lead_thread->pid, pdd->dev->id, 'R'); + KFD_EVENT_FMT_QUEUE_RESTORE(ktime_get_boottime_ns(), + p->lead_thread->pid, pdd->dev->id, 'R')); } kfd_unref_process(p); } @@ -329,8 
+340,29 @@ void kfd_smi_event_unmap_from_gpu(struct kfd_node *node, pid_t pid, uint32_t trigger) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_UNMAP_FROM_GPU, - "%lld -%d @%lx(%lx) %x %d\n", ktime_get_boottime_ns(), - pid, address, last - address + 1, node->id, trigger); + KFD_EVENT_FMT_UNMAP_FROM_GPU(ktime_get_boottime_ns(), + pid, address, last - address + 1, node->id, trigger)); +} + +void kfd_smi_event_process(struct kfd_process_device *pdd, bool start) +{ + struct amdgpu_task_info *task_info; + struct amdgpu_vm *avm; + + if (!pdd->drm_priv) + return; + + avm = drm_priv_to_vm(pdd->drm_priv); + task_info = amdgpu_vm_get_task_info_vm(avm); + + if (task_info) { + kfd_smi_event_add(0, pdd->dev, + start ? KFD_SMI_EVENT_PROCESS_START : + KFD_SMI_EVENT_PROCESS_END, + KFD_EVENT_FMT_PROCESS(task_info->pid, + task_info->task_name)); + amdgpu_vm_put_task_info(task_info); + } } int kfd_smi_event_open(struct kfd_node *dev, uint32_t *fd) @@ -343,7 +375,7 @@ int kfd_smi_event_open(struct kfd_node *dev, uint32_t *fd) return -ENOMEM; INIT_LIST_HEAD(&client->list); - ret = kfifo_alloc(&client->fifo, MAX_KFIFO_SIZE, GFP_KERNEL); + ret = kfifo_alloc(&client->fifo, KFD_MAX_KFIFO_SIZE, GFP_KERNEL); if (ret) { kfree(client); return ret; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h index fa95c2dfd587..bb4d72b57387 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h @@ -24,11 +24,14 @@ #ifndef KFD_SMI_EVENTS_H_INCLUDED #define KFD_SMI_EVENTS_H_INCLUDED +struct amdgpu_reset_context; + int kfd_smi_event_open(struct kfd_node *dev, uint32_t *fd); void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid); void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev, uint64_t throttle_bitmask); -void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset); +void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset, + struct amdgpu_reset_context *reset_context); void kfd_smi_event_page_fault_start(struct kfd_node *node, pid_t pid, unsigned long address, bool write_fault, ktime_t ts); @@ -41,7 +44,8 @@ void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid, uint32_t trigger); void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid, unsigned long start, unsigned long end, - uint32_t from, uint32_t to, uint32_t trigger); + uint32_t from, uint32_t to, uint32_t trigger, + int error_code); void kfd_smi_event_queue_eviction(struct kfd_node *node, pid_t pid, uint32_t trigger); void kfd_smi_event_queue_restore(struct kfd_node *node, pid_t pid); @@ -49,4 +53,5 @@ void kfd_smi_event_queue_restore_rescheduled(struct mm_struct *mm); void kfd_smi_event_unmap_from_gpu(struct kfd_node *node, pid_t pid, unsigned long address, unsigned long last, uint32_t trigger); +void kfd_smi_event_process(struct kfd_process_device *pdd, bool start); #endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index f0f7f48af413..a0f22ea6d15a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -309,12 +309,13 @@ static void svm_range_free(struct svm_range *prange, bool do_unmap) } static void -svm_range_set_default_attributes(int32_t *location, int32_t *prefetch_loc, - uint8_t *granularity, uint32_t *flags) +svm_range_set_default_attributes(struct svm_range_list *svms, int32_t *location, + int32_t *prefetch_loc, uint8_t *granularity, + uint32_t *flags) { *location = 
KFD_IOCTL_SVM_LOCATION_UNDEFINED; *prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED; - *granularity = 9; + *granularity = svms->default_granularity; *flags = KFD_IOCTL_SVM_FLAG_HOST_ACCESS | KFD_IOCTL_SVM_FLAG_COHERENT; } @@ -358,7 +359,7 @@ svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start, bitmap_copy(prange->bitmap_access, svms->bitmap_supported, MAX_GPU_INSTANCE); - svm_range_set_default_attributes(&prange->preferred_loc, + svm_range_set_default_attributes(svms, &prange->preferred_loc, &prange->prefetch_loc, &prange->granularity, &prange->flags); @@ -404,6 +405,27 @@ static void svm_range_bo_release(struct kref *kref) spin_lock(&svm_bo->list_lock); } spin_unlock(&svm_bo->list_lock); + + if (mmget_not_zero(svm_bo->eviction_fence->mm)) { + struct kfd_process_device *pdd; + struct kfd_process *p; + struct mm_struct *mm; + + mm = svm_bo->eviction_fence->mm; + /* + * The forked child process takes svm_bo device pages ref, svm_bo could be + * released after parent process is gone. + */ + p = kfd_lookup_process_by_mm(mm); + if (p) { + pdd = kfd_get_process_device_data(svm_bo->node, p); + if (pdd) + atomic64_sub(amdgpu_bo_size(svm_bo->bo), &pdd->vram_usage); + kfd_unref_process(p); + } + mmput(mm); + } + if (!dma_fence_is_signaled(&svm_bo->eviction_fence->base)) /* We're not in the eviction worker. Signal the fence. */ dma_fence_signal(&svm_bo->eviction_fence->base); @@ -531,6 +553,7 @@ int svm_range_vram_node_new(struct kfd_node *node, struct svm_range *prange, bool clear) { + struct kfd_process_device *pdd; struct amdgpu_bo_param bp; struct svm_range_bo *svm_bo; struct amdgpu_bo_user *ubo; @@ -540,7 +563,8 @@ svm_range_vram_node_new(struct kfd_node *node, struct svm_range *prange, int r; p = container_of(prange->svms, struct kfd_process, svms); - pr_debug("pasid: %x svms 0x%p [0x%lx 0x%lx]\n", p->pasid, prange->svms, + pr_debug("process pid: %d svms 0x%p [0x%lx 0x%lx]\n", + p->lead_thread->pid, prange->svms, prange->start, prange->last); if (svm_range_validate_svm_bo(node, prange)) @@ -622,6 +646,10 @@ svm_range_vram_node_new(struct kfd_node *node, struct svm_range *prange, list_add(&prange->svm_bo_list, &svm_bo->range_list); spin_unlock(&svm_bo->list_lock); + pdd = svm_range_get_pdd_by_node(prange, node); + if (pdd) + atomic64_add(amdgpu_bo_size(bo), &pdd->vram_usage); + return 0; reserve_bo_failed: @@ -1051,6 +1079,7 @@ svm_range_split_adjust(struct svm_range *new, struct svm_range *old, new->mapped_to_gpu = old->mapped_to_gpu; bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE); bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE); + atomic_set(&new->queue_refcount, atomic_read(&old->queue_refcount)); return 0; } @@ -1142,13 +1171,12 @@ svm_range_split_head(struct svm_range *prange, uint64_t new_start, } static void -svm_range_add_child(struct svm_range *prange, struct mm_struct *mm, - struct svm_range *pchild, enum svm_work_list_ops op) +svm_range_add_child(struct svm_range *prange, struct svm_range *pchild, enum svm_work_list_ops op) { pr_debug("add child 0x%p [0x%lx 0x%lx] to prange 0x%p child list %d\n", pchild, pchild->start, pchild->last, prange, op); - pchild->work_item.mm = mm; + pchild->work_item.mm = NULL; pchild->work_item.op = op; list_add_tail(&pchild->child_list, &prange->child_list); } @@ -1167,17 +1195,17 @@ svm_range_get_pte_flags(struct kfd_node *node, struct kfd_node *bo_node; uint32_t flags = prange->flags; uint32_t mapping_flags = 0; + uint32_t gc_ip_version = KFD_GC_VERSION(node); uint64_t pte_flags; bool snoop = (domain 
!= SVM_RANGE_VRAM_DOMAIN); bool coherent = flags & (KFD_IOCTL_SVM_FLAG_COHERENT | KFD_IOCTL_SVM_FLAG_EXT_COHERENT); bool ext_coherent = flags & KFD_IOCTL_SVM_FLAG_EXT_COHERENT; - bool uncached = false; /*flags & KFD_IOCTL_SVM_FLAG_UNCACHED;*/ unsigned int mtype_local; if (domain == SVM_RANGE_VRAM_DOMAIN) bo_node = prange->svm_bo->node; - switch (amdgpu_ip_version(node->adev, GC_HWIP, 0)) { + switch (gc_ip_version) { case IP_VERSION(9, 4, 1): if (domain == SVM_RANGE_VRAM_DOMAIN) { if (bo_node == node) { @@ -1213,15 +1241,15 @@ svm_range_get_pte_flags(struct kfd_node *node, } break; case IP_VERSION(9, 4, 3): + case IP_VERSION(9, 4, 4): + case IP_VERSION(9, 5, 0): if (ext_coherent) - mtype_local = node->adev->rev_id ? AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_UC; + mtype_local = AMDGPU_VM_MTYPE_CC; else mtype_local = amdgpu_mtype_local == 1 ? AMDGPU_VM_MTYPE_NC : amdgpu_mtype_local == 2 ? AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW; snoop = true; - if (uncached) { - mapping_flags |= AMDGPU_VM_MTYPE_UC; - } else if (domain == SVM_RANGE_VRAM_DOMAIN) { + if (domain == SVM_RANGE_VRAM_DOMAIN) { /* local HBM region close to partition */ if (bo_node->adev == node->adev && (!bo_node->xcp || !node->xcp || bo_node->xcp->mem_id == node->xcp->mem_id)) @@ -1231,9 +1259,13 @@ svm_range_get_pte_flags(struct kfd_node *node, */ else if (svm_nodes_in_same_hive(bo_node, node) && !ext_coherent) mapping_flags |= AMDGPU_VM_MTYPE_NC; - /* PCIe P2P or extended system scope coherence */ - else + /* PCIe P2P on GPUs pre-9.5.0 */ + else if (gc_ip_version < IP_VERSION(9, 5, 0) && + !svm_nodes_in_same_hive(bo_node, node)) mapping_flags |= AMDGPU_VM_MTYPE_UC; + /* Other remote memory */ + else + mapping_flags |= ext_coherent ? AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; /* system memory accessed by the APU */ } else if (node->adev->flags & AMD_IS_APU) { /* On NUMA systems, locality is determined per-page @@ -1245,9 +1277,16 @@ svm_range_get_pte_flags(struct kfd_node *node, mapping_flags |= ext_coherent ? AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; /* system memory accessed by the dGPU */ } else { - mapping_flags |= AMDGPU_VM_MTYPE_UC; + if (gc_ip_version < IP_VERSION(9, 5, 0) || ext_coherent) + mapping_flags |= AMDGPU_VM_MTYPE_UC; + else + mapping_flags |= AMDGPU_VM_MTYPE_NC; } break; + case IP_VERSION(12, 0, 0): + case IP_VERSION(12, 0, 1): + mapping_flags |= AMDGPU_VM_MTYPE_NC; + break; default: mapping_flags |= coherent ? AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; @@ -1263,6 +1302,8 @@ svm_range_get_pte_flags(struct kfd_node *node, pte_flags = AMDGPU_PTE_VALID; pte_flags |= (domain == SVM_RANGE_VRAM_DOMAIN) ? 0 : AMDGPU_PTE_SYSTEM; pte_flags |= snoop ? 
AMDGPU_PTE_SNOOPED : 0; + if (gc_ip_version >= IP_VERSION(12, 0, 0)) + pte_flags |= AMDGPU_PTE_IS_PTE; pte_flags |= amdgpu_gem_va_map_flags(node->adev, mapping_flags); return pte_flags; @@ -1658,7 +1699,7 @@ static int svm_range_validate_and_map(struct mm_struct *mm, start = map_start << PAGE_SHIFT; end = (map_last + 1) << PAGE_SHIFT; for (addr = start; !r && addr < end; ) { - struct hmm_range *hmm_range; + struct hmm_range *hmm_range = NULL; unsigned long map_start_vma; unsigned long map_last_vma; struct vm_area_struct *vma; @@ -1678,11 +1719,8 @@ static int svm_range_validate_and_map(struct mm_struct *mm, readonly, owner, NULL, &hmm_range); WRITE_ONCE(p->svms.faulting_task, NULL); - if (r) { + if (r) pr_debug("failed %d to get svm range pages\n", r); - if (r == -EBUSY) - r = -EAGAIN; - } } else { r = -EFAULT; } @@ -1696,7 +1734,12 @@ static int svm_range_validate_and_map(struct mm_struct *mm, } svm_range_lock(prange); - if (!r && amdgpu_hmm_range_get_pages_done(hmm_range)) { + + /* Free the backing memory of hmm_range if it was initialized. + * Override the return value to -EAGAIN only if the prior steps + * were successful. + */ + if (hmm_range && amdgpu_hmm_range_get_pages_done(hmm_range) && !r) { pr_debug("hmm update the range, need validate again\n"); r = -EAGAIN; } @@ -1980,6 +2023,7 @@ static struct svm_range *svm_range_clone(struct svm_range *old) new->vram_pages = old->vram_pages; bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE); bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE); + atomic_set(&new->queue_refcount, atomic_read(&old->queue_refcount)); return new; } @@ -2248,16 +2292,10 @@ static void svm_range_drain_retry_fault(struct svm_range_list *svms) { struct kfd_process_device *pdd; struct kfd_process *p; - int drain; uint32_t i; p = container_of(svms, struct kfd_process, svms); -restart: - drain = atomic_read(&svms->drain_pagefaults); - if (!drain) - return; - for_each_set_bit(i, svms->bitmap_supported, p->n_pdds) { pdd = p->pdds[i]; if (!pdd) continue; pr_debug("drain retry fault gpu %d svms 0x%p done\n", i, svms); } - if (atomic_cmpxchg(&svms->drain_pagefaults, drain, 0) != drain) - goto restart; } static void svm_range_deferred_list_work(struct work_struct *work) @@ -2300,17 +2336,8 @@ static void svm_range_deferred_list_work(struct work_struct *work) prange->start, prange->last, prange->work_item.op); mm = prange->work_item.mm; -retry: - mmap_write_lock(mm); - /* Checking for the need to drain retry faults must be inside - * mmap write lock to serialize with munmap notifiers. - */ - if (unlikely(atomic_read(&svms->drain_pagefaults))) { - mmap_write_unlock(mm); - svm_range_drain_retry_fault(svms); - goto retry; - } + mmap_write_lock(mm); /* Remove from deferred_list must be inside mmap write lock, for * two race cases: @@ -2366,15 +2393,17 @@ svm_range_add_list_work(struct svm_range_list *svms, struct svm_range *prange, prange->work_item.op != SVM_OP_UNMAP_RANGE) prange->work_item.op = op; } else { - prange->work_item.op = op; - - /* Pairs with mmput in deferred_list_work */ - mmget(mm); - prange->work_item.mm = mm; - list_add_tail(&prange->deferred_list, - &prange->svms->deferred_range_list); - pr_debug("add prange 0x%p [0x%lx 0x%lx] to work list op %d\n", - prange, prange->start, prange->last, op); + /* Pairs with mmput in deferred_list_work. + * If process is exiting and mm is gone, don't update mmu notifier. 
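A generic sketch of the mmget_not_zero() idiom now used by svm_range_add_list_work(); the work item shown is hypothetical:

	if (mmget_not_zero(mm)) {	/* fails once mm_users has hit zero */
		work->mm = mm;		/* the worker must call mmput(work->mm) */
		schedule_work(&work->base);
	}
	/* else: the process is exiting; there is no address space left to
	 * update, so the deferred operation is simply not queued.
	 */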
+ */ + if (mmget_not_zero(mm)) { + prange->work_item.mm = mm; + prange->work_item.op = op; + list_add_tail(&prange->deferred_list, + &prange->svms->deferred_range_list); + pr_debug("add prange 0x%p [0x%lx 0x%lx] to work list op %d\n", + prange, prange->start, prange->last, op); + } } spin_unlock(&svms->deferred_list_lock); } @@ -2388,8 +2417,7 @@ void schedule_deferred_list_work(struct svm_range_list *svms) } static void -svm_range_unmap_split(struct mm_struct *mm, struct svm_range *parent, - struct svm_range *prange, unsigned long start, +svm_range_unmap_split(struct svm_range *parent, struct svm_range *prange, unsigned long start, unsigned long last) { struct svm_range *head; @@ -2410,12 +2438,12 @@ svm_range_unmap_split(struct mm_struct *mm, struct svm_range *parent, svm_range_split(tail, last + 1, tail->last, &head); if (head != prange && tail != prange) { - svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE); - svm_range_add_child(parent, mm, tail, SVM_OP_ADD_RANGE); + svm_range_add_child(parent, head, SVM_OP_UNMAP_RANGE); + svm_range_add_child(parent, tail, SVM_OP_ADD_RANGE); } else if (tail != prange) { - svm_range_add_child(parent, mm, tail, SVM_OP_UNMAP_RANGE); + svm_range_add_child(parent, tail, SVM_OP_UNMAP_RANGE); } else if (head != prange) { - svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE); + svm_range_add_child(parent, head, SVM_OP_UNMAP_RANGE); } else if (parent != prange) { prange->work_item.op = SVM_OP_UNMAP_RANGE; } @@ -2431,6 +2459,17 @@ svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange, struct kfd_process *p; unsigned long s, l; bool unmap_parent; + uint32_t i; + + if (atomic_read(&prange->queue_refcount)) { + int r; + + pr_warn("Freeing queue vital buffer 0x%lx, queue evicted\n", + prange->start << PAGE_SHIFT); + r = kgd2kfd_quiesce_mm(mm, KFD_QUEUE_EVICTION_TRIGGER_SVM); + if (r) + pr_debug("failed %d to quiesce KFD queues\n", r); + } p = kfd_lookup_process_by_mm(mm); if (!p) @@ -2440,11 +2479,38 @@ svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange, pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n", svms, prange, prange->start, prange->last, start, last); - /* Make sure pending page faults are drained in the deferred worker - * before the range is freed to avoid straggler interrupts on - * unmapped memory causing "phantom faults". 
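The hunks that follow replace the drain_pagefaults counter dance with per-GPU interrupt-ring timestamps. A condensed sketch of the scheme, using only names from this patch; the two fragments live in svm_range_unmap_from_cpu() and svm_range_restore_pages() respectively:

	/* at unmap: remember how far the IH ring had advanced */
	checkpoint_wptr = amdgpu_ih_get_wptr(adev, ih);
	if (ih->rptr != checkpoint_wptr)	/* faults still queued behind us */
		svms->checkpoint_ts[i] =
			amdgpu_ih_decode_iv_ts(adev, ih, checkpoint_wptr, -1);

	/* at fault time: a fault stamped at or before the checkpoint was
	 * raised against memory that has since been unmapped, so drop it
	 */
	if (svms->checkpoint_ts[gpuidx] != 0 &&
	    amdgpu_ih_ts_after_or_equal(ts, svms->checkpoint_ts[gpuidx]))
		return -EAGAIN;	/* stale fault */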
+ /* Calculate time stamps that are used to decide which page faults need to be + * dropped or handled before unmapping pages from the GPU VM */ - atomic_inc(&svms->drain_pagefaults); + for_each_set_bit(i, svms->bitmap_supported, p->n_pdds) { + struct kfd_process_device *pdd; + struct amdgpu_device *adev; + struct amdgpu_ih_ring *ih; + uint32_t checkpoint_wptr; + + pdd = p->pdds[i]; + if (!pdd) + continue; + + adev = pdd->dev->adev; + + /* Check and drain the ih1 ring if the IH CAM is not available */ + if (adev->irq.ih1.ring_size) { + ih = &adev->irq.ih1; + checkpoint_wptr = amdgpu_ih_get_wptr(adev, ih); + if (ih->rptr != checkpoint_wptr) { + svms->checkpoint_ts[i] = + amdgpu_ih_decode_iv_ts(adev, ih, checkpoint_wptr, -1); + continue; + } + } + + /* check if adev->irq.ih_soft is not empty */ + ih = &adev->irq.ih_soft; + checkpoint_wptr = amdgpu_ih_get_wptr(adev, ih); + if (ih->rptr != checkpoint_wptr) + svms->checkpoint_ts[i] = amdgpu_ih_decode_iv_ts(adev, ih, checkpoint_wptr, -1); + } unmap_parent = start <= prange->start && last >= prange->last; @@ -2454,14 +2520,14 @@ svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange, l = min(last, pchild->last); if (l >= s) svm_range_unmap_from_gpus(pchild, s, l, trigger); - svm_range_unmap_split(mm, prange, pchild, start, last); + svm_range_unmap_split(prange, pchild, start, last); mutex_unlock(&pchild->lock); } s = max(start, prange->start); l = min(last, prange->last); if (l >= s) svm_range_unmap_from_gpus(prange, s, l, trigger); - svm_range_unmap_split(mm, prange, prange, start, last); + svm_range_unmap_split(prange, prange, start, last); if (unmap_parent) svm_range_add_list_work(svms, prange, mm, SVM_OP_UNMAP_RANGE); @@ -2504,8 +2570,6 @@ svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni, if (range->event == MMU_NOTIFY_RELEASE) return true; - if (!mmget_not_zero(mni->mm)) - return true; start = mni->interval_tree.start; last = mni->interval_tree.last; @@ -2532,7 +2596,6 @@ svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni, } svm_range_unlock(prange); - mmput(mni->mm); return true; } @@ -2619,7 +2682,7 @@ svm_range_best_restore_location(struct svm_range *prange, return -1; } - if (node->adev->gmc.is_app_apu) + if (node->adev->apu_prefer_gtt) return 0; if (prange->preferred_loc == gpuid || @@ -2668,9 +2731,10 @@ svm_range_get_range_boundaries(struct kfd_process *p, int64_t addr, *is_heap_stack = vma_is_initial_heap(vma) || vma_is_initial_stack(vma); start_limit = max(vma->vm_start >> PAGE_SHIFT, - (unsigned long)ALIGN_DOWN(addr, 2UL << 8)); + (unsigned long)ALIGN_DOWN(addr, 1UL << p->svms.default_granularity)); end_limit = min(vma->vm_end >> PAGE_SHIFT, - (unsigned long)ALIGN(addr + 1, 2UL << 8)); + (unsigned long)ALIGN(addr + 1, 1UL << p->svms.default_granularity)); + /* First range that starts after the fault address */ node = interval_tree_iter_first(&p->svms.objects, addr + 1, ULONG_MAX); if (node) { @@ -2885,7 +2949,7 @@ svm_fault_allowed(struct vm_area_struct *vma, bool write_fault) int svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, uint32_t vmid, uint32_t node_id, - uint64_t addr, bool write_fault) + uint64_t addr, uint64_t ts, bool write_fault) { unsigned long start, last, size; struct mm_struct *mm = NULL; @@ -2895,7 +2959,7 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, ktime_t timestamp = ktime_get_boottime(); struct kfd_node *node; int32_t best_loc; - int32_t gpuidx = MAX_GPU_INSTANCE; + int32_t gpuid, gpuidx = MAX_GPU_INSTANCE; bool write_locked = 
false; struct vm_area_struct *vma; bool migration = false; @@ -2906,7 +2970,7 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, return -EFAULT; } - p = kfd_lookup_process_by_pasid(pasid); + p = kfd_lookup_process_by_pasid(pasid, NULL); if (!p) { pr_debug("kfd process not founded pasid 0x%x\n", pasid); return 0; @@ -2916,11 +2980,25 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, pr_debug("restoring svms 0x%p fault address 0x%llx\n", svms, addr); if (atomic_read(&svms->drain_pagefaults)) { - pr_debug("draining retry fault, drop fault 0x%llx\n", addr); + pr_debug("page fault handling disabled, drop fault 0x%llx\n", addr); r = 0; goto out; } + node = kfd_node_by_irq_ids(adev, node_id, vmid); + if (!node) { + pr_debug("kfd node does not exist node_id: %d, vmid: %d\n", node_id, + vmid); + r = -EFAULT; + goto out; + } + + if (kfd_process_gpuid_from_node(p, node, &gpuid, &gpuidx)) { + pr_debug("failed to get gpuid/gpuidx for node_id: %d\n", node_id); + r = -EFAULT; + goto out; + } + if (!p->xnack_enabled) { pr_debug("XNACK not enabled for pasid 0x%x\n", pasid); r = -EFAULT; @@ -2937,16 +3015,24 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, goto out; } - node = kfd_node_by_irq_ids(adev, node_id, vmid); - if (!node) { - pr_debug("kfd node does not exist node_id: %d, vmid: %d\n", node_id, - vmid); - r = -EFAULT; - goto out; - } mmap_read_lock(mm); retry_write_locked: mutex_lock(&svms->lock); + + /* check if this page fault time stamp is before svms->checkpoint_ts */ + if (svms->checkpoint_ts[gpuidx] != 0) { + if (amdgpu_ih_ts_after_or_equal(ts, svms->checkpoint_ts[gpuidx])) { + pr_debug("draining retry fault, drop fault 0x%llx\n", addr); + r = -EAGAIN; + goto out_unlock_svms; + } else { + /* ts is after svms->checkpoint_ts now, reset svms->checkpoint_ts + * to zero so a later timestamp wrap-around cannot produce a wrong comparison + */ + svms->checkpoint_ts[gpuidx] = 0; + } + } + prange = svm_range_from_addr(svms, addr, NULL); if (!prange) { pr_debug("failed to find prange svms 0x%p address [0x%llx]\n", @@ -3028,8 +3114,6 @@ retry_write_locked: start = max_t(unsigned long, ALIGN_DOWN(addr, size), prange->start); last = min_t(unsigned long, ALIGN(addr + 1, size) - 1, prange->last); if (prange->actual_loc != 0 || best_loc != 0) { - migration = true; - if (best_loc) { r = svm_migrate_to_vram(prange, best_loc, start, last, mm, KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU); @@ -3052,7 +3136,9 @@ retry_write_locked: if (r) { pr_debug("failed %d to migrate svms %p [0x%lx 0x%lx]\n", r, svms, start, last); - goto out_unlock_range; + goto out_migrate_fail; + } else { + migration = true; } } @@ -3062,6 +3148,7 @@ retry_write_locked: pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to gpus\n", r, svms, start, last); +out_migrate_fail: kfd_smi_event_page_fault_end(node, p->lead_thread->pid, addr, migration); @@ -3071,7 +3158,8 @@ out_unlock_svms: mutex_unlock(&svms->lock); mmap_read_unlock(mm); - svm_range_count_fault(node, p, gpuidx); + if (r != -EAGAIN) + svm_range_count_fault(node, p, gpuidx); mmput(mm); out: @@ -3148,7 +3236,8 @@ void svm_range_list_fini(struct kfd_process *p) struct svm_range *prange; struct svm_range *next; - pr_debug("pasid 0x%x svms 0x%p\n", p->pasid, &p->svms); + pr_debug("process pid %d svms 0x%p\n", p->lead_thread->pid, + &p->svms); cancel_delayed_work_sync(&p->svms.restore_work); @@ -3158,8 +3247,9 @@ void svm_range_list_fini(struct kfd_process *p) /* * Ensure no retry fault comes in afterwards, as page fault handler will * 
@@ -3148,7 +3236,8 @@ void svm_range_list_fini(struct kfd_process *p)
 	struct svm_range *prange;
 	struct svm_range *next;
 
-	pr_debug("pasid 0x%x svms 0x%p\n", p->pasid, &p->svms);
+	pr_debug("process pid %d svms 0x%p\n", p->lead_thread->pid,
+		 &p->svms);
 
 	cancel_delayed_work_sync(&p->svms.restore_work);
 
@@ -3158,8 +3247,9 @@ void svm_range_list_fini(struct kfd_process *p)
 
 	/*
 	 * Ensure no retry fault comes in afterwards, as page fault handler will
 	 * not find kfd process and take mm lock to recover fault.
+	 * Stop kfd page fault handling, then wait until pending page faults are drained.
 	 */
-	atomic_inc(&p->svms.drain_pagefaults);
+	atomic_set(&p->svms.drain_pagefaults, 1);
 	svm_range_drain_retry_fault(&p->svms);
 
 	list_for_each_entry_safe(prange, next, &p->svms.list, list) {
@@ -3170,7 +3260,8 @@ void svm_range_list_fini(struct kfd_process *p)
 
 	mutex_destroy(&p->svms.lock);
 
-	pr_debug("pasid 0x%x svms 0x%p done\n", p->pasid, &p->svms);
+	pr_debug("process pid %d svms 0x%p done\n",
+		 p->lead_thread->pid, &p->svms);
 }
 
 int svm_range_list_init(struct kfd_process *p)
@@ -3193,6 +3284,12 @@ int svm_range_list_init(struct kfd_process *p)
 		if (KFD_IS_SVM_API_SUPPORTED(p->pdds[i]->dev->adev))
 			bitmap_set(svms->bitmap_supported, i, 1);
 
+	/* The value of the default granularity cannot exceed 0x1B, the
+	 * number of pages supported by a 4-level paging table
+	 */
+	svms->default_granularity = min_t(u8, amdgpu_svm_default_granularity, 0x1B);
+	pr_debug("Default SVM Granularity to use: %d\n", svms->default_granularity);
+
 	return 0;
 }
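The granularity stored here is a log2 page count: SVM services a GPU fault in an aligned block of 2^granularity pages, which is exactly the ALIGN_DOWN/ALIGN windowing in svm_range_restore_pages() earlier in this file. A small standalone sketch of that arithmetic follows; the helper name is local to the example, and 0x1B only matches the cap applied above.

#include <stdint.h>

#define SVM_MAX_GRANULARITY 0x1B	/* clamp applied in svm_range_list_init() */

/* Compute the [start, last] pfn window serviced for a fault, mirroring
 * ALIGN_DOWN(addr, size) / ALIGN(addr + 1, size) - 1 clipped to the prange.
 */
static void fault_window(uint64_t fault_pfn, uint8_t granularity,
			 uint64_t range_start, uint64_t range_last,
			 uint64_t *start, uint64_t *last)
{
	uint64_t size = 1ULL << (granularity > SVM_MAX_GRANULARITY ?
				 SVM_MAX_GRANULARITY : granularity);

	/* align the faulting pfn down to the granularity block, then clip
	 * the block to the prange boundaries
	 */
	*start = fault_pfn & ~(size - 1);
	*last = *start + size - 1;
	if (*start < range_start)
		*start = range_start;
	if (*last > range_last)
		*last = range_last;
}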
@@ -3337,7 +3434,7 @@ svm_range_best_prefetch_location(struct svm_range *prange)
 		goto out;
 	}
 
-	if (bo_node->adev->gmc.is_app_apu) {
+	if (bo_node->adev->apu_prefer_gtt) {
 		best_loc = 0;
 		goto out;
 	}
@@ -3426,7 +3523,7 @@ svm_range_trigger_migration(struct mm_struct *mm, struct svm_range *prange,
 			mm, KFD_MIGRATE_TRIGGER_PREFETCH);
 	*migrated = !r;
 
-	return r;
+	return 0;
 }
 
 int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence)
@@ -3527,8 +3624,8 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm,
 	bool flush_tlb;
 	int r, ret = 0;
 
-	pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] pages 0x%llx\n",
-		 p->pasid, &p->svms, start, start + size - 1, size);
+	pr_debug("process pid %d svms 0x%p [0x%llx 0x%llx] pages 0x%llx\n",
+		 p->lead_thread->pid, &p->svms, start, start + size - 1, size);
 
 	r = svm_range_check_attr(p, nattr, attrs);
 	if (r)
@@ -3636,8 +3733,8 @@ out_unlock_range:
 out:
 	mutex_unlock(&process_info->lock);
 
-	pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] done, r=%d\n", p->pasid,
-		 &p->svms, start, start + size - 1, r);
+	pr_debug("process pid %d svms 0x%p [0x%llx 0x%llx] done, r=%d\n",
+		 p->lead_thread->pid, &p->svms, start, start + size - 1, r);
 
 	return ret ? ret : r;
 }
@@ -3720,7 +3817,7 @@ svm_range_get_attr(struct kfd_process *p, struct mm_struct *mm,
 	node = interval_tree_iter_first(&svms->objects, start, last);
 	if (!node) {
 		pr_debug("range attrs not found return default values\n");
-		svm_range_set_default_attributes(&location, &prefetch_loc,
+		svm_range_set_default_attributes(svms, &location, &prefetch_loc,
 						 &granularity, &flags_and);
 		flags_or = flags_and;
 		if (p->xnack_enabled)
@@ -3975,8 +4072,8 @@ exit:
 	return ret;
 }
 
-int svm_range_get_info(struct kfd_process *p, uint32_t *num_svm_ranges,
-		       uint64_t *svm_priv_data_size)
+void svm_range_get_info(struct kfd_process *p, uint32_t *num_svm_ranges,
+			uint64_t *svm_priv_data_size)
 {
 	uint64_t total_size, accessibility_size, common_attr_size;
 	int nattr_common = 4, nattr_accessibility = 1;
@@ -3988,8 +4085,6 @@ int svm_range_get_info(struct kfd_process *p, uint32_t *num_svm_ranges,
 	*svm_priv_data_size = 0;
 
 	svms = &p->svms;
-	if (!svms)
-		return -EINVAL;
 
 	mutex_lock(&svms->lock);
 	list_for_each_entry(prange, &svms->list, list) {
@@ -4031,7 +4126,6 @@ int svm_range_get_info(struct kfd_process *p, uint32_t *num_svm_ranges,
 
 	pr_debug("num_svm_ranges %u total_priv_size %llu\n", *num_svm_ranges,
 		 *svm_priv_data_size);
-	return 0;
 }
 
 int kfd_criu_checkpoint_svm(struct kfd_process *p,
@@ -4048,8 +4142,6 @@ int kfd_criu_checkpoint_svm(struct kfd_process *p,
 	struct mm_struct *mm;
 
 	svms = &p->svms;
-	if (!svms)
-		return -EINVAL;
 
 	mm = get_task_mm(p->lead_thread);
 	if (!mm) {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
index 026863a0abcd..01c7a4877904 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
@@ -137,6 +137,7 @@ struct svm_range {
 	DECLARE_BITMAP(bitmap_access, MAX_GPU_INSTANCE);
 	DECLARE_BITMAP(bitmap_aip, MAX_GPU_INSTANCE);
 	bool				mapped_to_gpu;
+	atomic_t			queue_refcount;
 };
 
 static inline void svm_range_lock(struct svm_range *prange)
@@ -173,7 +174,7 @@ int svm_range_vram_node_new(struct kfd_node *node, struct svm_range *prange,
 			    bool clear);
 void svm_range_vram_node_free(struct svm_range *prange);
 int svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
-			    uint32_t vmid, uint32_t node_id, uint64_t addr,
+			    uint32_t vmid, uint32_t node_id, uint64_t addr, uint64_t ts,
 			    bool write_fault);
 int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence);
 void svm_range_add_list_work(struct svm_range_list *svms,
@@ -183,8 +184,8 @@ void schedule_deferred_list_work(struct svm_range_list *svms);
 void svm_range_dma_unmap_dev(struct device *dev, dma_addr_t *dma_addr,
 			     unsigned long offset, unsigned long npages);
 void svm_range_dma_unmap(struct svm_range *prange);
-int svm_range_get_info(struct kfd_process *p, uint32_t *num_svm_ranges,
-		       uint64_t *svm_priv_data_size);
+void svm_range_get_info(struct kfd_process *p, uint32_t *num_svm_ranges,
+			uint64_t *svm_priv_data_size);
 int kfd_criu_checkpoint_svm(struct kfd_process *p,
 			    uint8_t __user *user_priv_data,
 			    uint64_t *priv_offset);
@@ -201,7 +202,7 @@ void svm_range_list_lock_and_flush_work(struct svm_range_list *svms, struct mm_s
  * is initialized to not 0 when page migration registers device memory.
  */
 #define KFD_IS_SVM_API_SUPPORTED(adev) ((adev)->kfd.pgmap.type != 0 ||\
-	(adev)->gmc.is_app_apu)
+	((adev)->apu_prefer_gtt))
 
 void svm_range_bo_unref_async(struct svm_range_bo *svm_bo);
 
@@ -224,7 +225,7 @@ static inline void svm_range_list_fini(struct kfd_process *p)
 static inline int svm_range_restore_pages(struct amdgpu_device *adev,
 					  unsigned int pasid,
 					  uint32_t client_id, uint32_t node_id,
-					  uint64_t addr, bool write_fault)
+					  uint64_t addr, uint64_t ts, bool write_fault)
 {
 	return -EFAULT;
 }
@@ -236,13 +237,12 @@ static inline int svm_range_schedule_evict_svm_bo(
 	return -EINVAL;
 }
 
-static inline int svm_range_get_info(struct kfd_process *p,
-				     uint32_t *num_svm_ranges,
-				     uint64_t *svm_priv_data_size)
+static inline void svm_range_get_info(struct kfd_process *p,
+				      uint32_t *num_svm_ranges,
+				      uint64_t *svm_priv_data_size)
 {
 	*num_svm_ranges = 0;
 	*svm_priv_data_size = 0;
-	return 0;
 }
 
 static inline int kfd_criu_checkpoint_svm(struct kfd_process *p,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index c51f131eaa2f..4ec73f33535e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -31,6 +31,7 @@
 #include <linux/log2.h>
 #include <linux/dmi.h>
 #include <linux/atomic.h>
+#include <linux/crc16.h>
 
 #include "kfd_priv.h"
 #include "kfd_crat.h"
@@ -107,24 +108,6 @@ struct kfd_node *kfd_device_by_id(uint32_t gpu_id)
 	return top_dev->gpu;
 }
 
-struct kfd_node *kfd_device_by_pci_dev(const struct pci_dev *pdev)
-{
-	struct kfd_topology_device *top_dev;
-	struct kfd_node *device = NULL;
-
-	down_read(&topology_lock);
-
-	list_for_each_entry(top_dev, &topology_device_list, list)
-		if (top_dev->gpu && top_dev->gpu->adev->pdev == pdev) {
-			device = top_dev->gpu;
-			break;
-		}
-
-	up_read(&topology_lock);
-
-	return device;
-}
-
 /* Called with write topology_lock acquired */
 static void kfd_release_topology_device(struct kfd_topology_device *dev)
 {
@@ -291,6 +274,8 @@ static ssize_t iolink_show(struct kobject *kobj, struct attribute *attr,
 			      iolink->max_bandwidth);
 	sysfs_show_32bit_prop(buffer, offs, "recommended_transfer_size",
 			      iolink->rec_transfer_size);
+	sysfs_show_32bit_prop(buffer, offs, "recommended_sdma_engine_id_mask",
+			      iolink->rec_sdma_eng_id_mask);
 	sysfs_show_32bit_prop(buffer, offs, "flags", iolink->flags);
 
 	return offs;
@@ -525,6 +510,10 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
 		dev->node_props.capability |=
 			HSA_CAP_AQL_QUEUE_DOUBLE_MAP;
 
+	if (KFD_GC_VERSION(dev->gpu) < IP_VERSION(10, 0, 0) &&
+	    (dev->gpu->adev->sdma.supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE))
+		dev->node_props.capability2 |= HSA_CAP2_PER_SDMA_QUEUE_RESET_SUPPORTED;
+
 	sysfs_show_32bit_prop(buffer, offs, "max_engine_clk_fcompute",
 			      dev->node_props.max_engine_clk_fcompute);
 
@@ -534,6 +523,8 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
 			      dev->gpu->kfd->mec_fw_version);
 	sysfs_show_32bit_prop(buffer, offs, "capability",
 			      dev->node_props.capability);
+	sysfs_show_32bit_prop(buffer, offs, "capability2",
+			      dev->node_props.capability2);
 	sysfs_show_64bit_prop(buffer, offs, "debug_prop",
 			      dev->node_props.debug_prop);
 	sysfs_show_32bit_prop(buffer, offs, "sdma_fw_version",
@@ -958,32 +949,30 @@ static void kfd_update_system_properties(void)
 	dev = list_last_entry(&topology_device_list,
 			struct kfd_topology_device, list);
 	if (dev) {
-		sys_props.platform_id =
-			(*((uint64_t *)dev->oem_id)) & CRAT_OEMID_64BIT_MASK;
+		sys_props.platform_id = dev->oem_id64;
 		sys_props.platform_oem = *((uint64_t *)dev->oem_table_id);
 		sys_props.platform_rev = dev->oem_revision;
 	}
 	up_read(&topology_lock);
 }
 
-static void find_system_memory(const struct dmi_header *dm,
-			       void *private)
+static void find_system_memory(const struct dmi_header *dm, void *private)
 {
+	struct dmi_mem_device *memdev = container_of(dm, struct dmi_mem_device, header);
 	struct kfd_mem_properties *mem;
-	u16 mem_width, mem_clock;
 	struct kfd_topology_device *kdev =
 		(struct kfd_topology_device *)private;
-	const u8 *dmi_data = (const u8 *)(dm + 1);
-
-	if (dm->type == DMI_ENTRY_MEM_DEVICE && dm->length >= 0x15) {
-		mem_width = (u16)(*(const u16 *)(dmi_data + 0x6));
-		mem_clock = (u16)(*(const u16 *)(dmi_data + 0x11));
-		list_for_each_entry(mem, &kdev->mem_props, list) {
-			if (mem_width != 0xFFFF && mem_width != 0)
-				mem->width = mem_width;
-			if (mem_clock != 0)
-				mem->mem_clk_max = mem_clock;
-		}
+
+	if (memdev->header.type != DMI_ENTRY_MEM_DEVICE)
+		return;
+	if (memdev->header.length < sizeof(struct dmi_mem_device))
+		return;
+
+	list_for_each_entry(mem, &kdev->mem_props, list) {
+		if (memdev->total_width != 0xFFFF && memdev->total_width != 0)
+			mem->width = memdev->total_width;
+		if (memdev->speed != 0)
+			mem->mem_clk_max = memdev->speed;
 	}
 }
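The rewritten find_system_memory() above swaps raw byte-offset reads out of the SMBIOS record for a packed struct overlay (struct dmi_mem_device, defined in kfd_topology.h later in this diff). A userspace sketch of the same pattern follows; the field layout is copied on assumption from the struct in this patch, and memcpy is used instead of pointer-casting to sidestep alignment concerns.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

struct dmi_header {
	uint8_t type;
	uint8_t length;
	uint16_t handle;
} __attribute__((packed));

/* SMBIOS Type 17 (Memory Device) prefix, mirroring the kernel struct */
struct dmi_mem_device {
	struct dmi_header header;
	uint16_t physical_handle;
	uint16_t error_handle;
	uint16_t total_width;
	uint16_t data_width;
	uint16_t size;
	uint8_t form_factor;
	uint8_t device_set;
	uint8_t device_locator;
	uint8_t bank_locator;
	uint8_t memory_type;
	uint16_t type_detail;
	uint16_t speed;
} __attribute__((packed));

#define DMI_ENTRY_MEM_DEVICE 17

/* Fill width/speed from a raw SMBIOS record; returns -1 if the record is
 * not a usable memory-device entry. The length check replaces the old
 * magic-number test against the raw record size.
 */
static int parse_mem_device(const uint8_t *record, size_t len,
			    uint16_t *width, uint16_t *speed)
{
	struct dmi_mem_device memdev;

	if (len < sizeof(memdev))
		return -1;
	memcpy(&memdev, record, sizeof(memdev));
	if (memdev.header.type != DMI_ENTRY_MEM_DEVICE ||
	    memdev.header.length < sizeof(memdev))
		return -1;

	*width = memdev.total_width;
	*speed = memdev.speed;
	return 0;
}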
@@ -1092,14 +1081,17 @@ void kfd_topology_shutdown(void)
 
 static uint32_t kfd_generate_gpu_id(struct kfd_node *gpu)
 {
-	uint32_t hashout;
+	uint32_t gpu_id;
 	uint32_t buf[8];
 	uint64_t local_mem_size;
-	int i;
+	struct kfd_topology_device *dev;
+	bool is_unique;
+	uint8_t *crc_buf;
 
 	if (!gpu)
 		return 0;
 
+	crc_buf = (uint8_t *)&buf;
 	local_mem_size = gpu->local_mem_info.local_mem_size_private +
 			gpu->local_mem_info.local_mem_size_public;
 	buf[0] = gpu->adev->pdev->devfn;
@@ -1112,10 +1104,34 @@ static uint32_t kfd_generate_gpu_id(struct kfd_node *gpu)
 	buf[6] = upper_32_bits(local_mem_size);
 	buf[7] = (ffs(gpu->xcc_mask) - 1) | (NUM_XCC(gpu->xcc_mask) << 16);
 
-	for (i = 0, hashout = 0; i < 8; i++)
-		hashout ^= hash_32(buf[i], KFD_GPU_ID_HASH_WIDTH);
+	gpu_id = crc16(0, crc_buf, sizeof(buf)) &
+		 ((1 << KFD_GPU_ID_HASH_WIDTH) - 1);
 
-	return hashout;
+	/* There is a very small possibility when generating a
+	 * 16 (KFD_GPU_ID_HASH_WIDTH) bit value from an 8-word buffer
+	 * that the value could be 0 or non-unique. So, check whether
+	 * it is unique and non-zero; if it is not unique, increment
+	 * until a unique one is found. In case of overflow, restart from 1.
+	 */
+
+	down_read(&topology_lock);
+	do {
+		is_unique = true;
+		if (!gpu_id)
+			gpu_id = 1;
+		list_for_each_entry(dev, &topology_device_list, list) {
+			if (dev->gpu && dev->gpu_id == gpu_id) {
+				is_unique = false;
+				break;
+			}
+		}
+		if (unlikely(!is_unique))
+			gpu_id = (gpu_id + 1) &
+				 ((1 << KFD_GPU_ID_HASH_WIDTH) - 1);
+	} while (!is_unique);
+	up_read(&topology_lock);
+
+	return gpu_id;
 }
 
 /* kfd_assign_gpu - Attach @gpu to the correct kfd topology device. If
  * the GPU device is not already present in the topology device
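The uniqueness loop above is easy to get wrong around the zero value and the 16-bit wrap, so a compact userspace model may help. This sketch assumes KFD_GPU_ID_HASH_WIDTH == 16 (as the in-code comment states) and swaps the kernel's topology list for a plain array; the CRC routine implements the reflected 0xA001 polynomial that lib/crc16 uses.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define GPU_ID_HASH_WIDTH 16

/* bitwise CRC16 with the reflected polynomial 0xA001 (CRC-16/ARC) */
static uint16_t crc16_a001(uint16_t crc, const uint8_t *buf, size_t len)
{
	while (len--) {
		crc ^= *buf++;
		for (int i = 0; i < 8; i++)
			crc = (crc & 1) ? (crc >> 1) ^ 0xA001 : crc >> 1;
	}
	return crc;
}

/* hash 8 words of device properties, then probe until the ID is unique
 * and non-zero among the already-assigned IDs
 */
static uint32_t generate_unique_id(const uint32_t props[8],
				   const uint32_t *used, size_t nused)
{
	uint32_t id = crc16_a001(0, (const uint8_t *)props,
				 8 * sizeof(uint32_t)) &
		      ((1 << GPU_ID_HASH_WIDTH) - 1);
	bool unique;

	do {
		unique = true;
		if (!id)
			id = 1;	/* 0 means "no GPU", never hand it out */
		for (size_t i = 0; i < nused; i++) {
			if (used[i] == id) {
				unique = false;
				/* bump and wrap within the hash width */
				id = (id + 1) & ((1 << GPU_ID_HASH_WIDTH) - 1);
				break;
			}
		}
	} while (!unique);

	return id;
}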
@@ -1238,6 +1254,61 @@ static void kfd_set_iolink_non_coherent(struct kfd_topology_device *to_dev,
 	}
 }
 
+#define REC_SDMA_NUM_GPU 8
+static const int rec_sdma_eng_map[REC_SDMA_NUM_GPU][REC_SDMA_NUM_GPU] = {
+	{ -1, 14, 12,  2,  4,  8, 10,  6 },
+	{ 14, -1,  2, 10,  8,  4,  6, 12 },
+	{ 10,  2, -1, 12, 14,  6,  4,  8 },
+	{  2, 12, 10, -1,  6, 14,  8,  4 },
+	{  4,  8, 14,  6, -1, 10, 12,  2 },
+	{  8,  4,  6, 14, 12, -1,  2, 10 },
+	{ 10,  6,  4,  8, 12,  2, -1, 14 },
+	{  6, 12,  8,  4,  2, 10, 14, -1 }};
+
+static void kfd_set_recommended_sdma_engines(struct kfd_topology_device *to_dev,
+					     struct kfd_iolink_properties *outbound_link,
+					     struct kfd_iolink_properties *inbound_link)
+{
+	struct kfd_node *gpu = outbound_link->gpu;
+	struct amdgpu_device *adev = gpu->adev;
+	unsigned int num_xgmi_nodes = adev->gmc.xgmi.num_physical_nodes;
+	unsigned int num_xgmi_sdma_engines = kfd_get_num_xgmi_sdma_engines(gpu);
+	unsigned int num_sdma_engines = kfd_get_num_sdma_engines(gpu);
+	uint32_t sdma_eng_id_mask = (1 << num_sdma_engines) - 1;
+	uint32_t xgmi_sdma_eng_id_mask =
+		((1 << num_xgmi_sdma_engines) - 1) << num_sdma_engines;
+
+	bool support_rec_eng = !amdgpu_sriov_vf(adev) && to_dev->gpu &&
+		adev->aid_mask && num_xgmi_nodes && gpu->kfd->num_nodes == 1 &&
+		num_xgmi_sdma_engines >= 6 && (!(adev->flags & AMD_IS_APU) &&
+		num_xgmi_nodes == 8);
+
+	if (support_rec_eng) {
+		int src_socket_id = adev->gmc.xgmi.physical_node_id;
+		int dst_socket_id = to_dev->gpu->adev->gmc.xgmi.physical_node_id;
+		unsigned int reshift = num_xgmi_sdma_engines == 6 ? 1 : 0;
+
+		outbound_link->rec_sdma_eng_id_mask =
+			1 << (rec_sdma_eng_map[src_socket_id][dst_socket_id] >> reshift);
+		inbound_link->rec_sdma_eng_id_mask =
+			1 << (rec_sdma_eng_map[dst_socket_id][src_socket_id] >> reshift);
+
+		/* If the recommended engine is out of range, reset the mask */
+		if (outbound_link->rec_sdma_eng_id_mask & sdma_eng_id_mask)
+			outbound_link->rec_sdma_eng_id_mask = xgmi_sdma_eng_id_mask;
+		if (inbound_link->rec_sdma_eng_id_mask & sdma_eng_id_mask)
+			inbound_link->rec_sdma_eng_id_mask = xgmi_sdma_eng_id_mask;
+
+	} else {
+		uint32_t engine_mask = (outbound_link->iolink_type == CRAT_IOLINK_TYPE_XGMI &&
+			num_xgmi_sdma_engines && to_dev->gpu) ? xgmi_sdma_eng_id_mask :
+			sdma_eng_id_mask;
+
+		outbound_link->rec_sdma_eng_id_mask = engine_mask;
+		inbound_link->rec_sdma_eng_id_mask = engine_mask;
+	}
+}
+
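A worked instance of the mask math above may be useful, assuming an 8-socket hive with 2 non-XGMI and 6 XGMI SDMA engines per GPU (so reshift is 1); those engine counts are illustrative assumptions, not values taken from this patch.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const int map_0_to_1 = 14;		/* rec_sdma_eng_map[0][1] */
	unsigned int num_sdma_engines = 2;	/* assumed non-XGMI engines */
	unsigned int num_xgmi_sdma_engines = 6;
	unsigned int reshift = num_xgmi_sdma_engines == 6 ? 1 : 0;
	uint32_t sdma_eng_id_mask = (1 << num_sdma_engines) - 1;	/* 0x003 */
	uint32_t xgmi_sdma_eng_id_mask =
		((1 << num_xgmi_sdma_engines) - 1) << num_sdma_engines;	/* 0x0fc */

	/* table entry 14, halved by reshift, selects engine bit 7 */
	uint32_t rec_mask = 1u << (map_0_to_1 >> reshift);		/* 0x080 */

	/* a bit overlapping the non-XGMI engines would be out of range;
	 * fall back to all XGMI engines in that case
	 */
	if (rec_mask & sdma_eng_id_mask)
		rec_mask = xgmi_sdma_eng_id_mask;

	printf("recommended_sdma_engine_id_mask = 0x%03x\n", rec_mask);
	return 0;
}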
 static void kfd_fill_iolink_non_crat_info(struct kfd_topology_device *dev)
 {
 	struct kfd_iolink_properties *link, *inbound_link;
@@ -1276,6 +1347,7 @@ static void kfd_fill_iolink_non_crat_info(struct kfd_topology_device *dev)
 			inbound_link->flags = CRAT_IOLINK_FLAGS_ENABLED;
 			kfd_set_iolink_no_atomics(peer_dev, dev, inbound_link);
 			kfd_set_iolink_non_coherent(peer_dev, link, inbound_link);
+			kfd_set_recommended_sdma_engines(peer_dev, link, inbound_link);
 		}
 	}
 
@@ -1605,17 +1677,32 @@ static int fill_in_l2_l3_pcache(struct kfd_cache_properties **props_ext,
 				int cache_type, unsigned int cu_processor_id,
 				struct kfd_node *knode)
 {
-	unsigned int cu_sibling_map_mask;
+	unsigned int cu_sibling_map_mask = 0;
 	int first_active_cu;
 	int i, j, k, xcc, start, end;
 	int num_xcc = NUM_XCC(knode->xcc_mask);
 	struct kfd_cache_properties *pcache = NULL;
 	enum amdgpu_memory_partition mode;
 	struct amdgpu_device *adev = knode->adev;
+	bool found = false;
 
 	start = ffs(knode->xcc_mask) - 1;
 	end = start + num_xcc;
-	cu_sibling_map_mask = cu_info->bitmap[start][0][0];
+
+	/* Find the bitmap of the first active cu in the first xcc.
+	 * This relies on the assumption that every xcc must have at
+	 * least one active cu.
+	 */
+	for (i = 0; i < gfx_info->max_shader_engines && !found; i++) {
+		for (j = 0; j < gfx_info->max_sh_per_se && !found; j++) {
+			if (cu_info->bitmap[start][i % 4][j % 4]) {
+				cu_sibling_map_mask =
+					cu_info->bitmap[start][i % 4][j % 4];
+				found = true;
+			}
+		}
+	}
+
 	cu_sibling_map_mask &=
 		((1 << pcache_info[cache_type].num_cu_shared) - 1);
 	first_active_cu = ffs(cu_sibling_map_mask);
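Below is a self-contained model of the first-active-CU scan just added, with fixed illustrative dimensions in place of gfx_info; the kernel loop's "% 4" indexing suggests a 4x4 bitmap window, and that bound is assumed here.

#include <stdint.h>

#define MAX_SE 4
#define MAX_SH 4

/* Walk the per-SE/per-SH CU bitmap until a non-zero word is found, then
 * keep only the bits for the CUs that share one cache instance.
 */
static uint32_t first_active_cu_mask(const uint32_t bitmap[MAX_SE][MAX_SH],
				     unsigned int num_cu_shared)
{
	uint32_t mask = 0;

	for (int i = 0; i < MAX_SE && !mask; i++)
		for (int j = 0; j < MAX_SH && !mask; j++)
			mask = bitmap[i][j];	/* first non-zero word wins */

	/* restrict to the CUs sharing this cache */
	return mask & ((1u << num_cu_shared) - 1);
}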
@@ -1635,7 +1722,9 @@ static int fill_in_l2_l3_pcache(struct kfd_cache_properties **props_ext,
 
 	pcache->cache_level = pcache_info[cache_type].cache_level;
 	pcache->cacheline_size = pcache_info[cache_type].cache_line_size;
 
-	if (KFD_GC_VERSION(knode) == IP_VERSION(9, 4, 3))
+	if (KFD_GC_VERSION(knode) == IP_VERSION(9, 4, 3) ||
+	    KFD_GC_VERSION(knode) == IP_VERSION(9, 4, 4) ||
+	    KFD_GC_VERSION(knode) == IP_VERSION(9, 5, 0))
 		mode = adev->gmc.gmc_funcs->query_mem_partition_mode(adev);
 	else
 		mode = UNKNOWN_MEMORY_PARTITION_MODE;
@@ -1697,7 +1786,7 @@ static void kfd_fill_cache_non_crat_info(struct kfd_topology_device *dev, struct
 	struct amdgpu_cu_info *cu_info = &kdev->adev->gfx.cu_info;
 	struct amdgpu_gfx_config *gfx_info = &kdev->adev->gfx.config;
 	int gpu_processor_id;
-	struct kfd_cache_properties *props_ext;
+	struct kfd_cache_properties *props_ext = NULL;
 	int num_of_entries = 0;
 	int num_of_cache_types = 0;
 	struct kfd_gpu_cache_info cache_info[KFD_MAX_CACHE_TYPES];
@@ -1772,7 +1861,7 @@ static void kfd_fill_cache_non_crat_info(struct kfd_topology_device *dev, struct
 	pr_debug("Added [%d] GPU cache entries\n", num_of_entries);
 }
 
-static int kfd_topology_add_device_locked(struct kfd_node *gpu, uint32_t gpu_id,
+static int kfd_topology_add_device_locked(struct kfd_node *gpu,
 					  struct kfd_topology_device **dev)
 {
 	int proximity_domain = ++topology_crat_proximity_domain;
@@ -1785,8 +1874,7 @@ static int kfd_topology_add_device_locked(struct kfd_node *gpu, uint32_t gpu_id,
 					COMPUTE_UNIT_GPU, gpu, proximity_domain);
 	if (res) {
-		pr_err("Error creating VCRAT for GPU (ID: 0x%x)\n",
-		       gpu_id);
+		dev_err(gpu->adev->dev, "Error creating VCRAT\n");
 		topology_crat_proximity_domain--;
 		goto err;
 	}
@@ -1797,8 +1885,7 @@ static int kfd_topology_add_device_locked(struct kfd_node *gpu, uint32_t gpu_id,
 					     &temp_topology_device_list,
 					     proximity_domain);
 	if (res) {
-		pr_err("Error parsing VCRAT for GPU (ID: 0x%x)\n",
-		       gpu_id);
+		dev_err(gpu->adev->dev, "Error parsing VCRAT\n");
 		topology_crat_proximity_domain--;
 		goto err;
 	}
@@ -1824,8 +1911,8 @@ static int kfd_topology_add_device_locked(struct kfd_node *gpu, uint32_t gpu_id,
 	if (!res)
 		sys_props.generation_count++;
 	else
-		pr_err("Failed to update GPU (ID: 0x%x) to sysfs topology. res=%d\n",
-		       gpu_id, res);
+		dev_err(gpu->adev->dev, "Failed to update GPU to sysfs topology. res=%d\n",
+			res);
 
 err:
 	kfd_destroy_crat_image(crat_image);
@@ -1908,7 +1995,8 @@ static void kfd_topology_set_capabilities(struct kfd_topology_device *dev)
 		dev->node_props.debug_prop |= HSA_DBG_DISPATCH_INFO_ALWAYS_VALID;
 
 	if (KFD_GC_VERSION(dev->gpu) < IP_VERSION(10, 0, 0)) {
-		if (KFD_GC_VERSION(dev->gpu) == IP_VERSION(9, 4, 3))
+		if (KFD_GC_VERSION(dev->gpu) == IP_VERSION(9, 4, 3) ||
+		    KFD_GC_VERSION(dev->gpu) == IP_VERSION(9, 4, 4))
 			dev->node_props.debug_prop |=
 				HSA_DBG_WATCH_ADDR_MASK_LO_BIT_GFX9_4_3 |
 				HSA_DBG_WATCH_ADDR_MASK_HI_BIT_GFX9_4_3;
@@ -1920,13 +2008,17 @@ static void kfd_topology_set_capabilities(struct kfd_topology_device *dev)
 		if (KFD_GC_VERSION(dev->gpu) >= IP_VERSION(9, 4, 2))
 			dev->node_props.capability |=
 				HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED;
+
+		if (!amdgpu_sriov_vf(dev->gpu->adev))
+			dev->node_props.capability |= HSA_CAP_PER_QUEUE_RESET_SUPPORTED;
+
 	} else {
 		dev->node_props.debug_prop |= HSA_DBG_WATCH_ADDR_MASK_LO_BIT_GFX10 |
 					      HSA_DBG_WATCH_ADDR_MASK_HI_BIT;
 
-		if (KFD_GC_VERSION(dev->gpu) >= IP_VERSION(11, 0, 0))
+		if (KFD_GC_VERSION(dev->gpu) >= IP_VERSION(12, 0, 0))
 			dev->node_props.capability |=
-				HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED;
+				HSA_CAP_TRAP_DEBUG_PRECISE_ALU_OPERATIONS_SUPPORTED;
 	}
 
 	kfd_topology_set_dbg_firmware_support(dev);
@@ -1942,14 +2034,12 @@ int kfd_topology_add_device(struct kfd_node *gpu)
 	struct amdgpu_gfx_config *gfx_info = &gpu->adev->gfx.config;
 	struct amdgpu_cu_info *cu_info = &gpu->adev->gfx.cu_info;
 
-	gpu_id = kfd_generate_gpu_id(gpu);
 	if (gpu->xcp && !gpu->xcp->ddev) {
 		dev_warn(gpu->adev->dev,
-			 "Won't add GPU (ID: 0x%x) to topology since it has no drm node assigned.",
-			 gpu_id);
+			 "Won't add GPU to topology since it has no drm node assigned.");
 		return 0;
 	} else {
-		pr_debug("Adding new GPU (ID: 0x%x) to topology\n", gpu_id);
+		dev_dbg(gpu->adev->dev, "Adding new GPU to topology\n");
 	}
 
 	/* Check to see if this gpu device exists in the topology_device_list.
@@ -1961,11 +2051,12 @@ int kfd_topology_add_device(struct kfd_node *gpu)
 	down_write(&topology_lock);
 	dev = kfd_assign_gpu(gpu);
 	if (!dev)
-		res = kfd_topology_add_device_locked(gpu, gpu_id, &dev);
+		res = kfd_topology_add_device_locked(gpu, &dev);
 	up_write(&topology_lock);
 	if (res)
 		return res;
 
+	gpu_id = kfd_generate_gpu_id(gpu);
 	dev->gpu_id = gpu_id;
 	gpu->id = gpu_id;
@@ -1997,9 +2088,8 @@ int kfd_topology_add_device(struct kfd_node *gpu)
 			HSA_CAP_ASIC_REVISION_MASK);
 
 	dev->node_props.location_id = pci_dev_id(gpu->adev->pdev);
-	/* On multi-partition nodes, node id = location_id[31:28] */
 	if (gpu->kfd->num_nodes > 1)
-		dev->node_props.location_id |= (dev->gpu->node_id << 28);
+		dev->node_props.location_id |= dev->gpu->node_id;
 
 	dev->node_props.domain = pci_domain_nr(gpu->adev->pdev->bus);
 	dev->node_props.max_engine_clk_fcompute =
@@ -2091,6 +2181,8 @@ int kfd_topology_add_device(struct kfd_node *gpu)
 	    dev->gpu->adev->gmc.xgmi.connected_to_cpu)
 		dev->node_props.capability |= HSA_CAP_FLAGS_COHERENTHOSTACCESS;
 
+	kfd_queue_ctx_save_restore_size(dev);
+
 	kfd_debug_print_topology();
 
 	kfd_notify_gpu_change(gpu_id, 1);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
index 27386ce9a021..3de8ec0043bb 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
@@ -24,6 +24,7 @@
 #ifndef __KFD_TOPOLOGY_H__
 #define __KFD_TOPOLOGY_H__
 
+#include <linux/dmi.h>
 #include <linux/types.h>
 #include <linux/list.h>
 #include <linux/kfd_sysfs.h>
@@ -50,6 +51,7 @@ struct kfd_node_properties {
 	uint32_t cpu_core_id_base;
 	uint32_t simd_id_base;
 	uint32_t capability;
+	uint32_t capability2;
 	uint64_t debug_prop;
 	uint32_t max_waves_per_simd;
 	uint32_t lds_size_in_kb;
@@ -74,6 +76,10 @@ struct kfd_node_properties {
 	uint32_t num_sdma_xgmi_engines;
 	uint32_t num_sdma_queues_per_engine;
 	uint32_t num_cp_queues;
+	uint32_t cwsr_size;
+	uint32_t ctl_stack_size;
+	uint32_t eop_buffer_size;
+	uint32_t debug_memory_size;
 	char name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE];
 };
 
@@ -121,6 +127,7 @@ struct kfd_iolink_properties {
 	uint32_t min_bandwidth;
 	uint32_t max_bandwidth;
 	uint32_t rec_transfer_size;
+	uint32_t rec_sdma_eng_id_mask;
 	uint32_t flags;
 	struct kfd_node *gpu;
 	struct kobject *kobj;
@@ -154,7 +161,10 @@ struct kfd_topology_device {
 	struct attribute attr_gpuid;
 	struct attribute attr_name;
 	struct attribute attr_props;
-	uint8_t oem_id[CRAT_OEMID_LENGTH];
+	union {
+		uint8_t oem_id[CRAT_OEMID_LENGTH];
+		uint64_t oem_id64;
+	};
 	uint8_t oem_table_id[CRAT_OEMTABLEID_LENGTH];
 	uint32_t oem_revision;
 };
@@ -171,6 +181,22 @@ struct kfd_system_properties {
 	struct attribute attr_props;
 };
 
+struct dmi_mem_device {
+	struct dmi_header header;
+	u16 physical_handle;
+	u16 error_handle;
+	u16 total_width;
+	u16 data_width;
+	u16 size;
+	u8 form_factor;
+	u8 device_set;
+	u8 device_locator;
+	u8 bank_locator;
+	u8 memory_type;
+	u16 type_detail;
+	u16 speed;
+} __packed;
+
 struct kfd_topology_device *kfd_create_topology_device(
 		struct list_head *device_list);
 void kfd_release_topology_device_list(struct list_head *device_list);
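The oem_id union added above lets kfd_update_system_properties() (earlier in this diff) read the 8 CRAT OEM ID bytes as one scalar without the old cast-and-mask. A tiny demonstration follows, assuming CRAT_OEMID_LENGTH == 8, which is what makes the two views line up; note that the scalar's value is endian-dependent either way.

#include <stdint.h>
#include <stdio.h>

#define CRAT_OEMID_LENGTH 8

union oem_id {
	uint8_t bytes[CRAT_OEMID_LENGTH];	/* as CRAT provides them */
	uint64_t id64;				/* as sysfs reports them */
};

int main(void)
{
	/* both members alias the same 8 bytes; no casts or masking needed */
	union oem_id oem = { .bytes = { 'A', 'M', 'D', ' ', ' ', ' ', ' ', ' ' } };

	printf("platform_id = 0x%016llx\n", (unsigned long long)oem.id64);
	return 0;
}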
diff --git a/drivers/gpu/drm/amd/amdkfd/soc15_int.h b/drivers/gpu/drm/amd/amdkfd/soc15_int.h
index 10138676f27f..e5c0205f2618 100644
--- a/drivers/gpu/drm/amd/amdkfd/soc15_int.h
+++ b/drivers/gpu/drm/amd/amdkfd/soc15_int.h
@@ -29,6 +29,7 @@
 #define SOC15_INTSRC_CP_BAD_OPCODE	183
 #define SOC15_INTSRC_SQ_INTERRUPT_MSG	239
 #define SOC15_INTSRC_VMC_FAULT		0
+#define SOC15_INTSRC_VMC_UTCL2_POISON	1
 #define SOC15_INTSRC_SDMA_TRAP		224
 #define SOC15_INTSRC_SDMA_ECC		220
 #define SOC21_INTSRC_SDMA_TRAP		49