diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2014-12-15 15:52:01 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2014-12-15 15:52:01 -0800 |
commit | 988adfdffdd43cfd841df734664727993076d7cb (patch) | |
tree | 6794f7bba8f595500c2b7d33376ad6614adcfaf2 /drivers/gpu/drm/amd/amdkfd | |
parent | 26178ec11ef3c6c814bf16a0a2b9c2f7242e3c64 (diff) | |
parent | 4e0cd68115620bc3236ff4e58e4c073948629b41 (diff) |
Merge branch 'drm-next' of git://people.freedesktop.org/~airlied/linux
Pull drm updates from Dave Airlie:
"Highlights:
- AMD KFD driver merge
This is the AMD HSA interface for exposing a lowlevel interface for
GPGPU use. They have an open source userspace built on top of this
interface, and the code looks as good as it was going to get out of
tree.
- Initial atomic modesetting work
The need for an atomic modesetting interface to allow userspace to
try and send a complete set of modesetting state to the driver has
arisen, and been suffering from neglect this past year. No more,
the start of the common code and changes for msm driver to use it
are in this tree. Ongoing work to get the userspace ioctl finished
and the code clean will probably wait until next kernel.
- DisplayID 1.3 and tiled monitor exposed to userspace.
Tiled monitor property is now exposed for userspace to make use of.
- Rockchip drm driver merged.
- imx gpu driver moved out of staging
Other stuff:
- core:
panel - MIPI DSI + new panels.
expose suggested x/y properties for virtual GPUs
- i915:
Initial Skylake (SKL) support
gen3/4 reset work
start of dri1/ums removal
infoframe tracking
fixes for lots of things.
- nouveau:
tegra k1 voltage support
GM204 modesetting support
GT21x memory reclocking work
- radeon:
CI dpm fixes
GPUVM improvements
Initial DPM fan control
- rcar-du:
HDMI support added
removed some support for old boards
slave encoder driver for Analog Devices adv7511
- exynos:
Exynos4415 SoC support
- msm:
a4xx gpu support
atomic helper conversion
- tegra:
iommu support
universal plane support
ganged-mode DSI support
- sti:
HDMI i2c improvements
- vmwgfx:
some late fixes.
- qxl:
use suggested x/y properties"
* 'drm-next' of git://people.freedesktop.org/~airlied/linux: (969 commits)
drm: sti: fix module compilation issue
drm/i915: save/restore GMBUS freq across suspend/resume on gen4
drm: sti: correctly cleanup CRTC and planes
drm: sti: add HQVDP plane
drm: sti: add cursor plane
drm: sti: enable auxiliary CRTC
drm: sti: fix delay in VTG programming
drm: sti: prepare sti_tvout to support auxiliary crtc
drm: sti: use drm_crtc_vblank_{on/off} instead of drm_vblank_{on/off}
drm: sti: fix hdmi avi infoframe
drm: sti: remove event lock while disabling vblank
drm: sti: simplify gdp code
drm: sti: clear all mixer control
drm: sti: remove gpio for HDMI hot plug detection
drm: sti: allow to change hdmi ddc i2c adapter
drm/doc: Document drm_add_modes_noedid() usage
drm/i915: Remove '& 0xffff' from the mask given to WA_REG()
drm/i915: Invert the mask and val arguments in wa_add() and WA_REG()
drm: Zero out DRM object memory upon cleanup
drm/i915/bdw: Fix the write setting up the WIZ hashing mode
...
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd')
26 files changed, 8469 insertions, 0 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/Kconfig b/drivers/gpu/drm/amd/amdkfd/Kconfig new file mode 100644 index 000000000000..8dfac37ff327 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/Kconfig @@ -0,0 +1,9 @@ +# +# Heterogenous system architecture configuration +# + +config HSA_AMD + tristate "HSA kernel driver for AMD GPU devices" + depends on DRM_RADEON && AMD_IOMMU_V2 && X86_64 + help + Enable this if you want to use HSA features on AMD GPU devices. diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile new file mode 100644 index 000000000000..be6246de5091 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/Makefile @@ -0,0 +1,14 @@ +# +# Makefile for Heterogenous System Architecture support for AMD GPU devices +# + +ccflags-y := -Iinclude/drm -Idrivers/gpu/drm/amd/include/ + +amdkfd-y := kfd_module.o kfd_device.o kfd_chardev.o kfd_topology.o \ + kfd_pasid.o kfd_doorbell.o kfd_flat_memory.o \ + kfd_process.o kfd_queue.o kfd_mqd_manager.o \ + kfd_kernel_queue.o kfd_packet_manager.o \ + kfd_process_queue_manager.o kfd_device_queue_manager.o \ + kfd_interrupt.o + +obj-$(CONFIG_HSA_AMD) += amdkfd.o diff --git a/drivers/gpu/drm/amd/amdkfd/cik_regs.h b/drivers/gpu/drm/amd/amdkfd/cik_regs.h new file mode 100644 index 000000000000..607fc5ceadbe --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/cik_regs.h @@ -0,0 +1,221 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef CIK_REGS_H +#define CIK_REGS_H + +#define IH_VMID_0_LUT 0x3D40u + +#define BIF_DOORBELL_CNTL 0x530Cu + +#define SRBM_GFX_CNTL 0xE44 +#define PIPEID(x) ((x) << 0) +#define MEID(x) ((x) << 2) +#define VMID(x) ((x) << 4) +#define QUEUEID(x) ((x) << 8) + +#define SQ_CONFIG 0x8C00 + +#define SH_MEM_BASES 0x8C28 +/* if PTR32, these are the bases for scratch and lds */ +#define PRIVATE_BASE(x) ((x) << 0) /* scratch */ +#define SHARED_BASE(x) ((x) << 16) /* LDS */ +#define SH_MEM_APE1_BASE 0x8C2C +/* if PTR32, this is the base location of GPUVM */ +#define SH_MEM_APE1_LIMIT 0x8C30 +/* if PTR32, this is the upper limit of GPUVM */ +#define SH_MEM_CONFIG 0x8C34 +#define PTR32 (1 << 0) +#define PRIVATE_ATC (1 << 1) +#define ALIGNMENT_MODE(x) ((x) << 2) +#define SH_MEM_ALIGNMENT_MODE_DWORD 0 +#define SH_MEM_ALIGNMENT_MODE_DWORD_STRICT 1 +#define SH_MEM_ALIGNMENT_MODE_STRICT 2 +#define SH_MEM_ALIGNMENT_MODE_UNALIGNED 3 +#define DEFAULT_MTYPE(x) ((x) << 4) +#define APE1_MTYPE(x) ((x) << 7) + +/* valid for both DEFAULT_MTYPE and APE1_MTYPE */ +#define MTYPE_CACHED 0 +#define MTYPE_NONCACHED 3 + + +#define SH_STATIC_MEM_CONFIG 0x9604u + +#define TC_CFG_L1_LOAD_POLICY0 0xAC68 +#define TC_CFG_L1_LOAD_POLICY1 0xAC6C +#define TC_CFG_L1_STORE_POLICY 0xAC70 +#define TC_CFG_L2_LOAD_POLICY0 0xAC74 +#define TC_CFG_L2_LOAD_POLICY1 0xAC78 +#define TC_CFG_L2_STORE_POLICY0 0xAC7C +#define TC_CFG_L2_STORE_POLICY1 0xAC80 +#define TC_CFG_L2_ATOMIC_POLICY 0xAC84 +#define TC_CFG_L1_VOLATILE 0xAC88 +#define TC_CFG_L2_VOLATILE 0xAC8C + +#define CP_PQ_WPTR_POLL_CNTL 0xC20C +#define WPTR_POLL_EN (1 << 31) + +#define CPC_INT_CNTL 0xC2D0 +#define CP_ME1_PIPE0_INT_CNTL 0xC214 +#define CP_ME1_PIPE1_INT_CNTL 0xC218 +#define CP_ME1_PIPE2_INT_CNTL 0xC21C +#define CP_ME1_PIPE3_INT_CNTL 0xC220 +#define CP_ME2_PIPE0_INT_CNTL 0xC224 +#define CP_ME2_PIPE1_INT_CNTL 0xC228 +#define CP_ME2_PIPE2_INT_CNTL 0xC22C +#define CP_ME2_PIPE3_INT_CNTL 0xC230 +#define DEQUEUE_REQUEST_INT_ENABLE (1 << 13) +#define WRM_POLL_TIMEOUT_INT_ENABLE (1 << 17) +#define PRIV_REG_INT_ENABLE (1 << 23) +#define TIME_STAMP_INT_ENABLE (1 << 26) +#define GENERIC2_INT_ENABLE (1 << 29) +#define GENERIC1_INT_ENABLE (1 << 30) +#define GENERIC0_INT_ENABLE (1 << 31) +#define CP_ME1_PIPE0_INT_STATUS 0xC214 +#define CP_ME1_PIPE1_INT_STATUS 0xC218 +#define CP_ME1_PIPE2_INT_STATUS 0xC21C +#define CP_ME1_PIPE3_INT_STATUS 0xC220 +#define CP_ME2_PIPE0_INT_STATUS 0xC224 +#define CP_ME2_PIPE1_INT_STATUS 0xC228 +#define CP_ME2_PIPE2_INT_STATUS 0xC22C +#define CP_ME2_PIPE3_INT_STATUS 0xC230 +#define DEQUEUE_REQUEST_INT_STATUS (1 << 13) +#define WRM_POLL_TIMEOUT_INT_STATUS (1 << 17) +#define PRIV_REG_INT_STATUS (1 << 23) +#define TIME_STAMP_INT_STATUS (1 << 26) +#define GENERIC2_INT_STATUS (1 << 29) +#define GENERIC1_INT_STATUS (1 << 30) +#define GENERIC0_INT_STATUS (1 << 31) + +#define CP_HPD_EOP_BASE_ADDR 0xC904 +#define CP_HPD_EOP_BASE_ADDR_HI 0xC908 +#define CP_HPD_EOP_VMID 0xC90C +#define CP_HPD_EOP_CONTROL 0xC910 +#define EOP_SIZE(x) ((x) << 0) +#define EOP_SIZE_MASK (0x3f << 0) +#define CP_MQD_BASE_ADDR 0xC914 +#define CP_MQD_BASE_ADDR_HI 0xC918 +#define CP_HQD_ACTIVE 0xC91C +#define CP_HQD_VMID 0xC920 + +#define CP_HQD_PERSISTENT_STATE 0xC924u +#define DEFAULT_CP_HQD_PERSISTENT_STATE (0x33U << 8) +#define PRELOAD_REQ (1 << 0) + +#define CP_HQD_PIPE_PRIORITY 0xC928u +#define CP_HQD_QUEUE_PRIORITY 0xC92Cu +#define CP_HQD_QUANTUM 0xC930u +#define QUANTUM_EN 1U +#define QUANTUM_SCALE_1MS (1U << 4) +#define QUANTUM_DURATION(x) ((x) << 8) + +#define CP_HQD_PQ_BASE 0xC934 +#define CP_HQD_PQ_BASE_HI 0xC938 +#define CP_HQD_PQ_RPTR 0xC93C +#define CP_HQD_PQ_RPTR_REPORT_ADDR 0xC940 +#define CP_HQD_PQ_RPTR_REPORT_ADDR_HI 0xC944 +#define CP_HQD_PQ_WPTR_POLL_ADDR 0xC948 +#define CP_HQD_PQ_WPTR_POLL_ADDR_HI 0xC94C +#define CP_HQD_PQ_DOORBELL_CONTROL 0xC950 +#define DOORBELL_OFFSET(x) ((x) << 2) +#define DOORBELL_OFFSET_MASK (0x1fffff << 2) +#define DOORBELL_SOURCE (1 << 28) +#define DOORBELL_SCHD_HIT (1 << 29) +#define DOORBELL_EN (1 << 30) +#define DOORBELL_HIT (1 << 31) +#define CP_HQD_PQ_WPTR 0xC954 +#define CP_HQD_PQ_CONTROL 0xC958 +#define QUEUE_SIZE(x) ((x) << 0) +#define QUEUE_SIZE_MASK (0x3f << 0) +#define RPTR_BLOCK_SIZE(x) ((x) << 8) +#define RPTR_BLOCK_SIZE_MASK (0x3f << 8) +#define MIN_AVAIL_SIZE(x) ((x) << 20) +#define PQ_ATC_EN (1 << 23) +#define PQ_VOLATILE (1 << 26) +#define NO_UPDATE_RPTR (1 << 27) +#define UNORD_DISPATCH (1 << 28) +#define ROQ_PQ_IB_FLIP (1 << 29) +#define PRIV_STATE (1 << 30) +#define KMD_QUEUE (1 << 31) + +#define DEFAULT_RPTR_BLOCK_SIZE RPTR_BLOCK_SIZE(5) +#define DEFAULT_MIN_AVAIL_SIZE MIN_AVAIL_SIZE(3) + +#define CP_HQD_IB_BASE_ADDR 0xC95Cu +#define CP_HQD_IB_BASE_ADDR_HI 0xC960u +#define CP_HQD_IB_RPTR 0xC964u +#define CP_HQD_IB_CONTROL 0xC968u +#define IB_ATC_EN (1U << 23) +#define DEFAULT_MIN_IB_AVAIL_SIZE (3U << 20) + +#define CP_HQD_DEQUEUE_REQUEST 0xC974 +#define DEQUEUE_REQUEST_DRAIN 1 +#define DEQUEUE_REQUEST_RESET 2 +#define DEQUEUE_INT (1U << 8) + +#define CP_HQD_SEMA_CMD 0xC97Cu +#define CP_HQD_MSG_TYPE 0xC980u +#define CP_HQD_ATOMIC0_PREOP_LO 0xC984u +#define CP_HQD_ATOMIC0_PREOP_HI 0xC988u +#define CP_HQD_ATOMIC1_PREOP_LO 0xC98Cu +#define CP_HQD_ATOMIC1_PREOP_HI 0xC990u +#define CP_HQD_HQ_SCHEDULER0 0xC994u +#define CP_HQD_HQ_SCHEDULER1 0xC998u + + +#define CP_MQD_CONTROL 0xC99C +#define MQD_VMID(x) ((x) << 0) +#define MQD_VMID_MASK (0xf << 0) +#define MQD_CONTROL_PRIV_STATE_EN (1U << 8) + +#define GRBM_GFX_INDEX 0x30800 +#define INSTANCE_INDEX(x) ((x) << 0) +#define SH_INDEX(x) ((x) << 8) +#define SE_INDEX(x) ((x) << 16) +#define SH_BROADCAST_WRITES (1 << 29) +#define INSTANCE_BROADCAST_WRITES (1 << 30) +#define SE_BROADCAST_WRITES (1 << 31) + +#define SQC_CACHES 0x30d20 +#define SQC_POLICY 0x8C38u +#define SQC_VOLATILE 0x8C3Cu + +#define CP_PERFMON_CNTL 0x36020 + +#define ATC_VMID0_PASID_MAPPING 0x339Cu +#define ATC_VMID_PASID_MAPPING_UPDATE_STATUS 0x3398u +#define ATC_VMID_PASID_MAPPING_VALID (1U << 31) + +#define ATC_VM_APERTURE0_CNTL 0x3310u +#define ATS_ACCESS_MODE_NEVER 0 +#define ATS_ACCESS_MODE_ALWAYS 1 + +#define ATC_VM_APERTURE0_CNTL2 0x3318u +#define ATC_VM_APERTURE0_HIGH_ADDR 0x3308u +#define ATC_VM_APERTURE0_LOW_ADDR 0x3300u +#define ATC_VM_APERTURE1_CNTL 0x3314u +#define ATC_VM_APERTURE1_CNTL2 0x331Cu +#define ATC_VM_APERTURE1_HIGH_ADDR 0x330Cu +#define ATC_VM_APERTURE1_LOW_ADDR 0x3304u + +#endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c new file mode 100644 index 000000000000..4f7b275f2f7b --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -0,0 +1,595 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <linux/device.h> +#include <linux/export.h> +#include <linux/err.h> +#include <linux/fs.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/compat.h> +#include <uapi/linux/kfd_ioctl.h> +#include <linux/time.h> +#include <linux/mm.h> +#include <linux/uaccess.h> +#include <uapi/asm-generic/mman-common.h> +#include <asm/processor.h> +#include "kfd_priv.h" +#include "kfd_device_queue_manager.h" + +static long kfd_ioctl(struct file *, unsigned int, unsigned long); +static int kfd_open(struct inode *, struct file *); +static int kfd_mmap(struct file *, struct vm_area_struct *); + +static const char kfd_dev_name[] = "kfd"; + +static const struct file_operations kfd_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = kfd_ioctl, + .compat_ioctl = kfd_ioctl, + .open = kfd_open, + .mmap = kfd_mmap, +}; + +static int kfd_char_dev_major = -1; +static struct class *kfd_class; +struct device *kfd_device; + +int kfd_chardev_init(void) +{ + int err = 0; + + kfd_char_dev_major = register_chrdev(0, kfd_dev_name, &kfd_fops); + err = kfd_char_dev_major; + if (err < 0) + goto err_register_chrdev; + + kfd_class = class_create(THIS_MODULE, kfd_dev_name); + err = PTR_ERR(kfd_class); + if (IS_ERR(kfd_class)) + goto err_class_create; + + kfd_device = device_create(kfd_class, NULL, + MKDEV(kfd_char_dev_major, 0), + NULL, kfd_dev_name); + err = PTR_ERR(kfd_device); + if (IS_ERR(kfd_device)) + goto err_device_create; + + return 0; + +err_device_create: + class_destroy(kfd_class); +err_class_create: + unregister_chrdev(kfd_char_dev_major, kfd_dev_name); +err_register_chrdev: + return err; +} + +void kfd_chardev_exit(void) +{ + device_destroy(kfd_class, MKDEV(kfd_char_dev_major, 0)); + class_destroy(kfd_class); + unregister_chrdev(kfd_char_dev_major, kfd_dev_name); +} + +struct device *kfd_chardev(void) +{ + return kfd_device; +} + + +static int kfd_open(struct inode *inode, struct file *filep) +{ + struct kfd_process *process; + bool is_32bit_user_mode; + + if (iminor(inode) != 0) + return -ENODEV; + + is_32bit_user_mode = is_compat_task(); + + if (is_32bit_user_mode == true) { + dev_warn(kfd_device, + "Process %d (32-bit) failed to open /dev/kfd\n" + "32-bit processes are not supported by amdkfd\n", + current->pid); + return -EPERM; + } + + process = kfd_create_process(current); + if (IS_ERR(process)) + return PTR_ERR(process); + + process->is_32bit_user_mode = is_32bit_user_mode; + + dev_dbg(kfd_device, "process %d opened, compat mode (32 bit) - %d\n", + process->pasid, process->is_32bit_user_mode); + + kfd_init_apertures(process); + + return 0; +} + +static long kfd_ioctl_get_version(struct file *filep, struct kfd_process *p, + void __user *arg) +{ + struct kfd_ioctl_get_version_args args; + int err = 0; + + args.major_version = KFD_IOCTL_MAJOR_VERSION; + args.minor_version = KFD_IOCTL_MINOR_VERSION; + + if (copy_to_user(arg, &args, sizeof(args))) + err = -EFAULT; + + return err; +} + +static int set_queue_properties_from_user(struct queue_properties *q_properties, + struct kfd_ioctl_create_queue_args *args) +{ + if (args->queue_percentage > KFD_MAX_QUEUE_PERCENTAGE) { + pr_err("kfd: queue percentage must be between 0 to KFD_MAX_QUEUE_PERCENTAGE\n"); + return -EINVAL; + } + + if (args->queue_priority > KFD_MAX_QUEUE_PRIORITY) { + pr_err("kfd: queue priority must be between 0 to KFD_MAX_QUEUE_PRIORITY\n"); + return -EINVAL; + } + + if ((args->ring_base_address) && + (!access_ok(VERIFY_WRITE, + (const void __user *) args->ring_base_address, + sizeof(uint64_t)))) { + pr_err("kfd: can't access ring base address\n"); + return -EFAULT; + } + + if (!is_power_of_2(args->ring_size) && (args->ring_size != 0)) { + pr_err("kfd: ring size must be a power of 2 or 0\n"); + return -EINVAL; + } + + if (!access_ok(VERIFY_WRITE, + (const void __user *) args->read_pointer_address, + sizeof(uint32_t))) { + pr_err("kfd: can't access read pointer\n"); + return -EFAULT; + } + + if (!access_ok(VERIFY_WRITE, + (const void __user *) args->write_pointer_address, + sizeof(uint32_t))) { + pr_err("kfd: can't access write pointer\n"); + return -EFAULT; + } + + q_properties->is_interop = false; + q_properties->queue_percent = args->queue_percentage; + q_properties->priority = args->queue_priority; + q_properties->queue_address = args->ring_base_address; + q_properties->queue_size = args->ring_size; + q_properties->read_ptr = (uint32_t *) args->read_pointer_address; + q_properties->write_ptr = (uint32_t *) args->write_pointer_address; + if (args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE || + args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL) + q_properties->type = KFD_QUEUE_TYPE_COMPUTE; + else + return -ENOTSUPP; + + if (args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL) + q_properties->format = KFD_QUEUE_FORMAT_AQL; + else + q_properties->format = KFD_QUEUE_FORMAT_PM4; + + pr_debug("Queue Percentage (%d, %d)\n", + q_properties->queue_percent, args->queue_percentage); + + pr_debug("Queue Priority (%d, %d)\n", + q_properties->priority, args->queue_priority); + + pr_debug("Queue Address (0x%llX, 0x%llX)\n", + q_properties->queue_address, args->ring_base_address); + + pr_debug("Queue Size (0x%llX, %u)\n", + q_properties->queue_size, args->ring_size); + + pr_debug("Queue r/w Pointers (0x%llX, 0x%llX)\n", + (uint64_t) q_properties->read_ptr, + (uint64_t) q_properties->write_ptr); + + pr_debug("Queue Format (%d)\n", q_properties->format); + + return 0; +} + +static long kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, + void __user *arg) +{ + struct kfd_ioctl_create_queue_args args; + struct kfd_dev *dev; + int err = 0; + unsigned int queue_id; + struct kfd_process_device *pdd; + struct queue_properties q_properties; + + memset(&q_properties, 0, sizeof(struct queue_properties)); + + if (copy_from_user(&args, arg, sizeof(args))) + return -EFAULT; + + pr_debug("kfd: creating queue ioctl\n"); + + err = set_queue_properties_from_user(&q_properties, &args); + if (err) + return err; + + dev = kfd_device_by_id(args.gpu_id); + if (dev == NULL) + return -EINVAL; + + mutex_lock(&p->mutex); + + pdd = kfd_bind_process_to_device(dev, p); + if (IS_ERR(pdd)) { + err = PTR_ERR(pdd); + goto err_bind_process; + } + + pr_debug("kfd: creating queue for PASID %d on GPU 0x%x\n", + p->pasid, + dev->id); + + err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, 0, + KFD_QUEUE_TYPE_COMPUTE, &queue_id); + if (err != 0) + goto err_create_queue; + + args.queue_id = queue_id; + + /* Return gpu_id as doorbell offset for mmap usage */ + args.doorbell_offset = args.gpu_id << PAGE_SHIFT; + + if (copy_to_user(arg, &args, sizeof(args))) { + err = -EFAULT; + goto err_copy_args_out; + } + + mutex_unlock(&p->mutex); + + pr_debug("kfd: queue id %d was created successfully\n", args.queue_id); + + pr_debug("ring buffer address == 0x%016llX\n", + args.ring_base_address); + + pr_debug("read ptr address == 0x%016llX\n", + args.read_pointer_address); + + pr_debug("write ptr address == 0x%016llX\n", + args.write_pointer_address); + + return 0; + +err_copy_args_out: + pqm_destroy_queue(&p->pqm, queue_id); +err_create_queue: +err_bind_process: + mutex_unlock(&p->mutex); + return err; +} + +static int kfd_ioctl_destroy_queue(struct file *filp, struct kfd_process *p, + void __user *arg) +{ + int retval; + struct kfd_ioctl_destroy_queue_args args; + + if (copy_from_user(&args, arg, sizeof(args))) + return -EFAULT; + + pr_debug("kfd: destroying queue id %d for PASID %d\n", + args.queue_id, + p->pasid); + + mutex_lock(&p->mutex); + + retval = pqm_destroy_queue(&p->pqm, args.queue_id); + + mutex_unlock(&p->mutex); + return retval; +} + +static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p, + void __user *arg) +{ + int retval; + struct kfd_ioctl_update_queue_args args; + struct queue_properties properties; + + if (copy_from_user(&args, arg, sizeof(args))) + return -EFAULT; + + if (args.queue_percentage > KFD_MAX_QUEUE_PERCENTAGE) { + pr_err("kfd: queue percentage must be between 0 to KFD_MAX_QUEUE_PERCENTAGE\n"); + return -EINVAL; + } + + if (args.queue_priority > KFD_MAX_QUEUE_PRIORITY) { + pr_err("kfd: queue priority must be between 0 to KFD_MAX_QUEUE_PRIORITY\n"); + return -EINVAL; + } + + if ((args.ring_base_address) && + (!access_ok(VERIFY_WRITE, + (const void __user *) args.ring_base_address, + sizeof(uint64_t)))) { + pr_err("kfd: can't access ring base address\n"); + return -EFAULT; + } + + if (!is_power_of_2(args.ring_size) && (args.ring_size != 0)) { + pr_err("kfd: ring size must be a power of 2 or 0\n"); + return -EINVAL; + } + + properties.queue_address = args.ring_base_address; + properties.queue_size = args.ring_size; + properties.queue_percent = args.queue_percentage; + properties.priority = args.queue_priority; + + pr_debug("kfd: updating queue id %d for PASID %d\n", + args.queue_id, p->pasid); + + mutex_lock(&p->mutex); + + retval = pqm_update_queue(&p->pqm, args.queue_id, &properties); + + mutex_unlock(&p->mutex); + + return retval; +} + +static long kfd_ioctl_set_memory_policy(struct file *filep, + struct kfd_process *p, void __user *arg) +{ + struct kfd_ioctl_set_memory_policy_args args; + struct kfd_dev *dev; + int err = 0; + struct kfd_process_device *pdd; + enum cache_policy default_policy, alternate_policy; + + if (copy_from_user(&args, arg, sizeof(args))) + return -EFAULT; + + if (args.default_policy != KFD_IOC_CACHE_POLICY_COHERENT + && args.default_policy != KFD_IOC_CACHE_POLICY_NONCOHERENT) { + return -EINVAL; + } + + if (args.alternate_policy != KFD_IOC_CACHE_POLICY_COHERENT + && args.alternate_policy != KFD_IOC_CACHE_POLICY_NONCOHERENT) { + return -EINVAL; + } + + dev = kfd_device_by_id(args.gpu_id); + if (dev == NULL) + return -EINVAL; + + mutex_lock(&p->mutex); + + pdd = kfd_bind_process_to_device(dev, p); + if (IS_ERR(pdd)) { + err = PTR_ERR(pdd); + goto out; + } + + default_policy = (args.default_policy == KFD_IOC_CACHE_POLICY_COHERENT) + ? cache_policy_coherent : cache_policy_noncoherent; + + alternate_policy = + (args.alternate_policy == KFD_IOC_CACHE_POLICY_COHERENT) + ? cache_policy_coherent : cache_policy_noncoherent; + + if (!dev->dqm->set_cache_memory_policy(dev->dqm, + &pdd->qpd, + default_policy, + alternate_policy, + (void __user *)args.alternate_aperture_base, + args.alternate_aperture_size)) + err = -EINVAL; + +out: + mutex_unlock(&p->mutex); + + return err; +} + +static long kfd_ioctl_get_clock_counters(struct file *filep, + struct kfd_process *p, void __user *arg) +{ + struct kfd_ioctl_get_clock_counters_args args; + struct kfd_dev *dev; + struct timespec time; + + if (copy_from_user(&args, arg, sizeof(args))) + return -EFAULT; + + dev = kfd_device_by_id(args.gpu_id); + if (dev == NULL) + return -EINVAL; + + /* Reading GPU clock counter from KGD */ + args.gpu_clock_counter = kfd2kgd->get_gpu_clock_counter(dev->kgd); + + /* No access to rdtsc. Using raw monotonic time */ + getrawmonotonic(&time); + args.cpu_clock_counter = (uint64_t)timespec_to_ns(&time); + + get_monotonic_boottime(&time); + args.system_clock_counter = (uint64_t)timespec_to_ns(&time); + + /* Since the counter is in nano-seconds we use 1GHz frequency */ + args.system_clock_freq = 1000000000; + + if (copy_to_user(arg, &args, sizeof(args))) + return -EFAULT; + + return 0; +} + + +static int kfd_ioctl_get_process_apertures(struct file *filp, + struct kfd_process *p, void __user *arg) +{ + struct kfd_ioctl_get_process_apertures_args args; + struct kfd_process_device_apertures *pAperture; + struct kfd_process_device *pdd; + + dev_dbg(kfd_device, "get apertures for PASID %d", p->pasid); + + if (copy_from_user(&args, arg, sizeof(args))) + return -EFAULT; + + args.num_of_nodes = 0; + + mutex_lock(&p->mutex); + + /*if the process-device list isn't empty*/ + if (kfd_has_process_device_data(p)) { + /* Run over all pdd of the process */ + pdd = kfd_get_first_process_device_data(p); + do { + pAperture = &args.process_apertures[args.num_of_nodes]; + pAperture->gpu_id = pdd->dev->id; + pAperture->lds_base = pdd->lds_base; + pAperture->lds_limit = pdd->lds_limit; + pAperture->gpuvm_base = pdd->gpuvm_base; + pAperture->gpuvm_limit = pdd->gpuvm_limit; + pAperture->scratch_base = pdd->scratch_base; + pAperture->scratch_limit = pdd->scratch_limit; + + dev_dbg(kfd_device, + "node id %u\n", args.num_of_nodes); + dev_dbg(kfd_device, + "gpu id %u\n", pdd->dev->id); + dev_dbg(kfd_device, + "lds_base %llX\n", pdd->lds_base); + dev_dbg(kfd_device, + "lds_limit %llX\n", pdd->lds_limit); + dev_dbg(kfd_device, + "gpuvm_base %llX\n", pdd->gpuvm_base); + dev_dbg(kfd_device, + "gpuvm_limit %llX\n", pdd->gpuvm_limit); + dev_dbg(kfd_device, + "scratch_base %llX\n", pdd->scratch_base); + dev_dbg(kfd_device, + "scratch_limit %llX\n", pdd->scratch_limit); + + args.num_of_nodes++; + } while ((pdd = kfd_get_next_process_device_data(p, pdd)) != NULL && + (args.num_of_nodes < NUM_OF_SUPPORTED_GPUS)); + } + + mutex_unlock(&p->mutex); + + if (copy_to_user(arg, &args, sizeof(args))) + return -EFAULT; + + return 0; +} + +static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) +{ + struct kfd_process *process; + long err = -EINVAL; + + dev_dbg(kfd_device, + "ioctl cmd 0x%x (#%d), arg 0x%lx\n", + cmd, _IOC_NR(cmd), arg); + + process = kfd_get_process(current); + if (IS_ERR(process)) + return PTR_ERR(process); + + switch (cmd) { + case KFD_IOC_GET_VERSION: + err = kfd_ioctl_get_version(filep, process, (void __user *)arg); + break; + case KFD_IOC_CREATE_QUEUE: + err = kfd_ioctl_create_queue(filep, process, + (void __user *)arg); + break; + + case KFD_IOC_DESTROY_QUEUE: + err = kfd_ioctl_destroy_queue(filep, process, + (void __user *)arg); + break; + + case KFD_IOC_SET_MEMORY_POLICY: + err = kfd_ioctl_set_memory_policy(filep, process, + (void __user *)arg); + break; + + case KFD_IOC_GET_CLOCK_COUNTERS: + err = kfd_ioctl_get_clock_counters(filep, process, + (void __user *)arg); + break; + + case KFD_IOC_GET_PROCESS_APERTURES: + err = kfd_ioctl_get_process_apertures(filep, process, + (void __user *)arg); + break; + + case KFD_IOC_UPDATE_QUEUE: + err = kfd_ioctl_update_queue(filep, process, + (void __user *)arg); + break; + + default: + dev_err(kfd_device, + "unknown ioctl cmd 0x%x, arg 0x%lx)\n", + cmd, arg); + err = -EINVAL; + break; + } + + if (err < 0) + dev_err(kfd_device, + "ioctl error %ld for ioctl cmd 0x%x (#%d)\n", + err, cmd, _IOC_NR(cmd)); + + return err; +} + +static int kfd_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct kfd_process *process; + + process = kfd_get_process(current); + if (IS_ERR(process)) + return PTR_ERR(process); + + return kfd_doorbell_mmap(process, vma); +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h new file mode 100644 index 000000000000..a374fa3d3ee6 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h @@ -0,0 +1,294 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef KFD_CRAT_H_INCLUDED +#define KFD_CRAT_H_INCLUDED + +#include <linux/types.h> + +#pragma pack(1) + +/* + * 4CC signature values for the CRAT and CDIT ACPI tables + */ + +#define CRAT_SIGNATURE "CRAT" +#define CDIT_SIGNATURE "CDIT" + +/* + * Component Resource Association Table (CRAT) + */ + +#define CRAT_OEMID_LENGTH 6 +#define CRAT_OEMTABLEID_LENGTH 8 +#define CRAT_RESERVED_LENGTH 6 + +#define CRAT_OEMID_64BIT_MASK ((1ULL << (CRAT_OEMID_LENGTH * 8)) - 1) + +struct crat_header { + uint32_t signature; + uint32_t length; + uint8_t revision; + uint8_t checksum; + uint8_t oem_id[CRAT_OEMID_LENGTH]; + uint8_t oem_table_id[CRAT_OEMTABLEID_LENGTH]; + uint32_t oem_revision; + uint32_t creator_id; + uint32_t creator_revision; + uint32_t total_entries; + uint16_t num_domains; + uint8_t reserved[CRAT_RESERVED_LENGTH]; +}; + +/* + * The header structure is immediately followed by total_entries of the + * data definitions + */ + +/* + * The currently defined subtype entries in the CRAT + */ +#define CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY 0 +#define CRAT_SUBTYPE_MEMORY_AFFINITY 1 +#define CRAT_SUBTYPE_CACHE_AFFINITY 2 +#define CRAT_SUBTYPE_TLB_AFFINITY 3 +#define CRAT_SUBTYPE_CCOMPUTE_AFFINITY 4 +#define CRAT_SUBTYPE_IOLINK_AFFINITY 5 +#define CRAT_SUBTYPE_MAX 6 + +#define CRAT_SIBLINGMAP_SIZE 32 + +/* + * ComputeUnit Affinity structure and definitions + */ +#define CRAT_CU_FLAGS_ENABLED 0x00000001 +#define CRAT_CU_FLAGS_HOT_PLUGGABLE 0x00000002 +#define CRAT_CU_FLAGS_CPU_PRESENT 0x00000004 +#define CRAT_CU_FLAGS_GPU_PRESENT 0x00000008 +#define CRAT_CU_FLAGS_IOMMU_PRESENT 0x00000010 +#define CRAT_CU_FLAGS_RESERVED 0xffffffe0 + +#define CRAT_COMPUTEUNIT_RESERVED_LENGTH 4 + +struct crat_subtype_computeunit { + uint8_t type; + uint8_t length; + uint16_t reserved; + uint32_t flags; + uint32_t proximity_domain; + uint32_t processor_id_low; + uint16_t num_cpu_cores; + uint16_t num_simd_cores; + uint16_t max_waves_simd; + uint16_t io_count; + uint16_t hsa_capability; + uint16_t lds_size_in_kb; + uint8_t wave_front_size; + uint8_t num_banks; + uint16_t micro_engine_id; + uint8_t num_arrays; + uint8_t num_cu_per_array; + uint8_t num_simd_per_cu; + uint8_t max_slots_scatch_cu; + uint8_t reserved2[CRAT_COMPUTEUNIT_RESERVED_LENGTH]; +}; + +/* + * HSA Memory Affinity structure and definitions + */ +#define CRAT_MEM_FLAGS_ENABLED 0x00000001 +#define CRAT_MEM_FLAGS_HOT_PLUGGABLE 0x00000002 +#define CRAT_MEM_FLAGS_NON_VOLATILE 0x00000004 +#define CRAT_MEM_FLAGS_RESERVED 0xfffffff8 + +#define CRAT_MEMORY_RESERVED_LENGTH 8 + +struct crat_subtype_memory { + uint8_t type; + uint8_t length; + uint16_t reserved; + uint32_t flags; + uint32_t promixity_domain; + uint32_t base_addr_low; + uint32_t base_addr_high; + uint32_t length_low; + uint32_t length_high; + uint32_t width; + uint8_t reserved2[CRAT_MEMORY_RESERVED_LENGTH]; +}; + +/* + * HSA Cache Affinity structure and definitions + */ +#define CRAT_CACHE_FLAGS_ENABLED 0x00000001 +#define CRAT_CACHE_FLAGS_DATA_CACHE 0x00000002 +#define CRAT_CACHE_FLAGS_INST_CACHE 0x00000004 +#define CRAT_CACHE_FLAGS_CPU_CACHE 0x00000008 +#define CRAT_CACHE_FLAGS_SIMD_CACHE 0x00000010 +#define CRAT_CACHE_FLAGS_RESERVED 0xffffffe0 + +#define CRAT_CACHE_RESERVED_LENGTH 8 + +struct crat_subtype_cache { + uint8_t type; + uint8_t length; + uint16_t reserved; + uint32_t flags; + uint32_t processor_id_low; + uint8_t sibling_map[CRAT_SIBLINGMAP_SIZE]; + uint32_t cache_size; + uint8_t cache_level; + uint8_t lines_per_tag; + uint16_t cache_line_size; + uint8_t associativity; + uint8_t cache_properties; + uint16_t cache_latency; + uint8_t reserved2[CRAT_CACHE_RESERVED_LENGTH]; +}; + +/* + * HSA TLB Affinity structure and definitions + */ +#define CRAT_TLB_FLAGS_ENABLED 0x00000001 +#define CRAT_TLB_FLAGS_DATA_TLB 0x00000002 +#define CRAT_TLB_FLAGS_INST_TLB 0x00000004 +#define CRAT_TLB_FLAGS_CPU_TLB 0x00000008 +#define CRAT_TLB_FLAGS_SIMD_TLB 0x00000010 +#define CRAT_TLB_FLAGS_RESERVED 0xffffffe0 + +#define CRAT_TLB_RESERVED_LENGTH 4 + +struct crat_subtype_tlb { + uint8_t type; + uint8_t length; + uint16_t reserved; + uint32_t flags; + uint32_t processor_id_low; + uint8_t sibling_map[CRAT_SIBLINGMAP_SIZE]; + uint32_t tlb_level; + uint8_t data_tlb_associativity_2mb; + uint8_t data_tlb_size_2mb; + uint8_t instruction_tlb_associativity_2mb; + uint8_t instruction_tlb_size_2mb; + uint8_t data_tlb_associativity_4k; + uint8_t data_tlb_size_4k; + uint8_t instruction_tlb_associativity_4k; + uint8_t instruction_tlb_size_4k; + uint8_t data_tlb_associativity_1gb; + uint8_t data_tlb_size_1gb; + uint8_t instruction_tlb_associativity_1gb; + uint8_t instruction_tlb_size_1gb; + uint8_t reserved2[CRAT_TLB_RESERVED_LENGTH]; +}; + +/* + * HSA CCompute/APU Affinity structure and definitions + */ +#define CRAT_CCOMPUTE_FLAGS_ENABLED 0x00000001 +#define CRAT_CCOMPUTE_FLAGS_RESERVED 0xfffffffe + +#define CRAT_CCOMPUTE_RESERVED_LENGTH 16 + +struct crat_subtype_ccompute { + uint8_t type; + uint8_t length; + uint16_t reserved; + uint32_t flags; + uint32_t processor_id_low; + uint8_t sibling_map[CRAT_SIBLINGMAP_SIZE]; + uint32_t apu_size; + uint8_t reserved2[CRAT_CCOMPUTE_RESERVED_LENGTH]; +}; + +/* + * HSA IO Link Affinity structure and definitions + */ +#define CRAT_IOLINK_FLAGS_ENABLED 0x00000001 +#define CRAT_IOLINK_FLAGS_COHERENCY 0x00000002 +#define CRAT_IOLINK_FLAGS_RESERVED 0xfffffffc + +/* + * IO interface types + */ +#define CRAT_IOLINK_TYPE_UNDEFINED 0 +#define CRAT_IOLINK_TYPE_HYPERTRANSPORT 1 +#define CRAT_IOLINK_TYPE_PCIEXPRESS 2 +#define CRAT_IOLINK_TYPE_OTHER 3 +#define CRAT_IOLINK_TYPE_MAX 255 + +#define CRAT_IOLINK_RESERVED_LENGTH 24 + +struct crat_subtype_iolink { + uint8_t type; + uint8_t length; + uint16_t reserved; + uint32_t flags; + uint32_t proximity_domain_from; + uint32_t proximity_domain_to; + uint8_t io_interface_type; + uint8_t version_major; + uint16_t version_minor; + uint32_t minimum_latency; + uint32_t maximum_latency; + uint32_t minimum_bandwidth_mbs; + uint32_t maximum_bandwidth_mbs; + uint32_t recommended_transfer_size; + uint8_t reserved2[CRAT_IOLINK_RESERVED_LENGTH]; +}; + +/* + * HSA generic sub-type header + */ + +#define CRAT_SUBTYPE_FLAGS_ENABLED 0x00000001 + +struct crat_subtype_generic { + uint8_t type; + uint8_t length; + uint16_t reserved; + uint32_t flags; +}; + +/* + * Component Locality Distance Information Table (CDIT) + */ +#define CDIT_OEMID_LENGTH 6 +#define CDIT_OEMTABLEID_LENGTH 8 + +struct cdit_header { + uint32_t signature; + uint32_t length; + uint8_t revision; + uint8_t checksum; + uint8_t oem_id[CDIT_OEMID_LENGTH]; + uint8_t oem_table_id[CDIT_OEMTABLEID_LENGTH]; + uint32_t oem_revision; + uint32_t creator_id; + uint32_t creator_revision; + uint32_t total_entries; + uint16_t num_domains; + uint8_t entry[1]; +}; + +#pragma pack() + +#endif /* KFD_CRAT_H_INCLUDED */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c new file mode 100644 index 000000000000..43884ebd4303 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -0,0 +1,308 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <linux/amd-iommu.h> +#include <linux/bsearch.h> +#include <linux/pci.h> +#include <linux/slab.h> +#include "kfd_priv.h" +#include "kfd_device_queue_manager.h" + +#define MQD_SIZE_ALIGNED 768 + +static const struct kfd_device_info kaveri_device_info = { + .max_pasid_bits = 16, + .ih_ring_entry_size = 4 * sizeof(uint32_t), + .mqd_size_aligned = MQD_SIZE_ALIGNED +}; + +struct kfd_deviceid { + unsigned short did; + const struct kfd_device_info *device_info; +}; + +/* Please keep this sorted by increasing device id. */ +static const struct kfd_deviceid supported_devices[] = { + { 0x1304, &kaveri_device_info }, /* Kaveri */ + { 0x1305, &kaveri_device_info }, /* Kaveri */ + { 0x1306, &kaveri_device_info }, /* Kaveri */ + { 0x1307, &kaveri_device_info }, /* Kaveri */ + { 0x1309, &kaveri_device_info }, /* Kaveri */ + { 0x130A, &kaveri_device_info }, /* Kaveri */ + { 0x130B, &kaveri_device_info }, /* Kaveri */ + { 0x130C, &kaveri_device_info }, /* Kaveri */ + { 0x130D, &kaveri_device_info }, /* Kaveri */ + { 0x130E, &kaveri_device_info }, /* Kaveri */ + { 0x130F, &kaveri_device_info }, /* Kaveri */ + { 0x1310, &kaveri_device_info }, /* Kaveri */ + { 0x1311, &kaveri_device_info }, /* Kaveri */ + { 0x1312, &kaveri_device_info }, /* Kaveri */ + { 0x1313, &kaveri_device_info }, /* Kaveri */ + { 0x1315, &kaveri_device_info }, /* Kaveri */ + { 0x1316, &kaveri_device_info }, /* Kaveri */ + { 0x1317, &kaveri_device_info }, /* Kaveri */ + { 0x1318, &kaveri_device_info }, /* Kaveri */ + { 0x131B, &kaveri_device_info }, /* Kaveri */ + { 0x131C, &kaveri_device_info }, /* Kaveri */ + { 0x131D, &kaveri_device_info }, /* Kaveri */ +}; + +static const struct kfd_device_info *lookup_device_info(unsigned short did) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(supported_devices); i++) { + if (supported_devices[i].did == did) { + BUG_ON(supported_devices[i].device_info == NULL); + return supported_devices[i].device_info; + } + } + + return NULL; +} + +struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, struct pci_dev *pdev) +{ + struct kfd_dev *kfd; + + const struct kfd_device_info *device_info = + lookup_device_info(pdev->device); + + if (!device_info) + return NULL; + + kfd = kzalloc(sizeof(*kfd), GFP_KERNEL); + if (!kfd) + return NULL; + + kfd->kgd = kgd; + kfd->device_info = device_info; + kfd->pdev = pdev; + kfd->init_complete = false; + + return kfd; +} + +static bool device_iommu_pasid_init(struct kfd_dev *kfd) +{ + const u32 required_iommu_flags = AMD_IOMMU_DEVICE_FLAG_ATS_SUP | + AMD_IOMMU_DEVICE_FLAG_PRI_SUP | + AMD_IOMMU_DEVICE_FLAG_PASID_SUP; + + struct amd_iommu_device_info iommu_info; + unsigned int pasid_limit; + int err; + + err = amd_iommu_device_info(kfd->pdev, &iommu_info); + if (err < 0) { + dev_err(kfd_device, + "error getting iommu info. is the iommu enabled?\n"); + return false; + } + + if ((iommu_info.flags & required_iommu_flags) != required_iommu_flags) { + dev_err(kfd_device, "error required iommu flags ats(%i), pri(%i), pasid(%i)\n", + (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_ATS_SUP) != 0, + (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_PRI_SUP) != 0, + (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP) != 0); + return false; + } + + pasid_limit = min_t(unsigned int, + (unsigned int)1 << kfd->device_info->max_pasid_bits, + iommu_info.max_pasids); + /* + * last pasid is used for kernel queues doorbells + * in the future the last pasid might be used for a kernel thread. + */ + pasid_limit = min_t(unsigned int, + pasid_limit, + kfd->doorbell_process_limit - 1); + + err = amd_iommu_init_device(kfd->pdev, pasid_limit); + if (err < 0) { + dev_err(kfd_device, "error initializing iommu device\n"); + return false; + } + + if (!kfd_set_pasid_limit(pasid_limit)) { + dev_err(kfd_device, "error setting pasid limit\n"); + amd_iommu_free_device(kfd->pdev); + return false; + } + + return true; +} + +static void iommu_pasid_shutdown_callback(struct pci_dev *pdev, int pasid) +{ + struct kfd_dev *dev = kfd_device_by_pci_dev(pdev); + + if (dev) + kfd_unbind_process_from_device(dev, pasid); +} + +bool kgd2kfd_device_init(struct kfd_dev *kfd, + const struct kgd2kfd_shared_resources *gpu_resources) +{ + unsigned int size; + + kfd->shared_resources = *gpu_resources; + + /* calculate max size of mqds needed for queues */ + size = max_num_of_processes * + max_num_of_queues_per_process * + kfd->device_info->mqd_size_aligned; + + /* add another 512KB for all other allocations on gart */ + size += 512 * 1024; + + if (kfd2kgd->init_sa_manager(kfd->kgd, size)) { + dev_err(kfd_device, + "Error initializing sa manager for device (%x:%x)\n", + kfd->pdev->vendor, kfd->pdev->device); + goto out; + } + + kfd_doorbell_init(kfd); + + if (kfd_topology_add_device(kfd) != 0) { + dev_err(kfd_device, + "Error adding device (%x:%x) to topology\n", + kfd->pdev->vendor, kfd->pdev->device); + goto kfd_topology_add_device_error; + } + + if (kfd_interrupt_init(kfd)) { + dev_err(kfd_device, + "Error initializing interrupts for device (%x:%x)\n", + kfd->pdev->vendor, kfd->pdev->device); + goto kfd_interrupt_error; + } + + if (!device_iommu_pasid_init(kfd)) { + dev_err(kfd_device, + "Error initializing iommuv2 for device (%x:%x)\n", + kfd->pdev->vendor, kfd->pdev->device); + goto device_iommu_pasid_error; + } + amd_iommu_set_invalidate_ctx_cb(kfd->pdev, + iommu_pasid_shutdown_callback); + + kfd->dqm = device_queue_manager_init(kfd); + if (!kfd->dqm) { + dev_err(kfd_device, + "Error initializing queue manager for device (%x:%x)\n", + kfd->pdev->vendor, kfd->pdev->device); + goto device_queue_manager_error; + } + + if (kfd->dqm->start(kfd->dqm) != 0) { + dev_err(kfd_device, + "Error starting queuen manager for device (%x:%x)\n", + kfd->pdev->vendor, kfd->pdev->device); + goto dqm_start_error; + } + + kfd->init_complete = true; + dev_info(kfd_device, "added device (%x:%x)\n", kfd->pdev->vendor, + kfd->pdev->device); + + pr_debug("kfd: Starting kfd with the following scheduling policy %d\n", + sched_policy); + + goto out; + +dqm_start_error: + device_queue_manager_uninit(kfd->dqm); +device_queue_manager_error: + amd_iommu_free_device(kfd->pdev); +device_iommu_pasid_error: + kfd_interrupt_exit(kfd); +kfd_interrupt_error: + kfd_topology_remove_device(kfd); +kfd_topology_add_device_error: + kfd2kgd->fini_sa_manager(kfd->kgd); + dev_err(kfd_device, + "device (%x:%x) NOT added due to errors\n", + kfd->pdev->vendor, kfd->pdev->device); +out: + return kfd->init_complete; +} + +void kgd2kfd_device_exit(struct kfd_dev *kfd) +{ + if (kfd->init_complete) { + device_queue_manager_uninit(kfd->dqm); + amd_iommu_free_device(kfd->pdev); + kfd_interrupt_exit(kfd); + kfd_topology_remove_device(kfd); + } + + kfree(kfd); +} + +void kgd2kfd_suspend(struct kfd_dev *kfd) +{ + BUG_ON(kfd == NULL); + + if (kfd->init_complete) { + kfd->dqm->stop(kfd->dqm); + amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL); + amd_iommu_free_device(kfd->pdev); + } +} + +int kgd2kfd_resume(struct kfd_dev *kfd) +{ + unsigned int pasid_limit; + int err; + + BUG_ON(kfd == NULL); + + pasid_limit = kfd_get_pasid_limit(); + + if (kfd->init_complete) { + err = amd_iommu_init_device(kfd->pdev, pasid_limit); + if (err < 0) + return -ENXIO; + amd_iommu_set_invalidate_ctx_cb(kfd->pdev, + iommu_pasid_shutdown_callback); + kfd->dqm->start(kfd->dqm); + } + + return 0; +} + +/* This is called directly from KGD at ISR. */ +void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry) +{ + if (kfd->init_complete) { + spin_lock(&kfd->interrupt_lock); + + if (kfd->interrupts_active + && enqueue_ih_ring_entry(kfd, ih_ring_entry)) + schedule_work(&kfd->interrupt_work); + + spin_unlock(&kfd->interrupt_lock); + } +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c new file mode 100644 index 000000000000..924e90c072e5 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -0,0 +1,1062 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include <linux/slab.h> +#include <linux/list.h> +#include <linux/types.h> +#include <linux/printk.h> +#include <linux/bitops.h> +#include "kfd_priv.h" +#include "kfd_device_queue_manager.h" +#include "kfd_mqd_manager.h" +#include "cik_regs.h" +#include "kfd_kernel_queue.h" +#include "../../radeon/cik_reg.h" + +/* Size of the per-pipe EOP queue */ +#define CIK_HPD_EOP_BYTES_LOG2 11 +#define CIK_HPD_EOP_BYTES (1U << CIK_HPD_EOP_BYTES_LOG2) + +static bool is_mem_initialized; + +static int init_memory(struct device_queue_manager *dqm); +static int set_pasid_vmid_mapping(struct device_queue_manager *dqm, + unsigned int pasid, unsigned int vmid); + +static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, + struct queue *q, + struct qcm_process_device *qpd); +static int execute_queues_cpsch(struct device_queue_manager *dqm, bool lock); +static int destroy_queues_cpsch(struct device_queue_manager *dqm, bool lock); + + +static inline unsigned int get_pipes_num(struct device_queue_manager *dqm) +{ + BUG_ON(!dqm || !dqm->dev); + return dqm->dev->shared_resources.compute_pipe_count; +} + +static inline unsigned int get_first_pipe(struct device_queue_manager *dqm) +{ + BUG_ON(!dqm); + return dqm->dev->shared_resources.first_compute_pipe; +} + +static inline unsigned int get_pipes_num_cpsch(void) +{ + return PIPE_PER_ME_CP_SCHEDULING; +} + +static inline unsigned int +get_sh_mem_bases_nybble_64(struct kfd_process_device *pdd) +{ + uint32_t nybble; + + nybble = (pdd->lds_base >> 60) & 0x0E; + + return nybble; + +} + +static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd) +{ + unsigned int shared_base; + + shared_base = (pdd->lds_base >> 16) & 0xFF; + + return shared_base; +} + +static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble); +static void init_process_memory(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + struct kfd_process_device *pdd; + unsigned int temp; + + BUG_ON(!dqm || !qpd); + + pdd = qpd_to_pdd(qpd); + + /* check if sh_mem_config register already configured */ + if (qpd->sh_mem_config == 0) { + qpd->sh_mem_config = + ALIGNMENT_MODE(SH_MEM_ALIGNMENT_MODE_UNALIGNED) | + DEFAULT_MTYPE(MTYPE_NONCACHED) | + APE1_MTYPE(MTYPE_NONCACHED); + qpd->sh_mem_ape1_limit = 0; + qpd->sh_mem_ape1_base = 0; + } + + if (qpd->pqm->process->is_32bit_user_mode) { + temp = get_sh_mem_bases_32(pdd); + qpd->sh_mem_bases = SHARED_BASE(temp); + qpd->sh_mem_config |= PTR32; + } else { + temp = get_sh_mem_bases_nybble_64(pdd); + qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp); + } + + pr_debug("kfd: is32bit process: %d sh_mem_bases nybble: 0x%X and register 0x%X\n", + qpd->pqm->process->is_32bit_user_mode, temp, qpd->sh_mem_bases); +} + +static void program_sh_mem_settings(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + return kfd2kgd->program_sh_mem_settings(dqm->dev->kgd, qpd->vmid, + qpd->sh_mem_config, + qpd->sh_mem_ape1_base, + qpd->sh_mem_ape1_limit, + qpd->sh_mem_bases); +} + +static int allocate_vmid(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + struct queue *q) +{ + int bit, allocated_vmid; + + if (dqm->vmid_bitmap == 0) + return -ENOMEM; + + bit = find_first_bit((unsigned long *)&dqm->vmid_bitmap, CIK_VMID_NUM); + clear_bit(bit, (unsigned long *)&dqm->vmid_bitmap); + + /* Kaveri kfd vmid's starts from vmid 8 */ + allocated_vmid = bit + KFD_VMID_START_OFFSET; + pr_debug("kfd: vmid allocation %d\n", allocated_vmid); + qpd->vmid = allocated_vmid; + q->properties.vmid = allocated_vmid; + + set_pasid_vmid_mapping(dqm, q->process->pasid, q->properties.vmid); + program_sh_mem_settings(dqm, qpd); + + return 0; +} + +static void deallocate_vmid(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + struct queue *q) +{ + int bit = qpd->vmid - KFD_VMID_START_OFFSET; + + set_bit(bit, (unsigned long *)&dqm->vmid_bitmap); + qpd->vmid = 0; + q->properties.vmid = 0; +} + +static int create_queue_nocpsch(struct device_queue_manager *dqm, + struct queue *q, + struct qcm_process_device *qpd, + int *allocated_vmid) +{ + int retval; + + BUG_ON(!dqm || !q || !qpd || !allocated_vmid); + + pr_debug("kfd: In func %s\n", __func__); + print_queue(q); + + mutex_lock(&dqm->lock); + + if (list_empty(&qpd->queues_list)) { + retval = allocate_vmid(dqm, qpd, q); + if (retval != 0) { + mutex_unlock(&dqm->lock); + return retval; + } + } + *allocated_vmid = qpd->vmid; + q->properties.vmid = qpd->vmid; + + retval = create_compute_queue_nocpsch(dqm, q, qpd); + + if (retval != 0) { + if (list_empty(&qpd->queues_list)) { + deallocate_vmid(dqm, qpd, q); + *allocated_vmid = 0; + } + mutex_unlock(&dqm->lock); + return retval; + } + + list_add(&q->list, &qpd->queues_list); + dqm->queue_count++; + + mutex_unlock(&dqm->lock); + return 0; +} + +static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q) +{ + bool set; + int pipe, bit; + + set = false; + + for (pipe = dqm->next_pipe_to_allocate; pipe < get_pipes_num(dqm); + pipe = (pipe + 1) % get_pipes_num(dqm)) { + if (dqm->allocated_queues[pipe] != 0) { + bit = find_first_bit( + (unsigned long *)&dqm->allocated_queues[pipe], + QUEUES_PER_PIPE); + + clear_bit(bit, + (unsigned long *)&dqm->allocated_queues[pipe]); + q->pipe = pipe; + q->queue = bit; + set = true; + break; + } + } + + if (set == false) + return -EBUSY; + + pr_debug("kfd: DQM %s hqd slot - pipe (%d) queue(%d)\n", + __func__, q->pipe, q->queue); + /* horizontal hqd allocation */ + dqm->next_pipe_to_allocate = (pipe + 1) % get_pipes_num(dqm); + + return 0; +} + +static inline void deallocate_hqd(struct device_queue_manager *dqm, + struct queue *q) +{ + set_bit(q->queue, (unsigned long *)&dqm->allocated_queues[q->pipe]); +} + +static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, + struct queue *q, + struct qcm_process_device *qpd) +{ + int retval; + struct mqd_manager *mqd; + + BUG_ON(!dqm || !q || !qpd); + + mqd = dqm->get_mqd_manager(dqm, KFD_MQD_TYPE_CIK_COMPUTE); + if (mqd == NULL) + return -ENOMEM; + + retval = allocate_hqd(dqm, q); + if (retval != 0) + return retval; + + retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, + &q->gart_mqd_addr, &q->properties); + if (retval != 0) { + deallocate_hqd(dqm, q); + return retval; + } + + return 0; +} + +static int destroy_queue_nocpsch(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + struct queue *q) +{ + int retval; + struct mqd_manager *mqd; + + BUG_ON(!dqm || !q || !q->mqd || !qpd); + + retval = 0; + + pr_debug("kfd: In Func %s\n", __func__); + + mutex_lock(&dqm->lock); + mqd = dqm->get_mqd_manager(dqm, KFD_MQD_TYPE_CIK_COMPUTE); + if (mqd == NULL) { + retval = -ENOMEM; + goto out; + } + + retval = mqd->destroy_mqd(mqd, q->mqd, + KFD_PREEMPT_TYPE_WAVEFRONT, + QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS, + q->pipe, q->queue); + + if (retval != 0) + goto out; + + deallocate_hqd(dqm, q); + + mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); + + list_del(&q->list); + if (list_empty(&qpd->queues_list)) + deallocate_vmid(dqm, qpd, q); + dqm->queue_count--; +out: + mutex_unlock(&dqm->lock); + return retval; +} + +static int update_queue(struct device_queue_manager *dqm, struct queue *q) +{ + int retval; + struct mqd_manager *mqd; + + BUG_ON(!dqm || !q || !q->mqd); + + mutex_lock(&dqm->lock); + mqd = dqm->get_mqd_manager(dqm, KFD_MQD_TYPE_CIK_COMPUTE); + if (mqd == NULL) { + mutex_unlock(&dqm->lock); + return -ENOMEM; + } + + retval = mqd->update_mqd(mqd, q->mqd, &q->properties); + if (q->properties.is_active == true) + dqm->queue_count++; + else + dqm->queue_count--; + + if (sched_policy != KFD_SCHED_POLICY_NO_HWS) + retval = execute_queues_cpsch(dqm, false); + + mutex_unlock(&dqm->lock); + return retval; +} + +static struct mqd_manager *get_mqd_manager_nocpsch( + struct device_queue_manager *dqm, enum KFD_MQD_TYPE type) +{ + struct mqd_manager *mqd; + + BUG_ON(!dqm || type >= KFD_MQD_TYPE_MAX); + + pr_debug("kfd: In func %s mqd type %d\n", __func__, type); + + mqd = dqm->mqds[type]; + if (!mqd) { + mqd = mqd_manager_init(type, dqm->dev); + if (mqd == NULL) + pr_err("kfd: mqd manager is NULL"); + dqm->mqds[type] = mqd; + } + + return mqd; +} + +static int register_process_nocpsch(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + struct device_process_node *n; + + BUG_ON(!dqm || !qpd); + + pr_debug("kfd: In func %s\n", __func__); + + n = kzalloc(sizeof(struct device_process_node), GFP_KERNEL); + if (!n) + return -ENOMEM; + + n->qpd = qpd; + + mutex_lock(&dqm->lock); + list_add(&n->list, &dqm->queues); + + init_process_memory(dqm, qpd); + dqm->processes_count++; + + mutex_unlock(&dqm->lock); + + return 0; +} + +static int unregister_process_nocpsch(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + int retval; + struct device_process_node *cur, *next; + + BUG_ON(!dqm || !qpd); + + BUG_ON(!list_empty(&qpd->queues_list)); + + pr_debug("kfd: In func %s\n", __func__); + + retval = 0; + mutex_lock(&dqm->lock); + + list_for_each_entry_safe(cur, next, &dqm->queues, list) { + if (qpd == cur->qpd) { + list_del(&cur->list); + kfree(cur); + dqm->processes_count--; + goto out; + } + } + /* qpd not found in dqm list */ + retval = 1; +out: + mutex_unlock(&dqm->lock); + return retval; +} + +static int +set_pasid_vmid_mapping(struct device_queue_manager *dqm, unsigned int pasid, + unsigned int vmid) +{ + uint32_t pasid_mapping; + + pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid | + ATC_VMID_PASID_MAPPING_VALID; + return kfd2kgd->set_pasid_vmid_mapping(dqm->dev->kgd, pasid_mapping, + vmid); +} + +static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble) +{ + /* In 64-bit mode, we can only control the top 3 bits of the LDS, + * scratch and GPUVM apertures. + * The hardware fills in the remaining 59 bits according to the + * following pattern: + * LDS: X0000000'00000000 - X0000001'00000000 (4GB) + * Scratch: X0000001'00000000 - X0000002'00000000 (4GB) + * GPUVM: Y0010000'00000000 - Y0020000'00000000 (1TB) + * + * (where X/Y is the configurable nybble with the low-bit 0) + * + * LDS and scratch will have the same top nybble programmed in the + * top 3 bits of SH_MEM_BASES.PRIVATE_BASE. + * GPUVM can have a different top nybble programmed in the + * top 3 bits of SH_MEM_BASES.SHARED_BASE. + * We don't bother to support different top nybbles + * for LDS/Scratch and GPUVM. + */ + + BUG_ON((top_address_nybble & 1) || top_address_nybble > 0xE || + top_address_nybble == 0); + + return PRIVATE_BASE(top_address_nybble << 12) | + SHARED_BASE(top_address_nybble << 12); +} + +static int init_memory(struct device_queue_manager *dqm) +{ + int i, retval; + + for (i = 8; i < 16; i++) + set_pasid_vmid_mapping(dqm, 0, i); + + retval = kfd2kgd->init_memory(dqm->dev->kgd); + if (retval == 0) + is_mem_initialized = true; + return retval; +} + + +static int init_pipelines(struct device_queue_manager *dqm, + unsigned int pipes_num, unsigned int first_pipe) +{ + void *hpdptr; + struct mqd_manager *mqd; + unsigned int i, err, inx; + uint64_t pipe_hpd_addr; + + BUG_ON(!dqm || !dqm->dev); + + pr_debug("kfd: In func %s\n", __func__); + + /* + * Allocate memory for the HPDs. This is hardware-owned per-pipe data. + * The driver never accesses this memory after zeroing it. + * It doesn't even have to be saved/restored on suspend/resume + * because it contains no data when there are no active queues. + */ + + err = kfd2kgd->allocate_mem(dqm->dev->kgd, + CIK_HPD_EOP_BYTES * pipes_num, + PAGE_SIZE, + KFD_MEMPOOL_SYSTEM_WRITECOMBINE, + (struct kgd_mem **) &dqm->pipeline_mem); + + if (err) { + pr_err("kfd: error allocate vidmem num pipes: %d\n", + pipes_num); + return -ENOMEM; + } + + hpdptr = dqm->pipeline_mem->cpu_ptr; + dqm->pipelines_addr = dqm->pipeline_mem->gpu_addr; + + memset(hpdptr, 0, CIK_HPD_EOP_BYTES * pipes_num); + + mqd = dqm->get_mqd_manager(dqm, KFD_MQD_TYPE_CIK_COMPUTE); + if (mqd == NULL) { + kfd2kgd->free_mem(dqm->dev->kgd, + (struct kgd_mem *) dqm->pipeline_mem); + return -ENOMEM; + } + + for (i = 0; i < pipes_num; i++) { + inx = i + first_pipe; + pipe_hpd_addr = dqm->pipelines_addr + i * CIK_HPD_EOP_BYTES; + pr_debug("kfd: pipeline address %llX\n", pipe_hpd_addr); + /* = log2(bytes/4)-1 */ + kfd2kgd->init_pipeline(dqm->dev->kgd, i, + CIK_HPD_EOP_BYTES_LOG2 - 3, pipe_hpd_addr); + } + + return 0; +} + + +static int init_scheduler(struct device_queue_manager *dqm) +{ + int retval; + + BUG_ON(!dqm); + + pr_debug("kfd: In %s\n", __func__); + + retval = init_pipelines(dqm, get_pipes_num(dqm), KFD_DQM_FIRST_PIPE); + if (retval != 0) + return retval; + + retval = init_memory(dqm); + + return retval; +} + +static int initialize_nocpsch(struct device_queue_manager *dqm) +{ + int i; + + BUG_ON(!dqm); + + pr_debug("kfd: In func %s num of pipes: %d\n", + __func__, get_pipes_num(dqm)); + + mutex_init(&dqm->lock); + INIT_LIST_HEAD(&dqm->queues); + dqm->queue_count = dqm->next_pipe_to_allocate = 0; + dqm->allocated_queues = kcalloc(get_pipes_num(dqm), + sizeof(unsigned int), GFP_KERNEL); + if (!dqm->allocated_queues) { + mutex_destroy(&dqm->lock); + return -ENOMEM; + } + + for (i = 0; i < get_pipes_num(dqm); i++) + dqm->allocated_queues[i] = (1 << QUEUES_PER_PIPE) - 1; + + dqm->vmid_bitmap = (1 << VMID_PER_DEVICE) - 1; + + init_scheduler(dqm); + return 0; +} + +static void uninitialize_nocpsch(struct device_queue_manager *dqm) +{ + int i; + + BUG_ON(!dqm); + + BUG_ON(dqm->queue_count > 0 || dqm->processes_count > 0); + + kfree(dqm->allocated_queues); + for (i = 0 ; i < KFD_MQD_TYPE_MAX ; i++) + kfree(dqm->mqds[i]); + mutex_destroy(&dqm->lock); + kfd2kgd->free_mem(dqm->dev->kgd, + (struct kgd_mem *) dqm->pipeline_mem); +} + +static int start_nocpsch(struct device_queue_manager *dqm) +{ + return 0; +} + +static int stop_nocpsch(struct device_queue_manager *dqm) +{ + return 0; +} + +/* + * Device Queue Manager implementation for cp scheduler + */ + +static int set_sched_resources(struct device_queue_manager *dqm) +{ + struct scheduling_resources res; + unsigned int queue_num, queue_mask; + + BUG_ON(!dqm); + + pr_debug("kfd: In func %s\n", __func__); + + queue_num = get_pipes_num_cpsch() * QUEUES_PER_PIPE; + queue_mask = (1 << queue_num) - 1; + res.vmid_mask = (1 << VMID_PER_DEVICE) - 1; + res.vmid_mask <<= KFD_VMID_START_OFFSET; + res.queue_mask = queue_mask << (get_first_pipe(dqm) * QUEUES_PER_PIPE); + res.gws_mask = res.oac_mask = res.gds_heap_base = + res.gds_heap_size = 0; + + pr_debug("kfd: scheduling resources:\n" + " vmid mask: 0x%8X\n" + " queue mask: 0x%8llX\n", + res.vmid_mask, res.queue_mask); + + return pm_send_set_resources(&dqm->packets, &res); +} + +static int initialize_cpsch(struct device_queue_manager *dqm) +{ + int retval; + + BUG_ON(!dqm); + + pr_debug("kfd: In func %s num of pipes: %d\n", + __func__, get_pipes_num_cpsch()); + + mutex_init(&dqm->lock); + INIT_LIST_HEAD(&dqm->queues); + dqm->queue_count = dqm->processes_count = 0; + dqm->active_runlist = false; + retval = init_pipelines(dqm, get_pipes_num(dqm), 0); + if (retval != 0) + goto fail_init_pipelines; + + return 0; + +fail_init_pipelines: + mutex_destroy(&dqm->lock); + return retval; +} + +static int start_cpsch(struct device_queue_manager *dqm) +{ + struct device_process_node *node; + int retval; + + BUG_ON(!dqm); + + retval = 0; + + retval = pm_init(&dqm->packets, dqm); + if (retval != 0) + goto fail_packet_manager_init; + + retval = set_sched_resources(dqm); + if (retval != 0) + goto fail_set_sched_resources; + + pr_debug("kfd: allocating fence memory\n"); + + /* allocate fence memory on the gart */ + retval = kfd2kgd->allocate_mem(dqm->dev->kgd, + sizeof(*dqm->fence_addr), + 32, + KFD_MEMPOOL_SYSTEM_WRITECOMBINE, + (struct kgd_mem **) &dqm->fence_mem); + + if (retval != 0) + goto fail_allocate_vidmem; + + dqm->fence_addr = dqm->fence_mem->cpu_ptr; + dqm->fence_gpu_addr = dqm->fence_mem->gpu_addr; + + list_for_each_entry(node, &dqm->queues, list) + if (node->qpd->pqm->process && dqm->dev) + kfd_bind_process_to_device(dqm->dev, + node->qpd->pqm->process); + + execute_queues_cpsch(dqm, true); + + return 0; +fail_allocate_vidmem: +fail_set_sched_resources: + pm_uninit(&dqm->packets); +fail_packet_manager_init: + return retval; +} + +static int stop_cpsch(struct device_queue_manager *dqm) +{ + struct device_process_node *node; + struct kfd_process_device *pdd; + + BUG_ON(!dqm); + + destroy_queues_cpsch(dqm, true); + + list_for_each_entry(node, &dqm->queues, list) { + pdd = qpd_to_pdd(node->qpd); + pdd->bound = false; + } + kfd2kgd->free_mem(dqm->dev->kgd, + (struct kgd_mem *) dqm->fence_mem); + pm_uninit(&dqm->packets); + + return 0; +} + +static int create_kernel_queue_cpsch(struct device_queue_manager *dqm, + struct kernel_queue *kq, + struct qcm_process_device *qpd) +{ + BUG_ON(!dqm || !kq || !qpd); + + pr_debug("kfd: In func %s\n", __func__); + + mutex_lock(&dqm->lock); + list_add(&kq->list, &qpd->priv_queue_list); + dqm->queue_count++; + qpd->is_debug = true; + execute_queues_cpsch(dqm, false); + mutex_unlock(&dqm->lock); + + return 0; +} + +static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm, + struct kernel_queue *kq, + struct qcm_process_device *qpd) +{ + BUG_ON(!dqm || !kq); + + pr_debug("kfd: In %s\n", __func__); + + mutex_lock(&dqm->lock); + destroy_queues_cpsch(dqm, false); + list_del(&kq->list); + dqm->queue_count--; + qpd->is_debug = false; + execute_queues_cpsch(dqm, false); + mutex_unlock(&dqm->lock); +} + +static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd, int *allocate_vmid) +{ + int retval; + struct mqd_manager *mqd; + + BUG_ON(!dqm || !q || !qpd); + + retval = 0; + + if (allocate_vmid) + *allocate_vmid = 0; + + mutex_lock(&dqm->lock); + + mqd = dqm->get_mqd_manager(dqm, KFD_MQD_TYPE_CIK_CP); + if (mqd == NULL) { + mutex_unlock(&dqm->lock); + return -ENOMEM; + } + + retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, + &q->gart_mqd_addr, &q->properties); + if (retval != 0) + goto out; + + list_add(&q->list, &qpd->queues_list); + if (q->properties.is_active) { + dqm->queue_count++; + retval = execute_queues_cpsch(dqm, false); + } + +out: + mutex_unlock(&dqm->lock); + return retval; +} + +static int fence_wait_timeout(unsigned int *fence_addr, + unsigned int fence_value, + unsigned long timeout) +{ + BUG_ON(!fence_addr); + timeout += jiffies; + + while (*fence_addr != fence_value) { + if (time_after(jiffies, timeout)) { + pr_err("kfd: qcm fence wait loop timeout expired\n"); + return -ETIME; + } + cpu_relax(); + } + + return 0; +} + +static int destroy_queues_cpsch(struct device_queue_manager *dqm, bool lock) +{ + int retval; + + BUG_ON(!dqm); + + retval = 0; + + if (lock) + mutex_lock(&dqm->lock); + if (dqm->active_runlist == false) + goto out; + retval = pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_COMPUTE, + KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES, 0, false, 0); + if (retval != 0) + goto out; + + *dqm->fence_addr = KFD_FENCE_INIT; + pm_send_query_status(&dqm->packets, dqm->fence_gpu_addr, + KFD_FENCE_COMPLETED); + /* should be timed out */ + fence_wait_timeout(dqm->fence_addr, KFD_FENCE_COMPLETED, + QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS); + pm_release_ib(&dqm->packets); + dqm->active_runlist = false; + +out: + if (lock) + mutex_unlock(&dqm->lock); + return retval; +} + +static int execute_queues_cpsch(struct device_queue_manager *dqm, bool lock) +{ + int retval; + + BUG_ON(!dqm); + + if (lock) + mutex_lock(&dqm->lock); + + retval = destroy_queues_cpsch(dqm, false); + if (retval != 0) { + pr_err("kfd: the cp might be in an unrecoverable state due to an unsuccessful queues preemption"); + goto out; + } + + if (dqm->queue_count <= 0 || dqm->processes_count <= 0) { + retval = 0; + goto out; + } + + if (dqm->active_runlist) { + retval = 0; + goto out; + } + + retval = pm_send_runlist(&dqm->packets, &dqm->queues); + if (retval != 0) { + pr_err("kfd: failed to execute runlist"); + goto out; + } + dqm->active_runlist = true; + +out: + if (lock) + mutex_unlock(&dqm->lock); + return retval; +} + +static int destroy_queue_cpsch(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + struct queue *q) +{ + int retval; + struct mqd_manager *mqd; + + BUG_ON(!dqm || !qpd || !q); + + retval = 0; + + /* remove queue from list to prevent rescheduling after preemption */ + mutex_lock(&dqm->lock); + + mqd = dqm->get_mqd_manager(dqm, KFD_MQD_TYPE_CIK_CP); + if (!mqd) { + retval = -ENOMEM; + goto failed; + } + + list_del(&q->list); + dqm->queue_count--; + + execute_queues_cpsch(dqm, false); + + mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); + + mutex_unlock(&dqm->lock); + + return 0; + +failed: + mutex_unlock(&dqm->lock); + return retval; +} + +/* + * Low bits must be 0000/FFFF as required by HW, high bits must be 0 to + * stay in user mode. + */ +#define APE1_FIXED_BITS_MASK 0xFFFF80000000FFFFULL +/* APE1 limit is inclusive and 64K aligned. */ +#define APE1_LIMIT_ALIGNMENT 0xFFFF + +static bool set_cache_memory_policy(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + enum cache_policy default_policy, + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size) +{ + uint32_t default_mtype; + uint32_t ape1_mtype; + + pr_debug("kfd: In func %s\n", __func__); + + mutex_lock(&dqm->lock); + + if (alternate_aperture_size == 0) { + /* base > limit disables APE1 */ + qpd->sh_mem_ape1_base = 1; + qpd->sh_mem_ape1_limit = 0; + } else { + /* + * In FSA64, APE1_Base[63:0] = { 16{SH_MEM_APE1_BASE[31]}, + * SH_MEM_APE1_BASE[31:0], 0x0000 } + * APE1_Limit[63:0] = { 16{SH_MEM_APE1_LIMIT[31]}, + * SH_MEM_APE1_LIMIT[31:0], 0xFFFF } + * Verify that the base and size parameters can be + * represented in this format and convert them. + * Additionally restrict APE1 to user-mode addresses. + */ + + uint64_t base = (uintptr_t)alternate_aperture_base; + uint64_t limit = base + alternate_aperture_size - 1; + + if (limit <= base) + goto out; + + if ((base & APE1_FIXED_BITS_MASK) != 0) + goto out; + + if ((limit & APE1_FIXED_BITS_MASK) != APE1_LIMIT_ALIGNMENT) + goto out; + + qpd->sh_mem_ape1_base = base >> 16; + qpd->sh_mem_ape1_limit = limit >> 16; + } + + default_mtype = (default_policy == cache_policy_coherent) ? + MTYPE_NONCACHED : + MTYPE_CACHED; + + ape1_mtype = (alternate_policy == cache_policy_coherent) ? + MTYPE_NONCACHED : + MTYPE_CACHED; + + qpd->sh_mem_config = (qpd->sh_mem_config & PTR32) + | ALIGNMENT_MODE(SH_MEM_ALIGNMENT_MODE_UNALIGNED) + | DEFAULT_MTYPE(default_mtype) + | APE1_MTYPE(ape1_mtype); + + if ((sched_policy == KFD_SCHED_POLICY_NO_HWS) && (qpd->vmid != 0)) + program_sh_mem_settings(dqm, qpd); + + pr_debug("kfd: sh_mem_config: 0x%x, ape1_base: 0x%x, ape1_limit: 0x%x\n", + qpd->sh_mem_config, qpd->sh_mem_ape1_base, + qpd->sh_mem_ape1_limit); + + mutex_unlock(&dqm->lock); + return true; + +out: + mutex_unlock(&dqm->lock); + return false; +} + +struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) +{ + struct device_queue_manager *dqm; + + BUG_ON(!dev); + + dqm = kzalloc(sizeof(struct device_queue_manager), GFP_KERNEL); + if (!dqm) + return NULL; + + dqm->dev = dev; + switch (sched_policy) { + case KFD_SCHED_POLICY_HWS: + case KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION: + /* initialize dqm for cp scheduling */ + dqm->create_queue = create_queue_cpsch; + dqm->initialize = initialize_cpsch; + dqm->start = start_cpsch; + dqm->stop = stop_cpsch; + dqm->destroy_queue = destroy_queue_cpsch; + dqm->update_queue = update_queue; + dqm->get_mqd_manager = get_mqd_manager_nocpsch; + dqm->register_process = register_process_nocpsch; + dqm->unregister_process = unregister_process_nocpsch; + dqm->uninitialize = uninitialize_nocpsch; + dqm->create_kernel_queue = create_kernel_queue_cpsch; + dqm->destroy_kernel_queue = destroy_kernel_queue_cpsch; + dqm->set_cache_memory_policy = set_cache_memory_policy; + break; + case KFD_SCHED_POLICY_NO_HWS: + /* initialize dqm for no cp scheduling */ + dqm->start = start_nocpsch; + dqm->stop = stop_nocpsch; + dqm->create_queue = create_queue_nocpsch; + dqm->destroy_queue = destroy_queue_nocpsch; + dqm->update_queue = update_queue; + dqm->get_mqd_manager = get_mqd_manager_nocpsch; + dqm->register_process = register_process_nocpsch; + dqm->unregister_process = unregister_process_nocpsch; + dqm->initialize = initialize_nocpsch; + dqm->uninitialize = uninitialize_nocpsch; + dqm->set_cache_memory_policy = set_cache_memory_policy; + break; + default: + BUG(); + break; + } + + if (dqm->initialize(dqm) != 0) { + kfree(dqm); + return NULL; + } + + return dqm; +} + +void device_queue_manager_uninit(struct device_queue_manager *dqm) +{ + BUG_ON(!dqm); + + dqm->uninitialize(dqm); + kfree(dqm); +} + diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h new file mode 100644 index 000000000000..c3f189e8ae35 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h @@ -0,0 +1,146 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef KFD_DEVICE_QUEUE_MANAGER_H_ +#define KFD_DEVICE_QUEUE_MANAGER_H_ + +#include <linux/rwsem.h> +#include <linux/list.h> +#include "kfd_priv.h" +#include "kfd_mqd_manager.h" + +#define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS (500) +#define QUEUES_PER_PIPE (8) +#define PIPE_PER_ME_CP_SCHEDULING (3) +#define CIK_VMID_NUM (8) +#define KFD_VMID_START_OFFSET (8) +#define VMID_PER_DEVICE CIK_VMID_NUM +#define KFD_DQM_FIRST_PIPE (0) + +struct device_process_node { + struct qcm_process_device *qpd; + struct list_head list; +}; + +/** + * struct device_queue_manager + * + * @create_queue: Queue creation routine. + * + * @destroy_queue: Queue destruction routine. + * + * @update_queue: Queue update routine. + * + * @get_mqd_manager: Returns the mqd manager according to the mqd type. + * + * @exeute_queues: Dispatches the queues list to the H/W. + * + * @register_process: This routine associates a specific process with device. + * + * @unregister_process: destroys the associations between process to device. + * + * @initialize: Initializes the pipelines and memory module for that device. + * + * @start: Initializes the resources/modules the the device needs for queues + * execution. This function is called on device initialization and after the + * system woke up after suspension. + * + * @stop: This routine stops execution of all the active queue running on the + * H/W and basically this function called on system suspend. + * + * @uninitialize: Destroys all the device queue manager resources allocated in + * initialize routine. + * + * @create_kernel_queue: Creates kernel queue. Used for debug queue. + * + * @destroy_kernel_queue: Destroys kernel queue. Used for debug queue. + * + * @set_cache_memory_policy: Sets memory policy (cached/ non cached) for the + * memory apertures. + * + * This struct is a base class for the kfd queues scheduler in the + * device level. The device base class should expose the basic operations + * for queue creation and queue destruction. This base class hides the + * scheduling mode of the driver and the specific implementation of the + * concrete device. This class is the only class in the queues scheduler + * that configures the H/W. + */ + +struct device_queue_manager { + int (*create_queue)(struct device_queue_manager *dqm, + struct queue *q, + struct qcm_process_device *qpd, + int *allocate_vmid); + int (*destroy_queue)(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + struct queue *q); + int (*update_queue)(struct device_queue_manager *dqm, + struct queue *q); + + struct mqd_manager * (*get_mqd_manager) + (struct device_queue_manager *dqm, + enum KFD_MQD_TYPE type); + + int (*register_process)(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); + int (*unregister_process)(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); + int (*initialize)(struct device_queue_manager *dqm); + int (*start)(struct device_queue_manager *dqm); + int (*stop)(struct device_queue_manager *dqm); + void (*uninitialize)(struct device_queue_manager *dqm); + int (*create_kernel_queue)(struct device_queue_manager *dqm, + struct kernel_queue *kq, + struct qcm_process_device *qpd); + void (*destroy_kernel_queue)(struct device_queue_manager *dqm, + struct kernel_queue *kq, + struct qcm_process_device *qpd); + bool (*set_cache_memory_policy)(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + enum cache_policy default_policy, + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size); + + + struct mqd_manager *mqds[KFD_MQD_TYPE_MAX]; + struct packet_manager packets; + struct kfd_dev *dev; + struct mutex lock; + struct list_head queues; + unsigned int processes_count; + unsigned int queue_count; + unsigned int next_pipe_to_allocate; + unsigned int *allocated_queues; + unsigned int vmid_bitmap; + uint64_t pipelines_addr; + struct kfd_mem_obj *pipeline_mem; + uint64_t fence_gpu_addr; + unsigned int *fence_addr; + struct kfd_mem_obj *fence_mem; + bool active_runlist; +}; + + + +#endif /* KFD_DEVICE_QUEUE_MANAGER_H_ */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c new file mode 100644 index 000000000000..b5791a5c7c06 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c @@ -0,0 +1,256 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "kfd_priv.h" +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/slab.h> +#include <linux/io.h> + +/* + * This extension supports a kernel level doorbells management for + * the kernel queues. + * Basically the last doorbells page is devoted to kernel queues + * and that's assures that any user process won't get access to the + * kernel doorbells page + */ +static DEFINE_MUTEX(doorbell_mutex); +static unsigned long doorbell_available_index[ + DIV_ROUND_UP(KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, BITS_PER_LONG)] = { 0 }; + +#define KERNEL_DOORBELL_PASID 1 +#define KFD_SIZE_OF_DOORBELL_IN_BYTES 4 + +/* + * Each device exposes a doorbell aperture, a PCI MMIO aperture that + * receives 32-bit writes that are passed to queues as wptr values. + * The doorbells are intended to be written by applications as part + * of queueing work on user-mode queues. + * We assign doorbells to applications in PAGE_SIZE-sized and aligned chunks. + * We map the doorbell address space into user-mode when a process creates + * its first queue on each device. + * Although the mapping is done by KFD, it is equivalent to an mmap of + * the /dev/kfd with the particular device encoded in the mmap offset. + * There will be other uses for mmap of /dev/kfd, so only a range of + * offsets (KFD_MMAP_DOORBELL_START-END) is used for doorbells. + */ + +/* # of doorbell bytes allocated for each process. */ +static inline size_t doorbell_process_allocation(void) +{ + return roundup(KFD_SIZE_OF_DOORBELL_IN_BYTES * + KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, + PAGE_SIZE); +} + +/* Doorbell calculations for device init. */ +void kfd_doorbell_init(struct kfd_dev *kfd) +{ + size_t doorbell_start_offset; + size_t doorbell_aperture_size; + size_t doorbell_process_limit; + + /* + * We start with calculations in bytes because the input data might + * only be byte-aligned. + * Only after we have done the rounding can we assume any alignment. + */ + + doorbell_start_offset = + roundup(kfd->shared_resources.doorbell_start_offset, + doorbell_process_allocation()); + + doorbell_aperture_size = + rounddown(kfd->shared_resources.doorbell_aperture_size, + doorbell_process_allocation()); + + if (doorbell_aperture_size > doorbell_start_offset) + doorbell_process_limit = + (doorbell_aperture_size - doorbell_start_offset) / + doorbell_process_allocation(); + else + doorbell_process_limit = 0; + + kfd->doorbell_base = kfd->shared_resources.doorbell_physical_address + + doorbell_start_offset; + + kfd->doorbell_id_offset = doorbell_start_offset / sizeof(u32); + kfd->doorbell_process_limit = doorbell_process_limit - 1; + + kfd->doorbell_kernel_ptr = ioremap(kfd->doorbell_base, + doorbell_process_allocation()); + + BUG_ON(!kfd->doorbell_kernel_ptr); + + pr_debug("kfd: doorbell initialization:\n"); + pr_debug("kfd: doorbell base == 0x%08lX\n", + (uintptr_t)kfd->doorbell_base); + + pr_debug("kfd: doorbell_id_offset == 0x%08lX\n", + kfd->doorbell_id_offset); + + pr_debug("kfd: doorbell_process_limit == 0x%08lX\n", + doorbell_process_limit); + + pr_debug("kfd: doorbell_kernel_offset == 0x%08lX\n", + (uintptr_t)kfd->doorbell_base); + + pr_debug("kfd: doorbell aperture size == 0x%08lX\n", + kfd->shared_resources.doorbell_aperture_size); + + pr_debug("kfd: doorbell kernel address == 0x%08lX\n", + (uintptr_t)kfd->doorbell_kernel_ptr); +} + +int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma) +{ + phys_addr_t address; + struct kfd_dev *dev; + + /* + * For simplicitly we only allow mapping of the entire doorbell + * allocation of a single device & process. + */ + if (vma->vm_end - vma->vm_start != doorbell_process_allocation()) + return -EINVAL; + + /* Find kfd device according to gpu id */ + dev = kfd_device_by_id(vma->vm_pgoff); + if (dev == NULL) + return -EINVAL; + + /* Find if pdd exists for combination of process and gpu id */ + if (!kfd_get_process_device_data(dev, process, 0)) + return -EINVAL; + + /* Calculate physical address of doorbell */ + address = kfd_get_process_doorbells(dev, process); + + vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE | + VM_DONTDUMP | VM_PFNMAP; + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + pr_debug("kfd: mapping doorbell page in kfd_doorbell_mmap\n" + " target user address == 0x%08llX\n" + " physical address == 0x%08llX\n" + " vm_flags == 0x%04lX\n" + " size == 0x%04lX\n", + (unsigned long long) vma->vm_start, address, vma->vm_flags, + doorbell_process_allocation()); + + + return io_remap_pfn_range(vma, + vma->vm_start, + address >> PAGE_SHIFT, + doorbell_process_allocation(), + vma->vm_page_prot); +} + + +/* get kernel iomem pointer for a doorbell */ +u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, + unsigned int *doorbell_off) +{ + u32 inx; + + BUG_ON(!kfd || !doorbell_off); + + mutex_lock(&doorbell_mutex); + inx = find_first_zero_bit(doorbell_available_index, + KFD_MAX_NUM_OF_QUEUES_PER_PROCESS); + + __set_bit(inx, doorbell_available_index); + mutex_unlock(&doorbell_mutex); + + if (inx >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) + return NULL; + + /* + * Calculating the kernel doorbell offset using "faked" kernel + * pasid that allocated for kernel queues only + */ + *doorbell_off = KERNEL_DOORBELL_PASID * (doorbell_process_allocation() / + sizeof(u32)) + inx; + + pr_debug("kfd: get kernel queue doorbell\n" + " doorbell offset == 0x%08d\n" + " kernel address == 0x%08lX\n", + *doorbell_off, (uintptr_t)(kfd->doorbell_kernel_ptr + inx)); + + return kfd->doorbell_kernel_ptr + inx; +} + +void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr) +{ + unsigned int inx; + + BUG_ON(!kfd || !db_addr); + + inx = (unsigned int)(db_addr - kfd->doorbell_kernel_ptr); + + mutex_lock(&doorbell_mutex); + __clear_bit(inx, doorbell_available_index); + mutex_unlock(&doorbell_mutex); +} + +inline void write_kernel_doorbell(u32 __iomem *db, u32 value) +{ + if (db) { + writel(value, db); + pr_debug("writing %d to doorbell address 0x%p\n", value, db); + } +} + +/* + * queue_ids are in the range [0,MAX_PROCESS_QUEUES) and are mapped 1:1 + * to doorbells with the process's doorbell page + */ +unsigned int kfd_queue_id_to_doorbell(struct kfd_dev *kfd, + struct kfd_process *process, + unsigned int queue_id) +{ + /* + * doorbell_id_offset accounts for doorbells taken by KGD. + * pasid * doorbell_process_allocation/sizeof(u32) adjusts + * to the process's doorbells + */ + return kfd->doorbell_id_offset + + process->pasid * (doorbell_process_allocation()/sizeof(u32)) + + queue_id; +} + +uint64_t kfd_get_number_elems(struct kfd_dev *kfd) +{ + uint64_t num_of_elems = (kfd->shared_resources.doorbell_aperture_size - + kfd->shared_resources.doorbell_start_offset) / + doorbell_process_allocation() + 1; + + return num_of_elems; + +} + +phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev, + struct kfd_process *process) +{ + return dev->doorbell_base + + process->pasid * doorbell_process_allocation(); +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c new file mode 100644 index 000000000000..66df4da01c29 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c @@ -0,0 +1,356 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include <linux/device.h> +#include <linux/export.h> +#include <linux/err.h> +#include <linux/fs.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/compat.h> +#include <uapi/linux/kfd_ioctl.h> +#include <linux/time.h> +#include "kfd_priv.h" +#include <linux/mm.h> +#include <uapi/asm-generic/mman-common.h> +#include <asm/processor.h> + +/* + * The primary memory I/O features being added for revisions of gfxip + * beyond 7.0 (Kaveri) are: + * + * Access to ATC/IOMMU mapped memory w/ associated extension of VA to 48b + * + * “Flat” shader memory access – These are new shader vector memory + * operations that do not reference a T#/V# so a “pointer” is what is + * sourced from the vector gprs for direct access to memory. + * This pointer space has the Shared(LDS) and Private(Scratch) memory + * mapped into this pointer space as apertures. + * The hardware then determines how to direct the memory request + * based on what apertures the request falls in. + * + * Unaligned support and alignment check + * + * + * System Unified Address - SUA + * + * The standard usage for GPU virtual addresses are that they are mapped by + * a set of page tables we call GPUVM and these page tables are managed by + * a combination of vidMM/driver software components. The current virtual + * address (VA) range for GPUVM is 40b. + * + * As of gfxip7.1 and beyond we’re adding the ability for compute memory + * clients (CP/RLC, DMA, SHADER(ifetch, scalar, and vector ops)) to access + * the same page tables used by host x86 processors and that are managed by + * the operating system. This is via a technique and hardware called ATC/IOMMU. + * The GPU has the capability of accessing both the GPUVM and ATC address + * spaces for a given VMID (process) simultaneously and we call this feature + * system unified address (SUA). + * + * There are three fundamental address modes of operation for a given VMID + * (process) on the GPU: + * + * HSA64 – 64b pointers and the default address space is ATC + * HSA32 – 32b pointers and the default address space is ATC + * GPUVM – 64b pointers and the default address space is GPUVM (driver + * model mode) + * + * + * HSA64 - ATC/IOMMU 64b + * + * A 64b pointer in the AMD64/IA64 CPU architecture is not fully utilized + * by the CPU so an AMD CPU can only access the high area + * (VA[63:47] == 0x1FFFF) and low area (VA[63:47 == 0) of the address space + * so the actual VA carried to translation is 48b. There is a “hole” in + * the middle of the 64b VA space. + * + * The GPU not only has access to all of the CPU accessible address space via + * ATC/IOMMU, but it also has access to the GPUVM address space. The “system + * unified address” feature (SUA) is the mapping of GPUVM and ATC address + * spaces into a unified pointer space. The method we take for 64b mode is + * to map the full 40b GPUVM address space into the hole of the 64b address + * space. + + * The GPUVM_Base/GPUVM_Limit defines the aperture in the 64b space where we + * direct requests to be translated via GPUVM page tables instead of the + * IOMMU path. + * + * + * 64b to 49b Address conversion + * + * Note that there are still significant portions of unused regions (holes) + * in the 64b address space even for the GPU. There are several places in + * the pipeline (sw and hw), we wish to compress the 64b virtual address + * to a 49b address. This 49b address is constituted of an “ATC” bit + * plus a 48b virtual address. This 49b address is what is passed to the + * translation hardware. ATC==0 means the 48b address is a GPUVM address + * (max of 2^40 – 1) intended to be translated via GPUVM page tables. + * ATC==1 means the 48b address is intended to be translated via IOMMU + * page tables. + * + * A 64b pointer is compared to the apertures that are defined (Base/Limit), in + * this case the GPUVM aperture (red) is defined and if a pointer falls in this + * aperture, we subtract the GPUVM_Base address and set the ATC bit to zero + * as part of the 64b to 49b conversion. + * + * Where this 64b to 49b conversion is done is a function of the usage. + * Most GPU memory access is via memory objects where the driver builds + * a descriptor which consists of a base address and a memory access by + * the GPU usually consists of some kind of an offset or Cartesian coordinate + * that references this memory descriptor. This is the case for shader + * instructions that reference the T# or V# constants, or for specified + * locations of assets (ex. the shader program location). In these cases + * the driver is what handles the 64b to 49b conversion and the base + * address in the descriptor (ex. V# or T# or shader program location) + * is defined as a 48b address w/ an ATC bit. For this usage a given + * memory object cannot straddle multiple apertures in the 64b address + * space. For example a shader program cannot jump in/out between ATC + * and GPUVM space. + * + * In some cases we wish to pass a 64b pointer to the GPU hardware and + * the GPU hw does the 64b to 49b conversion before passing memory + * requests to the cache/memory system. This is the case for the + * S_LOAD and FLAT_* shader memory instructions where we have 64b pointers + * in scalar and vector GPRs respectively. + * + * In all cases (no matter where the 64b -> 49b conversion is done), the gfxip + * hardware sends a 48b address along w/ an ATC bit, to the memory controller + * on the memory request interfaces. + * + * <client>_MC_rdreq_atc // read request ATC bit + * + * 0 : <client>_MC_rdreq_addr is a GPUVM VA + * + * 1 : <client>_MC_rdreq_addr is a ATC VA + * + * + * “Spare” aperture (APE1) + * + * We use the GPUVM aperture to differentiate ATC vs. GPUVM, but we also use + * apertures to set the Mtype field for S_LOAD/FLAT_* ops which is input to the + * config tables for setting cache policies. The “spare” (APE1) aperture is + * motivated by getting a different Mtype from the default. + * The default aperture isn’t an actual base/limit aperture; it is just the + * address space that doesn’t hit any defined base/limit apertures. + * The following diagram is a complete picture of the gfxip7.x SUA apertures. + * The APE1 can be placed either below or above + * the hole (cannot be in the hole). + * + * + * General Aperture definitions and rules + * + * An aperture register definition consists of a Base, Limit, Mtype, and + * usually an ATC bit indicating which translation tables that aperture uses. + * In all cases (for SUA and DUA apertures discussed later), aperture base + * and limit definitions are 64KB aligned. + * + * <ape>_Base[63:0] = { <ape>_Base_register[63:16], 0x0000 } + * + * <ape>_Limit[63:0] = { <ape>_Limit_register[63:16], 0xFFFF } + * + * The base and limit are considered inclusive to an aperture so being + * inside an aperture means (address >= Base) AND (address <= Limit). + * + * In no case is a payload that straddles multiple apertures expected to work. + * For example a load_dword_x4 that starts in one aperture and ends in another, + * does not work. For the vector FLAT_* ops we have detection capability in + * the shader for reporting a “memory violation” back to the + * SQ block for use in traps. + * A memory violation results when an op falls into the hole, + * or a payload straddles multiple apertures. The S_LOAD instruction + * does not have this detection. + * + * Apertures cannot overlap. + * + * + * + * HSA32 - ATC/IOMMU 32b + * + * For HSA32 mode, the pointers are interpreted as 32 bits and use a single GPR + * instead of two for the S_LOAD and FLAT_* ops. The entire GPUVM space of 40b + * will not fit so there is only partial visibility to the GPUVM + * space (defined by the aperture) for S_LOAD and FLAT_* ops. + * There is no spare (APE1) aperture for HSA32 mode. + * + * + * GPUVM 64b mode (driver model) + * + * This mode is related to HSA64 in that the difference really is that + * the default aperture is GPUVM (ATC==0) and not ATC space. + * We have gfxip7.x hardware that has FLAT_* and S_LOAD support for + * SUA GPUVM mode, but does not support HSA32/HSA64. + * + * + * Device Unified Address - DUA + * + * Device unified address (DUA) is the name of the feature that maps the + * Shared(LDS) memory and Private(Scratch) memory into the overall address + * space for use by the new FLAT_* vector memory ops. The Shared and + * Private memories are mapped as apertures into the address space, + * and the hardware detects when a FLAT_* memory request is to be redirected + * to the LDS or Scratch memory when it falls into one of these apertures. + * Like the SUA apertures, the Shared/Private apertures are 64KB aligned and + * the base/limit is “in” the aperture. For both HSA64 and GPUVM SUA modes, + * the Shared/Private apertures are always placed in a limited selection of + * options in the hole of the 64b address space. For HSA32 mode, the + * Shared/Private apertures can be placed anywhere in the 32b space + * except at 0. + * + * + * HSA64 Apertures for FLAT_* vector ops + * + * For HSA64 SUA mode, the Shared and Private apertures are always placed + * in the hole w/ a limited selection of possible locations. The requests + * that fall in the private aperture are expanded as a function of the + * work-item id (tid) and redirected to the location of the + * “hidden private memory”. The hidden private can be placed in either GPUVM + * or ATC space. The addresses that fall in the shared aperture are + * re-directed to the on-chip LDS memory hardware. + * + * + * HSA32 Apertures for FLAT_* vector ops + * + * In HSA32 mode, the Private and Shared apertures can be placed anywhere + * in the 32b space except at 0 (Private or Shared Base at zero disables + * the apertures). If the base address of the apertures are non-zero + * (ie apertures exists), the size is always 64KB. + * + * + * GPUVM Apertures for FLAT_* vector ops + * + * In GPUVM mode, the Shared/Private apertures are specified identically + * to HSA64 mode where they are always in the hole at a limited selection + * of locations. + * + * + * Aperture Definitions for SUA and DUA + * + * The interpretation of the aperture register definitions for a given + * VMID is a function of the “SUA Mode” which is one of HSA64, HSA32, or + * GPUVM64 discussed in previous sections. The mode is first decoded, and + * then the remaining register decode is a function of the mode. + * + * + * SUA Mode Decode + * + * For the S_LOAD and FLAT_* shader operations, the SUA mode is decoded from + * the COMPUTE_DISPATCH_INITIATOR:DATA_ATC bit and + * the SH_MEM_CONFIG:PTR32 bits. + * + * COMPUTE_DISPATCH_INITIATOR:DATA_ATC SH_MEM_CONFIG:PTR32 Mode + * + * 1 0 HSA64 + * + * 1 1 HSA32 + * + * 0 X GPUVM64 + * + * In general the hardware will ignore the PTR32 bit and treat + * as “0” whenever DATA_ATC = “0”, but sw should set PTR32=0 + * when DATA_ATC=0. + * + * The DATA_ATC bit is only set for compute dispatches. + * All “Draw” dispatches are hardcoded to GPUVM64 mode + * for FLAT_* / S_LOAD operations. + */ + +#define MAKE_GPUVM_APP_BASE(gpu_num) \ + (((uint64_t)(gpu_num) << 61) + 0x1000000000000L) + +#define MAKE_GPUVM_APP_LIMIT(base) \ + (((uint64_t)(base) & \ + 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL) + +#define MAKE_SCRATCH_APP_BASE(gpu_num) \ + (((uint64_t)(gpu_num) << 61) + 0x100000000L) + +#define MAKE_SCRATCH_APP_LIMIT(base) \ + (((uint64_t)base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF) + +#define MAKE_LDS_APP_BASE(gpu_num) \ + (((uint64_t)(gpu_num) << 61) + 0x0) +#define MAKE_LDS_APP_LIMIT(base) \ + (((uint64_t)(base) & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF) + +int kfd_init_apertures(struct kfd_process *process) +{ + uint8_t id = 0; + struct kfd_dev *dev; + struct kfd_process_device *pdd; + + mutex_lock(&process->mutex); + + /*Iterating over all devices*/ + while ((dev = kfd_topology_enum_kfd_devices(id)) != NULL && + id < NUM_OF_SUPPORTED_GPUS) { + + pdd = kfd_get_process_device_data(dev, process, 1); + + /* + * For 64 bit process aperture will be statically reserved in + * the x86_64 non canonical process address space + * amdkfd doesn't currently support apertures for 32 bit process + */ + if (process->is_32bit_user_mode) { + pdd->lds_base = pdd->lds_limit = 0; + pdd->gpuvm_base = pdd->gpuvm_limit = 0; + pdd->scratch_base = pdd->scratch_limit = 0; + } else { + /* + * node id couldn't be 0 - the three MSB bits of + * aperture shoudn't be 0 + */ + pdd->lds_base = MAKE_LDS_APP_BASE(id + 1); + + pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base); + + pdd->gpuvm_base = MAKE_GPUVM_APP_BASE(id + 1); + + pdd->gpuvm_limit = + MAKE_GPUVM_APP_LIMIT(pdd->gpuvm_base); + + pdd->scratch_base = MAKE_SCRATCH_APP_BASE(id + 1); + + pdd->scratch_limit = + MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base); + } + + dev_dbg(kfd_device, "node id %u\n", id); + dev_dbg(kfd_device, "gpu id %u\n", pdd->dev->id); + dev_dbg(kfd_device, "lds_base %llX\n", pdd->lds_base); + dev_dbg(kfd_device, "lds_limit %llX\n", pdd->lds_limit); + dev_dbg(kfd_device, "gpuvm_base %llX\n", pdd->gpuvm_base); + dev_dbg(kfd_device, "gpuvm_limit %llX\n", pdd->gpuvm_limit); + dev_dbg(kfd_device, "scratch_base %llX\n", pdd->scratch_base); + dev_dbg(kfd_device, "scratch_limit %llX\n", pdd->scratch_limit); + + id++; + } + + mutex_unlock(&process->mutex); + + return 0; +} + + diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c new file mode 100644 index 000000000000..5b999095a1f7 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c @@ -0,0 +1,176 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * KFD Interrupts. + * + * AMD GPUs deliver interrupts by pushing an interrupt description onto the + * interrupt ring and then sending an interrupt. KGD receives the interrupt + * in ISR and sends us a pointer to each new entry on the interrupt ring. + * + * We generally can't process interrupt-signaled events from ISR, so we call + * out to each interrupt client module (currently only the scheduler) to ask if + * each interrupt is interesting. If they return true, then it requires further + * processing so we copy it to an internal interrupt ring and call each + * interrupt client again from a work-queue. + * + * There's no acknowledgment for the interrupts we use. The hardware simply + * queues a new interrupt each time without waiting. + * + * The fixed-size internal queue means that it's possible for us to lose + * interrupts because we have no back-pressure to the hardware. + */ + +#include <linux/slab.h> +#include <linux/device.h> +#include "kfd_priv.h" + +#define KFD_INTERRUPT_RING_SIZE 256 + +static void interrupt_wq(struct work_struct *); + +int kfd_interrupt_init(struct kfd_dev *kfd) +{ + void *interrupt_ring = kmalloc_array(KFD_INTERRUPT_RING_SIZE, + kfd->device_info->ih_ring_entry_size, + GFP_KERNEL); + if (!interrupt_ring) + return -ENOMEM; + + kfd->interrupt_ring = interrupt_ring; + kfd->interrupt_ring_size = + KFD_INTERRUPT_RING_SIZE * kfd->device_info->ih_ring_entry_size; + atomic_set(&kfd->interrupt_ring_wptr, 0); + atomic_set(&kfd->interrupt_ring_rptr, 0); + + spin_lock_init(&kfd->interrupt_lock); + + INIT_WORK(&kfd->interrupt_work, interrupt_wq); + + kfd->interrupts_active = true; + + /* + * After this function returns, the interrupt will be enabled. This + * barrier ensures that the interrupt running on a different processor + * sees all the above writes. + */ + smp_wmb(); + + return 0; +} + +void kfd_interrupt_exit(struct kfd_dev *kfd) +{ + /* + * Stop the interrupt handler from writing to the ring and scheduling + * workqueue items. The spinlock ensures that any interrupt running + * after we have unlocked sees interrupts_active = false. + */ + unsigned long flags; + + spin_lock_irqsave(&kfd->interrupt_lock, flags); + kfd->interrupts_active = false; + spin_unlock_irqrestore(&kfd->interrupt_lock, flags); + + /* + * Flush_scheduled_work ensures that there are no outstanding + * work-queue items that will access interrupt_ring. New work items + * can't be created because we stopped interrupt handling above. + */ + flush_scheduled_work(); + + kfree(kfd->interrupt_ring); +} + +/* + * This assumes that it can't be called concurrently with itself + * but only with dequeue_ih_ring_entry. + */ +bool enqueue_ih_ring_entry(struct kfd_dev *kfd, const void *ih_ring_entry) +{ + unsigned int rptr = atomic_read(&kfd->interrupt_ring_rptr); + unsigned int wptr = atomic_read(&kfd->interrupt_ring_wptr); + + if ((rptr - wptr) % kfd->interrupt_ring_size == + kfd->device_info->ih_ring_entry_size) { + /* This is very bad, the system is likely to hang. */ + dev_err_ratelimited(kfd_chardev(), + "Interrupt ring overflow, dropping interrupt.\n"); + return false; + } + + memcpy(kfd->interrupt_ring + wptr, ih_ring_entry, + kfd->device_info->ih_ring_entry_size); + + wptr = (wptr + kfd->device_info->ih_ring_entry_size) % + kfd->interrupt_ring_size; + smp_wmb(); /* Ensure memcpy'd data is visible before wptr update. */ + atomic_set(&kfd->interrupt_ring_wptr, wptr); + + return true; +} + +/* + * This assumes that it can't be called concurrently with itself + * but only with enqueue_ih_ring_entry. + */ +static bool dequeue_ih_ring_entry(struct kfd_dev *kfd, void *ih_ring_entry) +{ + /* + * Assume that wait queues have an implicit barrier, i.e. anything that + * happened in the ISR before it queued work is visible. + */ + + unsigned int wptr = atomic_read(&kfd->interrupt_ring_wptr); + unsigned int rptr = atomic_read(&kfd->interrupt_ring_rptr); + + if (rptr == wptr) + return false; + + memcpy(ih_ring_entry, kfd->interrupt_ring + rptr, + kfd->device_info->ih_ring_entry_size); + + rptr = (rptr + kfd->device_info->ih_ring_entry_size) % + kfd->interrupt_ring_size; + + /* + * Ensure the rptr write update is not visible until + * memcpy has finished reading. + */ + smp_mb(); + atomic_set(&kfd->interrupt_ring_rptr, rptr); + + return true; +} + +static void interrupt_wq(struct work_struct *work) +{ + struct kfd_dev *dev = container_of(work, struct kfd_dev, + interrupt_work); + + uint32_t ih_ring_entry[DIV_ROUND_UP( + dev->device_info->ih_ring_entry_size, + sizeof(uint32_t))]; + + while (dequeue_ih_ring_entry(dev, ih_ring_entry)) + ; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c new file mode 100644 index 000000000000..935071410724 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c @@ -0,0 +1,353 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include <linux/types.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/printk.h> +#include <linux/sched.h> +#include "kfd_kernel_queue.h" +#include "kfd_priv.h" +#include "kfd_device_queue_manager.h" +#include "kfd_pm4_headers.h" +#include "kfd_pm4_opcodes.h" + +#define PM4_COUNT_ZERO (((1 << 15) - 1) << 16) + +static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, + enum kfd_queue_type type, unsigned int queue_size) +{ + struct queue_properties prop; + int retval; + union PM4_MES_TYPE_3_HEADER nop; + + BUG_ON(!kq || !dev); + BUG_ON(type != KFD_QUEUE_TYPE_DIQ && type != KFD_QUEUE_TYPE_HIQ); + + pr_debug("kfd: In func %s initializing queue type %d size %d\n", + __func__, KFD_QUEUE_TYPE_HIQ, queue_size); + + nop.opcode = IT_NOP; + nop.type = PM4_TYPE_3; + nop.u32all |= PM4_COUNT_ZERO; + + kq->dev = dev; + kq->nop_packet = nop.u32all; + switch (type) { + case KFD_QUEUE_TYPE_DIQ: + case KFD_QUEUE_TYPE_HIQ: + kq->mqd = dev->dqm->get_mqd_manager(dev->dqm, + KFD_MQD_TYPE_CIK_HIQ); + break; + default: + BUG(); + break; + } + + if (kq->mqd == NULL) + return false; + + prop.doorbell_ptr = kfd_get_kernel_doorbell(dev, &prop.doorbell_off); + + if (prop.doorbell_ptr == NULL) + goto err_get_kernel_doorbell; + + retval = kfd2kgd->allocate_mem(dev->kgd, + queue_size, + PAGE_SIZE, + KFD_MEMPOOL_SYSTEM_WRITECOMBINE, + (struct kgd_mem **) &kq->pq); + + if (retval != 0) + goto err_pq_allocate_vidmem; + + kq->pq_kernel_addr = kq->pq->cpu_ptr; + kq->pq_gpu_addr = kq->pq->gpu_addr; + + retval = kfd2kgd->allocate_mem(dev->kgd, + sizeof(*kq->rptr_kernel), + 32, + KFD_MEMPOOL_SYSTEM_WRITECOMBINE, + (struct kgd_mem **) &kq->rptr_mem); + + if (retval != 0) + goto err_rptr_allocate_vidmem; + + kq->rptr_kernel = kq->rptr_mem->cpu_ptr; + kq->rptr_gpu_addr = kq->rptr_mem->gpu_addr; + + retval = kfd2kgd->allocate_mem(dev->kgd, + sizeof(*kq->wptr_kernel), + 32, + KFD_MEMPOOL_SYSTEM_WRITECOMBINE, + (struct kgd_mem **) &kq->wptr_mem); + + if (retval != 0) + goto err_wptr_allocate_vidmem; + + kq->wptr_kernel = kq->wptr_mem->cpu_ptr; + kq->wptr_gpu_addr = kq->wptr_mem->gpu_addr; + + memset(kq->pq_kernel_addr, 0, queue_size); + memset(kq->rptr_kernel, 0, sizeof(*kq->rptr_kernel)); + memset(kq->wptr_kernel, 0, sizeof(*kq->wptr_kernel)); + + prop.queue_size = queue_size; + prop.is_interop = false; + prop.priority = 1; + prop.queue_percent = 100; + prop.type = type; + prop.vmid = 0; + prop.queue_address = kq->pq_gpu_addr; + prop.read_ptr = (uint32_t *) kq->rptr_gpu_addr; + prop.write_ptr = (uint32_t *) kq->wptr_gpu_addr; + + if (init_queue(&kq->queue, prop) != 0) + goto err_init_queue; + + kq->queue->device = dev; + kq->queue->process = kfd_get_process(current); + + retval = kq->mqd->init_mqd(kq->mqd, &kq->queue->mqd, + &kq->queue->mqd_mem_obj, + &kq->queue->gart_mqd_addr, + &kq->queue->properties); + if (retval != 0) + goto err_init_mqd; + + /* assign HIQ to HQD */ + if (type == KFD_QUEUE_TYPE_HIQ) { + pr_debug("assigning hiq to hqd\n"); + kq->queue->pipe = KFD_CIK_HIQ_PIPE; + kq->queue->queue = KFD_CIK_HIQ_QUEUE; + kq->mqd->load_mqd(kq->mqd, kq->queue->mqd, kq->queue->pipe, + kq->queue->queue, NULL); + } else { + /* allocate fence for DIQ */ + + retval = kfd2kgd->allocate_mem(dev->kgd, + sizeof(uint32_t), + 32, + KFD_MEMPOOL_SYSTEM_WRITECOMBINE, + (struct kgd_mem **) &kq->fence_mem_obj); + + if (retval != 0) + goto err_alloc_fence; + + kq->fence_kernel_address = kq->fence_mem_obj->cpu_ptr; + kq->fence_gpu_addr = kq->fence_mem_obj->gpu_addr; + } + + print_queue(kq->queue); + + return true; +err_alloc_fence: +err_init_mqd: + uninit_queue(kq->queue); +err_init_queue: + kfd2kgd->free_mem(dev->kgd, (struct kgd_mem *) kq->wptr_mem); +err_wptr_allocate_vidmem: + kfd2kgd->free_mem(dev->kgd, (struct kgd_mem *) kq->rptr_mem); +err_rptr_allocate_vidmem: + kfd2kgd->free_mem(dev->kgd, (struct kgd_mem *) kq->pq); +err_pq_allocate_vidmem: + pr_err("kfd: error init pq\n"); + kfd_release_kernel_doorbell(dev, prop.doorbell_ptr); +err_get_kernel_doorbell: + pr_err("kfd: error init doorbell"); + return false; + +} + +static void uninitialize(struct kernel_queue *kq) +{ + BUG_ON(!kq); + + if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ) + kq->mqd->destroy_mqd(kq->mqd, + NULL, + false, + QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS, + kq->queue->pipe, + kq->queue->queue); + + kfd2kgd->free_mem(kq->dev->kgd, (struct kgd_mem *) kq->rptr_mem); + kfd2kgd->free_mem(kq->dev->kgd, (struct kgd_mem *) kq->wptr_mem); + kfd2kgd->free_mem(kq->dev->kgd, (struct kgd_mem *) kq->pq); + kfd_release_kernel_doorbell(kq->dev, + kq->queue->properties.doorbell_ptr); + uninit_queue(kq->queue); +} + +static int acquire_packet_buffer(struct kernel_queue *kq, + size_t packet_size_in_dwords, unsigned int **buffer_ptr) +{ + size_t available_size; + size_t queue_size_dwords; + uint32_t wptr, rptr; + unsigned int *queue_address; + + BUG_ON(!kq || !buffer_ptr); + + rptr = *kq->rptr_kernel; + wptr = *kq->wptr_kernel; + queue_address = (unsigned int *)kq->pq_kernel_addr; + queue_size_dwords = kq->queue->properties.queue_size / sizeof(uint32_t); + + pr_debug("kfd: In func %s\nrptr: %d\nwptr: %d\nqueue_address 0x%p\n", + __func__, rptr, wptr, queue_address); + + available_size = (rptr - 1 - wptr + queue_size_dwords) % + queue_size_dwords; + + if (packet_size_in_dwords >= queue_size_dwords || + packet_size_in_dwords >= available_size) { + /* + * make sure calling functions know + * acquire_packet_buffer() failed + */ + *buffer_ptr = NULL; + return -ENOMEM; + } + + if (wptr + packet_size_in_dwords >= queue_size_dwords) { + while (wptr > 0) { + queue_address[wptr] = kq->nop_packet; + wptr = (wptr + 1) % queue_size_dwords; + } + } + + *buffer_ptr = &queue_address[wptr]; + kq->pending_wptr = wptr + packet_size_in_dwords; + + return 0; +} + +static void submit_packet(struct kernel_queue *kq) +{ +#ifdef DEBUG + int i; +#endif + + BUG_ON(!kq); + +#ifdef DEBUG + for (i = *kq->wptr_kernel; i < kq->pending_wptr; i++) { + pr_debug("0x%2X ", kq->pq_kernel_addr[i]); + if (i % 15 == 0) + pr_debug("\n"); + } + pr_debug("\n"); +#endif + + *kq->wptr_kernel = kq->pending_wptr; + write_kernel_doorbell(kq->queue->properties.doorbell_ptr, + kq->pending_wptr); +} + +static int sync_with_hw(struct kernel_queue *kq, unsigned long timeout_ms) +{ + unsigned long org_timeout_ms; + + BUG_ON(!kq); + + org_timeout_ms = timeout_ms; + timeout_ms += jiffies * 1000 / HZ; + while (*kq->wptr_kernel != *kq->rptr_kernel) { + if (time_after(jiffies * 1000 / HZ, timeout_ms)) { + pr_err("kfd: kernel_queue %s timeout expired %lu\n", + __func__, org_timeout_ms); + pr_err("kfd: wptr: %d rptr: %d\n", + *kq->wptr_kernel, *kq->rptr_kernel); + return -ETIME; + } + schedule(); + } + + return 0; +} + +static void rollback_packet(struct kernel_queue *kq) +{ + BUG_ON(!kq); + kq->pending_wptr = *kq->queue->properties.write_ptr; +} + +struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, + enum kfd_queue_type type) +{ + struct kernel_queue *kq; + + BUG_ON(!dev); + + kq = kzalloc(sizeof(struct kernel_queue), GFP_KERNEL); + if (!kq) + return NULL; + + kq->initialize = initialize; + kq->uninitialize = uninitialize; + kq->acquire_packet_buffer = acquire_packet_buffer; + kq->submit_packet = submit_packet; + kq->sync_with_hw = sync_with_hw; + kq->rollback_packet = rollback_packet; + + if (kq->initialize(kq, dev, type, KFD_KERNEL_QUEUE_SIZE) == false) { + pr_err("kfd: failed to init kernel queue\n"); + kfree(kq); + return NULL; + } + return kq; +} + +void kernel_queue_uninit(struct kernel_queue *kq) +{ + BUG_ON(!kq); + + kq->uninitialize(kq); + kfree(kq); +} + +static __attribute__((unused)) void test_kq(struct kfd_dev *dev) +{ + struct kernel_queue *kq; + uint32_t *buffer, i; + int retval; + + BUG_ON(!dev); + + pr_debug("kfd: starting kernel queue test\n"); + + kq = kernel_queue_init(dev, KFD_QUEUE_TYPE_HIQ); + BUG_ON(!kq); + + retval = kq->acquire_packet_buffer(kq, 5, &buffer); + BUG_ON(retval != 0); + for (i = 0; i < 5; i++) + buffer[i] = kq->nop_packet; + kq->submit_packet(kq); + kq->sync_with_hw(kq, 1000); + + pr_debug("kfd: ending kernel queue test\n"); +} + + diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h new file mode 100644 index 000000000000..dcd2bdb68d44 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h @@ -0,0 +1,69 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef KFD_KERNEL_QUEUE_H_ +#define KFD_KERNEL_QUEUE_H_ + +#include <linux/list.h> +#include <linux/types.h> +#include "kfd_priv.h" + +struct kernel_queue { + /* interface */ + bool (*initialize)(struct kernel_queue *kq, struct kfd_dev *dev, + enum kfd_queue_type type, unsigned int queue_size); + void (*uninitialize)(struct kernel_queue *kq); + int (*acquire_packet_buffer)(struct kernel_queue *kq, + size_t packet_size_in_dwords, + unsigned int **buffer_ptr); + + void (*submit_packet)(struct kernel_queue *kq); + int (*sync_with_hw)(struct kernel_queue *kq, + unsigned long timeout_ms); + void (*rollback_packet)(struct kernel_queue *kq); + + /* data */ + struct kfd_dev *dev; + struct mqd_manager *mqd; + struct queue *queue; + uint32_t pending_wptr; + unsigned int nop_packet; + + struct kfd_mem_obj *rptr_mem; + uint32_t *rptr_kernel; + uint64_t rptr_gpu_addr; + struct kfd_mem_obj *wptr_mem; + uint32_t *wptr_kernel; + uint64_t wptr_gpu_addr; + struct kfd_mem_obj *pq; + uint64_t pq_gpu_addr; + uint32_t *pq_kernel_addr; + + struct kfd_mem_obj *fence_mem_obj; + uint64_t fence_gpu_addr; + void *fence_kernel_address; + + struct list_head list; +}; + +#endif /* KFD_KERNEL_QUEUE_H_ */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c new file mode 100644 index 000000000000..95d5af138e6e --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c @@ -0,0 +1,159 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/moduleparam.h> +#include <linux/device.h> +#include "kfd_priv.h" + +#define KFD_DRIVER_AUTHOR "AMD Inc. and others" + +#define KFD_DRIVER_DESC "Standalone HSA driver for AMD's GPUs" +#define KFD_DRIVER_DATE "20141113" +#define KFD_DRIVER_MAJOR 0 +#define KFD_DRIVER_MINOR 7 +#define KFD_DRIVER_PATCHLEVEL 0 + +const struct kfd2kgd_calls *kfd2kgd; +static const struct kgd2kfd_calls kgd2kfd = { + .exit = kgd2kfd_exit, + .probe = kgd2kfd_probe, + .device_init = kgd2kfd_device_init, + .device_exit = kgd2kfd_device_exit, + .interrupt = kgd2kfd_interrupt, + .suspend = kgd2kfd_suspend, + .resume = kgd2kfd_resume, +}; + +int sched_policy = KFD_SCHED_POLICY_HWS; +module_param(sched_policy, int, 0444); +MODULE_PARM_DESC(sched_policy, + "Kernel cmdline parameter that defines the amdkfd scheduling policy"); + +int max_num_of_processes = KFD_MAX_NUM_OF_PROCESSES_DEFAULT; +module_param(max_num_of_processes, int, 0444); +MODULE_PARM_DESC(max_num_of_processes, + "Kernel cmdline parameter that defines the amdkfd maximum number of supported processes"); + +int max_num_of_queues_per_process = KFD_MAX_NUM_OF_QUEUES_PER_PROCESS_DEFAULT; +module_param(max_num_of_queues_per_process, int, 0444); +MODULE_PARM_DESC(max_num_of_queues_per_process, + "Kernel cmdline parameter that defines the amdkfd maximum number of supported queues per process"); + +bool kgd2kfd_init(unsigned interface_version, + const struct kfd2kgd_calls *f2g, + const struct kgd2kfd_calls **g2f) +{ + /* + * Only one interface version is supported, + * no kfd/kgd version skew allowed. + */ + if (interface_version != KFD_INTERFACE_VERSION) + return false; + + /* Protection against multiple amd kgd loads */ + if (kfd2kgd) + return true; + + kfd2kgd = f2g; + *g2f = &kgd2kfd; + + return true; +} +EXPORT_SYMBOL(kgd2kfd_init); + +void kgd2kfd_exit(void) +{ +} + +static int __init kfd_module_init(void) +{ + int err; + + kfd2kgd = NULL; + + /* Verify module parameters */ + if ((sched_policy < KFD_SCHED_POLICY_HWS) || + (sched_policy > KFD_SCHED_POLICY_NO_HWS)) { + pr_err("kfd: sched_policy has invalid value\n"); + return -1; + } + + /* Verify module parameters */ + if ((max_num_of_processes < 0) || + (max_num_of_processes > KFD_MAX_NUM_OF_PROCESSES)) { + pr_err("kfd: max_num_of_processes must be between 0 to KFD_MAX_NUM_OF_PROCESSES\n"); + return -1; + } + + if ((max_num_of_queues_per_process < 0) || + (max_num_of_queues_per_process > + KFD_MAX_NUM_OF_QUEUES_PER_PROCESS)) { + pr_err("kfd: max_num_of_queues_per_process must be between 0 to KFD_MAX_NUM_OF_QUEUES_PER_PROCESS\n"); + return -1; + } + + err = kfd_pasid_init(); + if (err < 0) + goto err_pasid; + + err = kfd_chardev_init(); + if (err < 0) + goto err_ioctl; + + err = kfd_topology_init(); + if (err < 0) + goto err_topology; + + kfd_process_create_wq(); + + dev_info(kfd_device, "Initialized module\n"); + + return 0; + +err_topology: + kfd_chardev_exit(); +err_ioctl: + kfd_pasid_exit(); +err_pasid: + return err; +} + +static void __exit kfd_module_exit(void) +{ + kfd_process_destroy_wq(); + kfd_topology_shutdown(); + kfd_chardev_exit(); + kfd_pasid_exit(); + dev_info(kfd_device, "Removed module\n"); +} + +module_init(kfd_module_init); +module_exit(kfd_module_exit); + +MODULE_AUTHOR(KFD_DRIVER_AUTHOR); +MODULE_DESCRIPTION(KFD_DRIVER_DESC); +MODULE_LICENSE("GPL and additional rights"); +MODULE_VERSION(__stringify(KFD_DRIVER_MAJOR) "." + __stringify(KFD_DRIVER_MINOR) "." + __stringify(KFD_DRIVER_PATCHLEVEL)); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c new file mode 100644 index 000000000000..adc31474e786 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c @@ -0,0 +1,346 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include <linux/printk.h> +#include <linux/slab.h> +#include "kfd_priv.h" +#include "kfd_mqd_manager.h" +#include "cik_regs.h" +#include "../../radeon/cik_reg.h" + +inline void busy_wait(unsigned long ms) +{ + while (time_before(jiffies, ms)) + cpu_relax(); +} + +static inline struct cik_mqd *get_mqd(void *mqd) +{ + return (struct cik_mqd *)mqd; +} + +static int init_mqd(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + uint64_t addr; + struct cik_mqd *m; + int retval; + + BUG_ON(!mm || !q || !mqd); + + pr_debug("kfd: In func %s\n", __func__); + + retval = kfd2kgd->allocate_mem(mm->dev->kgd, + sizeof(struct cik_mqd), + 256, + KFD_MEMPOOL_SYSTEM_WRITECOMBINE, + (struct kgd_mem **) mqd_mem_obj); + + if (retval != 0) + return -ENOMEM; + + m = (struct cik_mqd *) (*mqd_mem_obj)->cpu_ptr; + addr = (*mqd_mem_obj)->gpu_addr; + + memset(m, 0, ALIGN(sizeof(struct cik_mqd), 256)); + + m->header = 0xC0310800; + m->compute_pipelinestat_enable = 1; + m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF; + + /* + * Make sure to use the last queue state saved on mqd when the cp + * reassigns the queue, so when queue is switched on/off (e.g over + * subscription or quantum timeout) the context will be consistent + */ + m->cp_hqd_persistent_state = + DEFAULT_CP_HQD_PERSISTENT_STATE | PRELOAD_REQ; + + m->cp_mqd_control = MQD_CONTROL_PRIV_STATE_EN; + m->cp_mqd_base_addr_lo = lower_32_bits(addr); + m->cp_mqd_base_addr_hi = upper_32_bits(addr); + + m->cp_hqd_ib_control = DEFAULT_MIN_IB_AVAIL_SIZE | IB_ATC_EN; + /* Although WinKFD writes this, I suspect it should not be necessary */ + m->cp_hqd_ib_control = IB_ATC_EN | DEFAULT_MIN_IB_AVAIL_SIZE; + + m->cp_hqd_quantum = QUANTUM_EN | QUANTUM_SCALE_1MS | + QUANTUM_DURATION(10); + + /* + * Pipe Priority + * Identifies the pipe relative priority when this queue is connected + * to the pipeline. The pipe priority is against the GFX pipe and HP3D. + * In KFD we are using a fixed pipe priority set to CS_MEDIUM. + * 0 = CS_LOW (typically below GFX) + * 1 = CS_MEDIUM (typically between HP3D and GFX + * 2 = CS_HIGH (typically above HP3D) + */ + m->cp_hqd_pipe_priority = 1; + m->cp_hqd_queue_priority = 15; + + *mqd = m; + if (gart_addr != NULL) + *gart_addr = addr; + retval = mm->update_mqd(mm, m, q); + + return retval; +} + +static void uninit_mqd(struct mqd_manager *mm, void *mqd, + struct kfd_mem_obj *mqd_mem_obj) +{ + BUG_ON(!mm || !mqd); + kfd2kgd->free_mem(mm->dev->kgd, (struct kgd_mem *) mqd_mem_obj); +} + +static int load_mqd(struct mqd_manager *mm, void *mqd, uint32_t pipe_id, + uint32_t queue_id, uint32_t __user *wptr) +{ + return kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id, wptr); + +} + +static int update_mqd(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct cik_mqd *m; + + BUG_ON(!mm || !q || !mqd); + + pr_debug("kfd: In func %s\n", __func__); + + m = get_mqd(mqd); + m->cp_hqd_pq_control = DEFAULT_RPTR_BLOCK_SIZE | + DEFAULT_MIN_AVAIL_SIZE | PQ_ATC_EN; + + /* + * Calculating queue size which is log base 2 of actual queue size -1 + * dwords and another -1 for ffs + */ + m->cp_hqd_pq_control |= ffs(q->queue_size / sizeof(unsigned int)) + - 1 - 1; + m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); + m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); + m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_doorbell_control = DOORBELL_EN | + DOORBELL_OFFSET(q->doorbell_off); + + m->cp_hqd_vmid = q->vmid; + + if (q->format == KFD_QUEUE_FORMAT_AQL) { + m->cp_hqd_iq_rptr = AQL_ENABLE; + m->cp_hqd_pq_control |= NO_UPDATE_RPTR; + } + + m->cp_hqd_active = 0; + q->is_active = false; + if (q->queue_size > 0 && + q->queue_address != 0 && + q->queue_percent > 0) { + m->cp_hqd_active = 1; + q->is_active = true; + } + + return 0; +} + +static int destroy_mqd(struct mqd_manager *mm, void *mqd, + enum kfd_preempt_type type, + unsigned int timeout, uint32_t pipe_id, + uint32_t queue_id) +{ + return kfd2kgd->hqd_destroy(mm->dev->kgd, type, timeout, + pipe_id, queue_id); +} + +static bool is_occupied(struct mqd_manager *mm, void *mqd, + uint64_t queue_address, uint32_t pipe_id, + uint32_t queue_id) +{ + + return kfd2kgd->hqd_is_occupies(mm->dev->kgd, queue_address, + pipe_id, queue_id); + +} + +/* + * HIQ MQD Implementation, concrete implementation for HIQ MQD implementation. + * The HIQ queue in Kaveri is using the same MQD structure as all the user mode + * queues but with different initial values. + */ + +static int init_mqd_hiq(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + uint64_t addr; + struct cik_mqd *m; + int retval; + + BUG_ON(!mm || !q || !mqd || !mqd_mem_obj); + + pr_debug("kfd: In func %s\n", __func__); + + retval = kfd2kgd->allocate_mem(mm->dev->kgd, + sizeof(struct cik_mqd), + 256, + KFD_MEMPOOL_SYSTEM_WRITECOMBINE, + (struct kgd_mem **) mqd_mem_obj); + + if (retval != 0) + return -ENOMEM; + + m = (struct cik_mqd *) (*mqd_mem_obj)->cpu_ptr; + addr = (*mqd_mem_obj)->gpu_addr; + + memset(m, 0, ALIGN(sizeof(struct cik_mqd), 256)); + + m->header = 0xC0310800; + m->compute_pipelinestat_enable = 1; + m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF; + + m->cp_hqd_persistent_state = DEFAULT_CP_HQD_PERSISTENT_STATE | + PRELOAD_REQ; + m->cp_hqd_quantum = QUANTUM_EN | QUANTUM_SCALE_1MS | + QUANTUM_DURATION(10); + + m->cp_mqd_control = MQD_CONTROL_PRIV_STATE_EN; + m->cp_mqd_base_addr_lo = lower_32_bits(addr); + m->cp_mqd_base_addr_hi = upper_32_bits(addr); + + m->cp_hqd_ib_control = DEFAULT_MIN_IB_AVAIL_SIZE; + + /* + * Pipe Priority + * Identifies the pipe relative priority when this queue is connected + * to the pipeline. The pipe priority is against the GFX pipe and HP3D. + * In KFD we are using a fixed pipe priority set to CS_MEDIUM. + * 0 = CS_LOW (typically below GFX) + * 1 = CS_MEDIUM (typically between HP3D and GFX + * 2 = CS_HIGH (typically above HP3D) + */ + m->cp_hqd_pipe_priority = 1; + m->cp_hqd_queue_priority = 15; + + *mqd = m; + if (gart_addr) + *gart_addr = addr; + retval = mm->update_mqd(mm, m, q); + + return retval; +} + +static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct cik_mqd *m; + + BUG_ON(!mm || !q || !mqd); + + pr_debug("kfd: In func %s\n", __func__); + + m = get_mqd(mqd); + m->cp_hqd_pq_control = DEFAULT_RPTR_BLOCK_SIZE | + DEFAULT_MIN_AVAIL_SIZE | + PRIV_STATE | + KMD_QUEUE; + + /* + * Calculating queue size which is log base 2 of actual queue + * size -1 dwords + */ + m->cp_hqd_pq_control |= ffs(q->queue_size / sizeof(unsigned int)) + - 1 - 1; + m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); + m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); + m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_doorbell_control = DOORBELL_EN | + DOORBELL_OFFSET(q->doorbell_off); + + m->cp_hqd_vmid = q->vmid; + + m->cp_hqd_active = 0; + q->is_active = false; + if (q->queue_size > 0 && + q->queue_address != 0 && + q->queue_percent > 0) { + m->cp_hqd_active = 1; + q->is_active = true; + } + + return 0; +} + +struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type, + struct kfd_dev *dev) +{ + struct mqd_manager *mqd; + + BUG_ON(!dev); + BUG_ON(type >= KFD_MQD_TYPE_MAX); + + pr_debug("kfd: In func %s\n", __func__); + + mqd = kzalloc(sizeof(struct mqd_manager), GFP_KERNEL); + if (!mqd) + return NULL; + + mqd->dev = dev; + + switch (type) { + case KFD_MQD_TYPE_CIK_CP: + case KFD_MQD_TYPE_CIK_COMPUTE: + mqd->init_mqd = init_mqd; + mqd->uninit_mqd = uninit_mqd; + mqd->load_mqd = load_mqd; + mqd->update_mqd = update_mqd; + mqd->destroy_mqd = destroy_mqd; + mqd->is_occupied = is_occupied; + break; + case KFD_MQD_TYPE_CIK_HIQ: + mqd->init_mqd = init_mqd_hiq; + mqd->uninit_mqd = uninit_mqd; + mqd->load_mqd = load_mqd; + mqd->update_mqd = update_mqd_hiq; + mqd->destroy_mqd = destroy_mqd; + mqd->is_occupied = is_occupied; + break; + default: + kfree(mqd); + return NULL; + } + + return mqd; +} + +/* SDMA queues should be implemented here when the cp will supports them */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h new file mode 100644 index 000000000000..213a71e0b6c7 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h @@ -0,0 +1,91 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef KFD_MQD_MANAGER_H_ +#define KFD_MQD_MANAGER_H_ + +#include "kfd_priv.h" + +/** + * struct mqd_manager + * + * @init_mqd: Allocates the mqd buffer on local gpu memory and initialize it. + * + * @load_mqd: Loads the mqd to a concrete hqd slot. Used only for no cp + * scheduling mode. + * + * @update_mqd: Handles a update call for the MQD + * + * @destroy_mqd: Destroys the HQD slot and by that preempt the relevant queue. + * Used only for no cp scheduling. + * + * @uninit_mqd: Releases the mqd buffer from local gpu memory. + * + * @is_occupied: Checks if the relevant HQD slot is occupied. + * + * @mqd_mutex: Mqd manager mutex. + * + * @dev: The kfd device structure coupled with this module. + * + * MQD stands for Memory Queue Descriptor which represents the current queue + * state in the memory and initiate the HQD (Hardware Queue Descriptor) state. + * This structure is actually a base class for the different types of MQDs + * structures for the variant ASICs that should be supported in the future. + * This base class is also contains all the MQD specific operations. + * Another important thing to mention is that each queue has a MQD that keeps + * his state (or context) after each preemption or reassignment. + * Basically there are a instances of the mqd manager class per MQD type per + * ASIC. Currently the kfd driver supports only Kaveri so there are instances + * per KFD_MQD_TYPE for each device. + * + */ + +struct mqd_manager { + int (*init_mqd)(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q); + + int (*load_mqd)(struct mqd_manager *mm, void *mqd, + uint32_t pipe_id, uint32_t queue_id, + uint32_t __user *wptr); + + int (*update_mqd)(struct mqd_manager *mm, void *mqd, + struct queue_properties *q); + + int (*destroy_mqd)(struct mqd_manager *mm, void *mqd, + enum kfd_preempt_type type, + unsigned int timeout, uint32_t pipe_id, + uint32_t queue_id); + + void (*uninit_mqd)(struct mqd_manager *mm, void *mqd, + struct kfd_mem_obj *mqd_mem_obj); + + bool (*is_occupied)(struct mqd_manager *mm, void *mqd, + uint64_t queue_address, uint32_t pipe_id, + uint32_t queue_id); + + struct mutex mqd_mutex; + struct kfd_dev *dev; +}; + +#endif /* KFD_MQD_MANAGER_H_ */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c new file mode 100644 index 000000000000..5ce9233d2004 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c @@ -0,0 +1,565 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include <linux/slab.h> +#include <linux/mutex.h> +#include "kfd_device_queue_manager.h" +#include "kfd_kernel_queue.h" +#include "kfd_priv.h" +#include "kfd_pm4_headers.h" +#include "kfd_pm4_opcodes.h" + +static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes, + unsigned int buffer_size_bytes) +{ + unsigned int temp = *wptr + increment_bytes / sizeof(uint32_t); + + BUG_ON((temp * sizeof(uint32_t)) > buffer_size_bytes); + *wptr = temp; +} + +static unsigned int build_pm4_header(unsigned int opcode, size_t packet_size) +{ + union PM4_MES_TYPE_3_HEADER header; + + header.u32all = 0; + header.opcode = opcode; + header.count = packet_size/sizeof(uint32_t) - 2; + header.type = PM4_TYPE_3; + + return header.u32all; +} + +static void pm_calc_rlib_size(struct packet_manager *pm, + unsigned int *rlib_size, + bool *over_subscription) +{ + unsigned int process_count, queue_count; + + BUG_ON(!pm || !rlib_size || !over_subscription); + + process_count = pm->dqm->processes_count; + queue_count = pm->dqm->queue_count; + + /* check if there is over subscription*/ + *over_subscription = false; + if ((process_count > 1) || + queue_count > PIPE_PER_ME_CP_SCHEDULING * QUEUES_PER_PIPE) { + *over_subscription = true; + pr_debug("kfd: over subscribed runlist\n"); + } + + /* calculate run list ib allocation size */ + *rlib_size = process_count * sizeof(struct pm4_map_process) + + queue_count * sizeof(struct pm4_map_queues); + + /* + * Increase the allocation size in case we need a chained run list + * when over subscription + */ + if (*over_subscription) + *rlib_size += sizeof(struct pm4_runlist); + + pr_debug("kfd: runlist ib size %d\n", *rlib_size); +} + +static int pm_allocate_runlist_ib(struct packet_manager *pm, + unsigned int **rl_buffer, + uint64_t *rl_gpu_buffer, + unsigned int *rl_buffer_size, + bool *is_over_subscription) +{ + int retval; + + BUG_ON(!pm); + BUG_ON(pm->allocated == true); + BUG_ON(is_over_subscription == NULL); + + pm_calc_rlib_size(pm, rl_buffer_size, is_over_subscription); + + retval = kfd2kgd->allocate_mem(pm->dqm->dev->kgd, + *rl_buffer_size, + PAGE_SIZE, + KFD_MEMPOOL_SYSTEM_WRITECOMBINE, + (struct kgd_mem **) &pm->ib_buffer_obj); + + if (retval != 0) { + pr_err("kfd: failed to allocate runlist IB\n"); + return retval; + } + + *(void **)rl_buffer = pm->ib_buffer_obj->cpu_ptr; + *rl_gpu_buffer = pm->ib_buffer_obj->gpu_addr; + + memset(*rl_buffer, 0, *rl_buffer_size); + pm->allocated = true; + return retval; +} + +static int pm_create_runlist(struct packet_manager *pm, uint32_t *buffer, + uint64_t ib, size_t ib_size_in_dwords, bool chain) +{ + struct pm4_runlist *packet; + + BUG_ON(!pm || !buffer || !ib); + + packet = (struct pm4_runlist *)buffer; + + memset(buffer, 0, sizeof(struct pm4_runlist)); + packet->header.u32all = build_pm4_header(IT_RUN_LIST, + sizeof(struct pm4_runlist)); + + packet->bitfields4.ib_size = ib_size_in_dwords; + packet->bitfields4.chain = chain ? 1 : 0; + packet->bitfields4.offload_polling = 0; + packet->bitfields4.valid = 1; + packet->ordinal2 = lower_32_bits(ib); + packet->bitfields3.ib_base_hi = upper_32_bits(ib); + + return 0; +} + +static int pm_create_map_process(struct packet_manager *pm, uint32_t *buffer, + struct qcm_process_device *qpd) +{ + struct pm4_map_process *packet; + struct queue *cur; + uint32_t num_queues; + + BUG_ON(!pm || !buffer || !qpd); + + packet = (struct pm4_map_process *)buffer; + + pr_debug("kfd: In func %s\n", __func__); + + memset(buffer, 0, sizeof(struct pm4_map_process)); + + packet->header.u32all = build_pm4_header(IT_MAP_PROCESS, + sizeof(struct pm4_map_process)); + packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; + packet->bitfields2.process_quantum = 1; + packet->bitfields2.pasid = qpd->pqm->process->pasid; + packet->bitfields3.page_table_base = qpd->page_table_base; + packet->bitfields10.gds_size = qpd->gds_size; + packet->bitfields10.num_gws = qpd->num_gws; + packet->bitfields10.num_oac = qpd->num_oac; + num_queues = 0; + list_for_each_entry(cur, &qpd->queues_list, list) + num_queues++; + packet->bitfields10.num_queues = num_queues; + + packet->sh_mem_config = qpd->sh_mem_config; + packet->sh_mem_bases = qpd->sh_mem_bases; + packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; + packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; + + packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); + packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); + + return 0; +} + +static int pm_create_map_queue(struct packet_manager *pm, uint32_t *buffer, + struct queue *q) +{ + struct pm4_map_queues *packet; + + BUG_ON(!pm || !buffer || !q); + + pr_debug("kfd: In func %s\n", __func__); + + packet = (struct pm4_map_queues *)buffer; + memset(buffer, 0, sizeof(struct pm4_map_queues)); + + packet->header.u32all = build_pm4_header(IT_MAP_QUEUES, + sizeof(struct pm4_map_queues)); + packet->bitfields2.alloc_format = + alloc_format__mes_map_queues__one_per_pipe; + packet->bitfields2.num_queues = 1; + packet->bitfields2.queue_sel = + queue_sel__mes_map_queues__map_to_hws_determined_queue_slots; + + packet->bitfields2.vidmem = (q->properties.is_interop) ? + vidmem__mes_map_queues__uses_video_memory : + vidmem__mes_map_queues__uses_no_video_memory; + + switch (q->properties.type) { + case KFD_QUEUE_TYPE_COMPUTE: + case KFD_QUEUE_TYPE_DIQ: + packet->bitfields2.engine_sel = + engine_sel__mes_map_queues__compute; + break; + case KFD_QUEUE_TYPE_SDMA: + packet->bitfields2.engine_sel = + engine_sel__mes_map_queues__sdma0; + break; + default: + BUG(); + break; + } + + packet->mes_map_queues_ordinals[0].bitfields3.doorbell_offset = + q->properties.doorbell_off; + + packet->mes_map_queues_ordinals[0].mqd_addr_lo = + lower_32_bits(q->gart_mqd_addr); + + packet->mes_map_queues_ordinals[0].mqd_addr_hi = + upper_32_bits(q->gart_mqd_addr); + + packet->mes_map_queues_ordinals[0].wptr_addr_lo = + lower_32_bits((uint64_t)q->properties.write_ptr); + + packet->mes_map_queues_ordinals[0].wptr_addr_hi = + upper_32_bits((uint64_t)q->properties.write_ptr); + + return 0; +} + +static int pm_create_runlist_ib(struct packet_manager *pm, + struct list_head *queues, + uint64_t *rl_gpu_addr, + size_t *rl_size_bytes) +{ + unsigned int alloc_size_bytes; + unsigned int *rl_buffer, rl_wptr, i; + int retval, proccesses_mapped; + struct device_process_node *cur; + struct qcm_process_device *qpd; + struct queue *q; + struct kernel_queue *kq; + bool is_over_subscription; + + BUG_ON(!pm || !queues || !rl_size_bytes || !rl_gpu_addr); + + rl_wptr = retval = proccesses_mapped = 0; + + retval = pm_allocate_runlist_ib(pm, &rl_buffer, rl_gpu_addr, + &alloc_size_bytes, &is_over_subscription); + if (retval != 0) + return retval; + + *rl_size_bytes = alloc_size_bytes; + + pr_debug("kfd: In func %s\n", __func__); + pr_debug("kfd: building runlist ib process count: %d queues count %d\n", + pm->dqm->processes_count, pm->dqm->queue_count); + + /* build the run list ib packet */ + list_for_each_entry(cur, queues, list) { + qpd = cur->qpd; + /* build map process packet */ + if (proccesses_mapped >= pm->dqm->processes_count) { + pr_debug("kfd: not enough space left in runlist IB\n"); + pm_release_ib(pm); + return -ENOMEM; + } + retval = pm_create_map_process(pm, &rl_buffer[rl_wptr], qpd); + if (retval != 0) + return retval; + proccesses_mapped++; + inc_wptr(&rl_wptr, sizeof(struct pm4_map_process), + alloc_size_bytes); + + list_for_each_entry(kq, &qpd->priv_queue_list, list) { + if (kq->queue->properties.is_active != true) + continue; + retval = pm_create_map_queue(pm, &rl_buffer[rl_wptr], + kq->queue); + if (retval != 0) + return retval; + inc_wptr(&rl_wptr, sizeof(struct pm4_map_queues), + alloc_size_bytes); + } + + list_for_each_entry(q, &qpd->queues_list, list) { + if (q->properties.is_active != true) + continue; + retval = pm_create_map_queue(pm, + &rl_buffer[rl_wptr], q); + if (retval != 0) + return retval; + inc_wptr(&rl_wptr, sizeof(struct pm4_map_queues), + alloc_size_bytes); + } + } + + pr_debug("kfd: finished map process and queues to runlist\n"); + + if (is_over_subscription) + pm_create_runlist(pm, &rl_buffer[rl_wptr], *rl_gpu_addr, + alloc_size_bytes / sizeof(uint32_t), true); + + for (i = 0; i < alloc_size_bytes / sizeof(uint32_t); i++) + pr_debug("0x%2X ", rl_buffer[i]); + pr_debug("\n"); + + return 0; +} + +int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm) +{ + BUG_ON(!dqm); + + pm->dqm = dqm; + mutex_init(&pm->lock); + pm->priv_queue = kernel_queue_init(dqm->dev, KFD_QUEUE_TYPE_HIQ); + if (pm->priv_queue == NULL) { + mutex_destroy(&pm->lock); + return -ENOMEM; + } + pm->allocated = false; + + return 0; +} + +void pm_uninit(struct packet_manager *pm) +{ + BUG_ON(!pm); + + mutex_destroy(&pm->lock); + kernel_queue_uninit(pm->priv_queue); +} + +int pm_send_set_resources(struct packet_manager *pm, + struct scheduling_resources *res) +{ + struct pm4_set_resources *packet; + + BUG_ON(!pm || !res); + + pr_debug("kfd: In func %s\n", __func__); + + mutex_lock(&pm->lock); + pm->priv_queue->acquire_packet_buffer(pm->priv_queue, + sizeof(*packet) / sizeof(uint32_t), + (unsigned int **)&packet); + if (packet == NULL) { + mutex_unlock(&pm->lock); + pr_err("kfd: failed to allocate buffer on kernel queue\n"); + return -ENOMEM; + } + + memset(packet, 0, sizeof(struct pm4_set_resources)); + packet->header.u32all = build_pm4_header(IT_SET_RESOURCES, + sizeof(struct pm4_set_resources)); + + packet->bitfields2.queue_type = + queue_type__mes_set_resources__hsa_interface_queue_hiq; + packet->bitfields2.vmid_mask = res->vmid_mask; + packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY; + packet->bitfields7.oac_mask = res->oac_mask; + packet->bitfields8.gds_heap_base = res->gds_heap_base; + packet->bitfields8.gds_heap_size = res->gds_heap_size; + + packet->gws_mask_lo = lower_32_bits(res->gws_mask); + packet->gws_mask_hi = upper_32_bits(res->gws_mask); + + packet->queue_mask_lo = lower_32_bits(res->queue_mask); + packet->queue_mask_hi = upper_32_bits(res->queue_mask); + + pm->priv_queue->submit_packet(pm->priv_queue); + pm->priv_queue->sync_with_hw(pm->priv_queue, KFD_HIQ_TIMEOUT); + + mutex_unlock(&pm->lock); + + return 0; +} + +int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues) +{ + uint64_t rl_gpu_ib_addr; + uint32_t *rl_buffer; + size_t rl_ib_size, packet_size_dwords; + int retval; + + BUG_ON(!pm || !dqm_queues); + + retval = pm_create_runlist_ib(pm, dqm_queues, &rl_gpu_ib_addr, + &rl_ib_size); + if (retval != 0) + goto fail_create_runlist_ib; + + pr_debug("kfd: runlist IB address: 0x%llX\n", rl_gpu_ib_addr); + + packet_size_dwords = sizeof(struct pm4_runlist) / sizeof(uint32_t); + mutex_lock(&pm->lock); + + retval = pm->priv_queue->acquire_packet_buffer(pm->priv_queue, + packet_size_dwords, &rl_buffer); + if (retval != 0) + goto fail_acquire_packet_buffer; + + retval = pm_create_runlist(pm, rl_buffer, rl_gpu_ib_addr, + rl_ib_size / sizeof(uint32_t), false); + if (retval != 0) + goto fail_create_runlist; + + pm->priv_queue->submit_packet(pm->priv_queue); + pm->priv_queue->sync_with_hw(pm->priv_queue, KFD_HIQ_TIMEOUT); + + mutex_unlock(&pm->lock); + + return retval; + +fail_create_runlist: + pm->priv_queue->rollback_packet(pm->priv_queue); +fail_acquire_packet_buffer: + mutex_unlock(&pm->lock); +fail_create_runlist_ib: + if (pm->allocated == true) + pm_release_ib(pm); + return retval; +} + +int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address, + uint32_t fence_value) +{ + int retval; + struct pm4_query_status *packet; + + BUG_ON(!pm || !fence_address); + + mutex_lock(&pm->lock); + retval = pm->priv_queue->acquire_packet_buffer( + pm->priv_queue, + sizeof(struct pm4_query_status) / sizeof(uint32_t), + (unsigned int **)&packet); + if (retval != 0) + goto fail_acquire_packet_buffer; + + packet->header.u32all = build_pm4_header(IT_QUERY_STATUS, + sizeof(struct pm4_query_status)); + + packet->bitfields2.context_id = 0; + packet->bitfields2.interrupt_sel = + interrupt_sel__mes_query_status__completion_status; + packet->bitfields2.command = + command__mes_query_status__fence_only_after_write_ack; + + packet->addr_hi = upper_32_bits((uint64_t)fence_address); + packet->addr_lo = lower_32_bits((uint64_t)fence_address); + packet->data_hi = upper_32_bits((uint64_t)fence_value); + packet->data_lo = lower_32_bits((uint64_t)fence_value); + + pm->priv_queue->submit_packet(pm->priv_queue); + pm->priv_queue->sync_with_hw(pm->priv_queue, KFD_HIQ_TIMEOUT); + mutex_unlock(&pm->lock); + + return 0; + +fail_acquire_packet_buffer: + mutex_unlock(&pm->lock); + return retval; +} + +int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, + enum kfd_preempt_type_filter mode, + uint32_t filter_param, bool reset, + unsigned int sdma_engine) +{ + int retval; + uint32_t *buffer; + struct pm4_unmap_queues *packet; + + BUG_ON(!pm); + + mutex_lock(&pm->lock); + retval = pm->priv_queue->acquire_packet_buffer( + pm->priv_queue, + sizeof(struct pm4_unmap_queues) / sizeof(uint32_t), + &buffer); + if (retval != 0) + goto err_acquire_packet_buffer; + + packet = (struct pm4_unmap_queues *)buffer; + memset(buffer, 0, sizeof(struct pm4_unmap_queues)); + + packet->header.u32all = build_pm4_header(IT_UNMAP_QUEUES, + sizeof(struct pm4_unmap_queues)); + switch (type) { + case KFD_QUEUE_TYPE_COMPUTE: + case KFD_QUEUE_TYPE_DIQ: + packet->bitfields2.engine_sel = + engine_sel__mes_unmap_queues__compute; + break; + case KFD_QUEUE_TYPE_SDMA: + packet->bitfields2.engine_sel = + engine_sel__mes_unmap_queues__sdma0 + sdma_engine; + break; + default: + BUG(); + break; + } + + if (reset) + packet->bitfields2.action = + action__mes_unmap_queues__reset_queues; + else + packet->bitfields2.action = + action__mes_unmap_queues__preempt_queues; + + switch (mode) { + case KFD_PREEMPT_TYPE_FILTER_SINGLE_QUEUE: + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__perform_request_on_specified_queues; + packet->bitfields2.num_queues = 1; + packet->bitfields3b.doorbell_offset0 = filter_param; + break; + case KFD_PREEMPT_TYPE_FILTER_BY_PASID: + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__perform_request_on_pasid_queues; + packet->bitfields3a.pasid = filter_param; + break; + case KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES: + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__perform_request_on_all_active_queues; + break; + default: + BUG(); + break; + }; + + pm->priv_queue->submit_packet(pm->priv_queue); + pm->priv_queue->sync_with_hw(pm->priv_queue, KFD_HIQ_TIMEOUT); + + mutex_unlock(&pm->lock); + return 0; + +err_acquire_packet_buffer: + mutex_unlock(&pm->lock); + return retval; +} + +void pm_release_ib(struct packet_manager *pm) +{ + BUG_ON(!pm); + + mutex_lock(&pm->lock); + if (pm->allocated) { + kfd2kgd->free_mem(pm->dqm->dev->kgd, + (struct kgd_mem *) pm->ib_buffer_obj); + pm->allocated = false; + } + mutex_unlock(&pm->lock); +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c b/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c new file mode 100644 index 000000000000..71699ad97d74 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c @@ -0,0 +1,96 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <linux/slab.h> +#include <linux/types.h> +#include "kfd_priv.h" + +static unsigned long *pasid_bitmap; +static unsigned int pasid_limit; +static DEFINE_MUTEX(pasid_mutex); + +int kfd_pasid_init(void) +{ + pasid_limit = max_num_of_processes; + + pasid_bitmap = kzalloc(BITS_TO_LONGS(pasid_limit), GFP_KERNEL); + if (!pasid_bitmap) + return -ENOMEM; + + set_bit(0, pasid_bitmap); /* PASID 0 is reserved. */ + + return 0; +} + +void kfd_pasid_exit(void) +{ + kfree(pasid_bitmap); +} + +bool kfd_set_pasid_limit(unsigned int new_limit) +{ + if (new_limit < pasid_limit) { + bool ok; + + mutex_lock(&pasid_mutex); + + /* ensure that no pasids >= new_limit are in-use */ + ok = (find_next_bit(pasid_bitmap, pasid_limit, new_limit) == + pasid_limit); + if (ok) + pasid_limit = new_limit; + + mutex_unlock(&pasid_mutex); + + return ok; + } + + return true; +} + +inline unsigned int kfd_get_pasid_limit(void) +{ + return pasid_limit; +} + +unsigned int kfd_pasid_alloc(void) +{ + unsigned int found; + + mutex_lock(&pasid_mutex); + + found = find_first_zero_bit(pasid_bitmap, pasid_limit); + if (found == pasid_limit) + found = 0; + else + set_bit(found, pasid_bitmap); + + mutex_unlock(&pasid_mutex); + + return found; +} + +void kfd_pasid_free(unsigned int pasid) +{ + BUG_ON(pasid == 0 || pasid >= pasid_limit); + clear_bit(pasid, pasid_bitmap); +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h new file mode 100644 index 000000000000..071ad5724bd2 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h @@ -0,0 +1,405 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef KFD_PM4_HEADERS_H_ +#define KFD_PM4_HEADERS_H_ + +#ifndef PM4_MES_HEADER_DEFINED +#define PM4_MES_HEADER_DEFINED +union PM4_MES_TYPE_3_HEADER { + struct { + uint32_t reserved1:8; /* < reserved */ + uint32_t opcode:8; /* < IT opcode */ + uint32_t count:14; /* < number of DWORDs - 1 + * in the information body. + */ + uint32_t type:2; /* < packet identifier. + * It should be 3 for type 3 packets + */ + }; + uint32_t u32all; +}; +#endif /* PM4_MES_HEADER_DEFINED */ + +/* --------------------MES_SET_RESOURCES-------------------- */ + +#ifndef PM4_MES_SET_RESOURCES_DEFINED +#define PM4_MES_SET_RESOURCES_DEFINED +enum set_resources_queue_type_enum { + queue_type__mes_set_resources__kernel_interface_queue_kiq = 0, + queue_type__mes_set_resources__hsa_interface_queue_hiq = 1, + queue_type__mes_set_resources__hsa_debug_interface_queue = 4 +}; + +struct pm4_set_resources { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + uint32_t vmid_mask:16; + uint32_t unmap_latency:8; + uint32_t reserved1:5; + enum set_resources_queue_type_enum queue_type:3; + } bitfields2; + uint32_t ordinal2; + }; + + uint32_t queue_mask_lo; + uint32_t queue_mask_hi; + uint32_t gws_mask_lo; + uint32_t gws_mask_hi; + + union { + struct { + uint32_t oac_mask:16; + uint32_t reserved2:16; + } bitfields7; + uint32_t ordinal7; + }; + + union { + struct { + uint32_t gds_heap_base:6; + uint32_t reserved3:5; + uint32_t gds_heap_size:6; + uint32_t reserved4:15; + } bitfields8; + uint32_t ordinal8; + }; + +}; +#endif + +/*--------------------MES_RUN_LIST-------------------- */ + +#ifndef PM4_MES_RUN_LIST_DEFINED +#define PM4_MES_RUN_LIST_DEFINED + +struct pm4_runlist { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + uint32_t reserved1:2; + uint32_t ib_base_lo:30; + } bitfields2; + uint32_t ordinal2; + }; + + union { + struct { + uint32_t ib_base_hi:16; + uint32_t reserved2:16; + } bitfields3; + uint32_t ordinal3; + }; + + union { + struct { + uint32_t ib_size:20; + uint32_t chain:1; + uint32_t offload_polling:1; + uint32_t reserved3:1; + uint32_t valid:1; + uint32_t reserved4:8; + } bitfields4; + uint32_t ordinal4; + }; + +}; +#endif + +/*--------------------MES_MAP_PROCESS-------------------- */ + +#ifndef PM4_MES_MAP_PROCESS_DEFINED +#define PM4_MES_MAP_PROCESS_DEFINED + +struct pm4_map_process { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + uint32_t pasid:16; + uint32_t reserved1:8; + uint32_t diq_enable:1; + uint32_t process_quantum:7; + } bitfields2; + uint32_t ordinal2; + }; + + union { + struct { + uint32_t page_table_base:28; + uint32_t reserved3:4; + } bitfields3; + uint32_t ordinal3; + }; + + uint32_t sh_mem_bases; + uint32_t sh_mem_ape1_base; + uint32_t sh_mem_ape1_limit; + uint32_t sh_mem_config; + uint32_t gds_addr_lo; + uint32_t gds_addr_hi; + + union { + struct { + uint32_t num_gws:6; + uint32_t reserved4:2; + uint32_t num_oac:4; + uint32_t reserved5:4; + uint32_t gds_size:6; + uint32_t num_queues:10; + } bitfields10; + uint32_t ordinal10; + }; + +}; +#endif + +/*--------------------MES_MAP_QUEUES--------------------*/ + +#ifndef PM4_MES_MAP_QUEUES_DEFINED +#define PM4_MES_MAP_QUEUES_DEFINED +enum map_queues_queue_sel_enum { + queue_sel__mes_map_queues__map_to_specified_queue_slots = 0, + queue_sel__mes_map_queues__map_to_hws_determined_queue_slots = 1, + queue_sel__mes_map_queues__enable_process_queues = 2 +}; + +enum map_queues_vidmem_enum { + vidmem__mes_map_queues__uses_no_video_memory = 0, + vidmem__mes_map_queues__uses_video_memory = 1 +}; + +enum map_queues_alloc_format_enum { + alloc_format__mes_map_queues__one_per_pipe = 0, + alloc_format__mes_map_queues__all_on_one_pipe = 1 +}; + +enum map_queues_engine_sel_enum { + engine_sel__mes_map_queues__compute = 0, + engine_sel__mes_map_queues__sdma0 = 2, + engine_sel__mes_map_queues__sdma1 = 3 +}; + +struct pm4_map_queues { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + uint32_t reserved1:4; + enum map_queues_queue_sel_enum queue_sel:2; + uint32_t reserved2:2; + uint32_t vmid:4; + uint32_t reserved3:4; + enum map_queues_vidmem_enum vidmem:2; + uint32_t reserved4:6; + enum map_queues_alloc_format_enum alloc_format:2; + enum map_queues_engine_sel_enum engine_sel:3; + uint32_t num_queues:3; + } bitfields2; + uint32_t ordinal2; + }; + + struct { + union { + struct { + uint32_t reserved5:2; + uint32_t doorbell_offset:21; + uint32_t reserved6:3; + uint32_t queue:6; + } bitfields3; + uint32_t ordinal3; + }; + + uint32_t mqd_addr_lo; + uint32_t mqd_addr_hi; + uint32_t wptr_addr_lo; + uint32_t wptr_addr_hi; + + } mes_map_queues_ordinals[1]; /* 1..N of these ordinal groups */ + +}; +#endif + +/*--------------------MES_QUERY_STATUS--------------------*/ + +#ifndef PM4_MES_QUERY_STATUS_DEFINED +#define PM4_MES_QUERY_STATUS_DEFINED +enum query_status_interrupt_sel_enum { + interrupt_sel__mes_query_status__completion_status = 0, + interrupt_sel__mes_query_status__process_status = 1, + interrupt_sel__mes_query_status__queue_status = 2 +}; + +enum query_status_command_enum { + command__mes_query_status__interrupt_only = 0, + command__mes_query_status__fence_only_immediate = 1, + command__mes_query_status__fence_only_after_write_ack = 2, + command__mes_query_status__fence_wait_for_write_ack_send_interrupt = 3 +}; + +enum query_status_engine_sel_enum { + engine_sel__mes_query_status__compute = 0, + engine_sel__mes_query_status__sdma0_queue = 2, + engine_sel__mes_query_status__sdma1_queue = 3 +}; + +struct pm4_query_status { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + uint32_t context_id:28; + enum query_status_interrupt_sel_enum interrupt_sel:2; + enum query_status_command_enum command:2; + } bitfields2; + uint32_t ordinal2; + }; + + union { + struct { + uint32_t pasid:16; + uint32_t reserved1:16; + } bitfields3a; + struct { + uint32_t reserved2:2; + uint32_t doorbell_offset:21; + uint32_t reserved3:3; + enum query_status_engine_sel_enum engine_sel:3; + uint32_t reserved4:3; + } bitfields3b; + uint32_t ordinal3; + }; + + uint32_t addr_lo; + uint32_t addr_hi; + uint32_t data_lo; + uint32_t data_hi; +}; +#endif + +/*--------------------MES_UNMAP_QUEUES--------------------*/ + +#ifndef PM4_MES_UNMAP_QUEUES_DEFINED +#define PM4_MES_UNMAP_QUEUES_DEFINED +enum unmap_queues_action_enum { + action__mes_unmap_queues__preempt_queues = 0, + action__mes_unmap_queues__reset_queues = 1, + action__mes_unmap_queues__disable_process_queues = 2 +}; + +enum unmap_queues_queue_sel_enum { + queue_sel__mes_unmap_queues__perform_request_on_specified_queues = 0, + queue_sel__mes_unmap_queues__perform_request_on_pasid_queues = 1, + queue_sel__mes_unmap_queues__perform_request_on_all_active_queues = 2 +}; + +enum unmap_queues_engine_sel_enum { + engine_sel__mes_unmap_queues__compute = 0, + engine_sel__mes_unmap_queues__sdma0 = 2, + engine_sel__mes_unmap_queues__sdma1 = 3 +}; + +struct pm4_unmap_queues { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + enum unmap_queues_action_enum action:2; + uint32_t reserved1:2; + enum unmap_queues_queue_sel_enum queue_sel:2; + uint32_t reserved2:20; + enum unmap_queues_engine_sel_enum engine_sel:3; + uint32_t num_queues:3; + } bitfields2; + uint32_t ordinal2; + }; + + union { + struct { + uint32_t pasid:16; + uint32_t reserved3:16; + } bitfields3a; + struct { + uint32_t reserved4:2; + uint32_t doorbell_offset0:21; + uint32_t reserved5:9; + } bitfields3b; + uint32_t ordinal3; + }; + + union { + struct { + uint32_t reserved6:2; + uint32_t doorbell_offset1:21; + uint32_t reserved7:9; + } bitfields4; + uint32_t ordinal4; + }; + + union { + struct { + uint32_t reserved8:2; + uint32_t doorbell_offset2:21; + uint32_t reserved9:9; + } bitfields5; + uint32_t ordinal5; + }; + + union { + struct { + uint32_t reserved10:2; + uint32_t doorbell_offset3:21; + uint32_t reserved11:9; + } bitfields6; + uint32_t ordinal6; + }; + +}; +#endif + +enum { + CACHE_FLUSH_AND_INV_TS_EVENT = 0x00000014 +}; + +#endif /* KFD_PM4_HEADERS_H_ */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_opcodes.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_opcodes.h new file mode 100644 index 000000000000..b72fa3b8c2d4 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_opcodes.h @@ -0,0 +1,107 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + + +#ifndef KFD_PM4_OPCODES_H +#define KFD_PM4_OPCODES_H + +enum it_opcode_type { + IT_NOP = 0x10, + IT_SET_BASE = 0x11, + IT_CLEAR_STATE = 0x12, + IT_INDEX_BUFFER_SIZE = 0x13, + IT_DISPATCH_DIRECT = 0x15, + IT_DISPATCH_INDIRECT = 0x16, + IT_ATOMIC_GDS = 0x1D, + IT_OCCLUSION_QUERY = 0x1F, + IT_SET_PREDICATION = 0x20, + IT_REG_RMW = 0x21, + IT_COND_EXEC = 0x22, + IT_PRED_EXEC = 0x23, + IT_DRAW_INDIRECT = 0x24, + IT_DRAW_INDEX_INDIRECT = 0x25, + IT_INDEX_BASE = 0x26, + IT_DRAW_INDEX_2 = 0x27, + IT_CONTEXT_CONTROL = 0x28, + IT_INDEX_TYPE = 0x2A, + IT_DRAW_INDIRECT_MULTI = 0x2C, + IT_DRAW_INDEX_AUTO = 0x2D, + IT_NUM_INSTANCES = 0x2F, + IT_DRAW_INDEX_MULTI_AUTO = 0x30, + IT_INDIRECT_BUFFER_CNST = 0x33, + IT_STRMOUT_BUFFER_UPDATE = 0x34, + IT_DRAW_INDEX_OFFSET_2 = 0x35, + IT_DRAW_PREAMBLE = 0x36, + IT_WRITE_DATA = 0x37, + IT_DRAW_INDEX_INDIRECT_MULTI = 0x38, + IT_MEM_SEMAPHORE = 0x39, + IT_COPY_DW = 0x3B, + IT_WAIT_REG_MEM = 0x3C, + IT_INDIRECT_BUFFER = 0x3F, + IT_COPY_DATA = 0x40, + IT_PFP_SYNC_ME = 0x42, + IT_SURFACE_SYNC = 0x43, + IT_COND_WRITE = 0x45, + IT_EVENT_WRITE = 0x46, + IT_EVENT_WRITE_EOP = 0x47, + IT_EVENT_WRITE_EOS = 0x48, + IT_RELEASE_MEM = 0x49, + IT_PREAMBLE_CNTL = 0x4A, + IT_DMA_DATA = 0x50, + IT_ACQUIRE_MEM = 0x58, + IT_REWIND = 0x59, + IT_LOAD_UCONFIG_REG = 0x5E, + IT_LOAD_SH_REG = 0x5F, + IT_LOAD_CONFIG_REG = 0x60, + IT_LOAD_CONTEXT_REG = 0x61, + IT_SET_CONFIG_REG = 0x68, + IT_SET_CONTEXT_REG = 0x69, + IT_SET_CONTEXT_REG_INDIRECT = 0x73, + IT_SET_SH_REG = 0x76, + IT_SET_SH_REG_OFFSET = 0x77, + IT_SET_QUEUE_REG = 0x78, + IT_SET_UCONFIG_REG = 0x79, + IT_SCRATCH_RAM_WRITE = 0x7D, + IT_SCRATCH_RAM_READ = 0x7E, + IT_LOAD_CONST_RAM = 0x80, + IT_WRITE_CONST_RAM = 0x81, + IT_DUMP_CONST_RAM = 0x83, + IT_INCREMENT_CE_COUNTER = 0x84, + IT_INCREMENT_DE_COUNTER = 0x85, + IT_WAIT_ON_CE_COUNTER = 0x86, + IT_WAIT_ON_DE_COUNTER_DIFF = 0x88, + IT_SWITCH_BUFFER = 0x8B, + IT_SET_RESOURCES = 0xA0, + IT_MAP_PROCESS = 0xA1, + IT_MAP_QUEUES = 0xA2, + IT_UNMAP_QUEUES = 0xA3, + IT_QUERY_STATUS = 0xA4, + IT_RUN_LIST = 0xA5, +}; + +#define PM4_TYPE_0 0 +#define PM4_TYPE_2 2 +#define PM4_TYPE_3 3 + +#endif /* KFD_PM4_OPCODES_H */ + diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h new file mode 100644 index 000000000000..f9fb81e3bb09 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -0,0 +1,600 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef KFD_PRIV_H_INCLUDED +#define KFD_PRIV_H_INCLUDED + +#include <linux/hashtable.h> +#include <linux/mmu_notifier.h> +#include <linux/mutex.h> +#include <linux/types.h> +#include <linux/atomic.h> +#include <linux/workqueue.h> +#include <linux/spinlock.h> +#include <linux/kfd_ioctl.h> +#include <kgd_kfd_interface.h> + +#define KFD_SYSFS_FILE_MODE 0444 + +/* + * When working with cp scheduler we should assign the HIQ manually or via + * the radeon driver to a fixed hqd slot, here are the fixed HIQ hqd slot + * definitions for Kaveri. In Kaveri only the first ME queues participates + * in the cp scheduling taking that in mind we set the HIQ slot in the + * second ME. + */ +#define KFD_CIK_HIQ_PIPE 4 +#define KFD_CIK_HIQ_QUEUE 0 + +/* GPU ID hash width in bits */ +#define KFD_GPU_ID_HASH_WIDTH 16 + +/* Macro for allocating structures */ +#define kfd_alloc_struct(ptr_to_struct) \ + ((typeof(ptr_to_struct)) kzalloc(sizeof(*ptr_to_struct), GFP_KERNEL)) + +/* Kernel module parameter to specify maximum number of supported processes */ +extern int max_num_of_processes; + +#define KFD_MAX_NUM_OF_PROCESSES_DEFAULT 32 +#define KFD_MAX_NUM_OF_PROCESSES 512 + +/* + * Kernel module parameter to specify maximum number of supported queues + * per process + */ +extern int max_num_of_queues_per_process; + +#define KFD_MAX_NUM_OF_QUEUES_PER_PROCESS_DEFAULT 128 +#define KFD_MAX_NUM_OF_QUEUES_PER_PROCESS 1024 + +#define KFD_KERNEL_QUEUE_SIZE 2048 + +/* Kernel module parameter to specify the scheduling policy */ +extern int sched_policy; + +/** + * enum kfd_sched_policy + * + * @KFD_SCHED_POLICY_HWS: H/W scheduling policy known as command processor (cp) + * scheduling. In this scheduling mode we're using the firmware code to + * schedule the user mode queues and kernel queues such as HIQ and DIQ. + * the HIQ queue is used as a special queue that dispatches the configuration + * to the cp and the user mode queues list that are currently running. + * the DIQ queue is a debugging queue that dispatches debugging commands to the + * firmware. + * in this scheduling mode user mode queues over subscription feature is + * enabled. + * + * @KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION: The same as above but the over + * subscription feature disabled. + * + * @KFD_SCHED_POLICY_NO_HWS: no H/W scheduling policy is a mode which directly + * set the command processor registers and sets the queues "manually". This + * mode is used *ONLY* for debugging proposes. + * + */ +enum kfd_sched_policy { + KFD_SCHED_POLICY_HWS = 0, + KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION, + KFD_SCHED_POLICY_NO_HWS +}; + +enum cache_policy { + cache_policy_coherent, + cache_policy_noncoherent +}; + +struct kfd_device_info { + unsigned int max_pasid_bits; + size_t ih_ring_entry_size; + uint16_t mqd_size_aligned; +}; + +struct kfd_dev { + struct kgd_dev *kgd; + + const struct kfd_device_info *device_info; + struct pci_dev *pdev; + + unsigned int id; /* topology stub index */ + + phys_addr_t doorbell_base; /* Start of actual doorbells used by + * KFD. It is aligned for mapping + * into user mode + */ + size_t doorbell_id_offset; /* Doorbell offset (from KFD doorbell + * to HW doorbell, GFX reserved some + * at the start) + */ + size_t doorbell_process_limit; /* Number of processes we have doorbell + * space for. + */ + u32 __iomem *doorbell_kernel_ptr; /* This is a pointer for a doorbells + * page used by kernel queue + */ + + struct kgd2kfd_shared_resources shared_resources; + + void *interrupt_ring; + size_t interrupt_ring_size; + atomic_t interrupt_ring_rptr; + atomic_t interrupt_ring_wptr; + struct work_struct interrupt_work; + spinlock_t interrupt_lock; + + /* QCM Device instance */ + struct device_queue_manager *dqm; + + bool init_complete; + /* + * Interrupts of interest to KFD are copied + * from the HW ring into a SW ring. + */ + bool interrupts_active; +}; + +/* KGD2KFD callbacks */ +void kgd2kfd_exit(void); +struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, struct pci_dev *pdev); +bool kgd2kfd_device_init(struct kfd_dev *kfd, + const struct kgd2kfd_shared_resources *gpu_resources); +void kgd2kfd_device_exit(struct kfd_dev *kfd); + +extern const struct kfd2kgd_calls *kfd2kgd; + +struct kfd_mem_obj { + void *bo; + uint64_t gpu_addr; + uint32_t *cpu_ptr; +}; + +enum kfd_mempool { + KFD_MEMPOOL_SYSTEM_CACHEABLE = 1, + KFD_MEMPOOL_SYSTEM_WRITECOMBINE = 2, + KFD_MEMPOOL_FRAMEBUFFER = 3, +}; + +/* Character device interface */ +int kfd_chardev_init(void); +void kfd_chardev_exit(void); +struct device *kfd_chardev(void); + +/** + * enum kfd_preempt_type_filter + * + * @KFD_PREEMPT_TYPE_FILTER_SINGLE_QUEUE: Preempts single queue. + * + * @KFD_PRERMPT_TYPE_FILTER_ALL_QUEUES: Preempts all queues in the + * running queues list. + * + * @KFD_PRERMPT_TYPE_FILTER_BY_PASID: Preempts queues that belongs to + * specific process. + * + */ +enum kfd_preempt_type_filter { + KFD_PREEMPT_TYPE_FILTER_SINGLE_QUEUE, + KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES, + KFD_PREEMPT_TYPE_FILTER_BY_PASID +}; + +enum kfd_preempt_type { + KFD_PREEMPT_TYPE_WAVEFRONT, + KFD_PREEMPT_TYPE_WAVEFRONT_RESET +}; + +/** + * enum kfd_queue_type + * + * @KFD_QUEUE_TYPE_COMPUTE: Regular user mode queue type. + * + * @KFD_QUEUE_TYPE_SDMA: Sdma user mode queue type. + * + * @KFD_QUEUE_TYPE_HIQ: HIQ queue type. + * + * @KFD_QUEUE_TYPE_DIQ: DIQ queue type. + */ +enum kfd_queue_type { + KFD_QUEUE_TYPE_COMPUTE, + KFD_QUEUE_TYPE_SDMA, + KFD_QUEUE_TYPE_HIQ, + KFD_QUEUE_TYPE_DIQ +}; + +enum kfd_queue_format { + KFD_QUEUE_FORMAT_PM4, + KFD_QUEUE_FORMAT_AQL +}; + +/** + * struct queue_properties + * + * @type: The queue type. + * + * @queue_id: Queue identifier. + * + * @queue_address: Queue ring buffer address. + * + * @queue_size: Queue ring buffer size. + * + * @priority: Defines the queue priority relative to other queues in the + * process. + * This is just an indication and HW scheduling may override the priority as + * necessary while keeping the relative prioritization. + * the priority granularity is from 0 to f which f is the highest priority. + * currently all queues are initialized with the highest priority. + * + * @queue_percent: This field is partially implemented and currently a zero in + * this field defines that the queue is non active. + * + * @read_ptr: User space address which points to the number of dwords the + * cp read from the ring buffer. This field updates automatically by the H/W. + * + * @write_ptr: Defines the number of dwords written to the ring buffer. + * + * @doorbell_ptr: This field aim is to notify the H/W of new packet written to + * the queue ring buffer. This field should be similar to write_ptr and the user + * should update this field after he updated the write_ptr. + * + * @doorbell_off: The doorbell offset in the doorbell pci-bar. + * + * @is_interop: Defines if this is a interop queue. Interop queue means that the + * queue can access both graphics and compute resources. + * + * @is_active: Defines if the queue is active or not. + * + * @vmid: If the scheduling mode is no cp scheduling the field defines the vmid + * of the queue. + * + * This structure represents the queue properties for each queue no matter if + * it's user mode or kernel mode queue. + * + */ +struct queue_properties { + enum kfd_queue_type type; + enum kfd_queue_format format; + unsigned int queue_id; + uint64_t queue_address; + uint64_t queue_size; + uint32_t priority; + uint32_t queue_percent; + uint32_t *read_ptr; + uint32_t *write_ptr; + uint32_t __iomem *doorbell_ptr; + uint32_t doorbell_off; + bool is_interop; + bool is_active; + /* Not relevant for user mode queues in cp scheduling */ + unsigned int vmid; +}; + +/** + * struct queue + * + * @list: Queue linked list. + * + * @mqd: The queue MQD. + * + * @mqd_mem_obj: The MQD local gpu memory object. + * + * @gart_mqd_addr: The MQD gart mc address. + * + * @properties: The queue properties. + * + * @mec: Used only in no cp scheduling mode and identifies to micro engine id + * that the queue should be execute on. + * + * @pipe: Used only in no cp scheduling mode and identifies the queue's pipe id. + * + * @queue: Used only in no cp scheduliong mode and identifies the queue's slot. + * + * @process: The kfd process that created this queue. + * + * @device: The kfd device that created this queue. + * + * This structure represents user mode compute queues. + * It contains all the necessary data to handle such queues. + * + */ + +struct queue { + struct list_head list; + void *mqd; + struct kfd_mem_obj *mqd_mem_obj; + uint64_t gart_mqd_addr; + struct queue_properties properties; + + uint32_t mec; + uint32_t pipe; + uint32_t queue; + + struct kfd_process *process; + struct kfd_dev *device; +}; + +/* + * Please read the kfd_mqd_manager.h description. + */ +enum KFD_MQD_TYPE { + KFD_MQD_TYPE_CIK_COMPUTE = 0, /* for no cp scheduling */ + KFD_MQD_TYPE_CIK_HIQ, /* for hiq */ + KFD_MQD_TYPE_CIK_CP, /* for cp queues and diq */ + KFD_MQD_TYPE_CIK_SDMA, /* for sdma queues */ + KFD_MQD_TYPE_MAX +}; + +struct scheduling_resources { + unsigned int vmid_mask; + enum kfd_queue_type type; + uint64_t queue_mask; + uint64_t gws_mask; + uint32_t oac_mask; + uint32_t gds_heap_base; + uint32_t gds_heap_size; +}; + +struct process_queue_manager { + /* data */ + struct kfd_process *process; + unsigned int num_concurrent_processes; + struct list_head queues; + unsigned long *queue_slot_bitmap; +}; + +struct qcm_process_device { + /* The Device Queue Manager that owns this data */ + struct device_queue_manager *dqm; + struct process_queue_manager *pqm; + /* Device Queue Manager lock */ + struct mutex *lock; + /* Queues list */ + struct list_head queues_list; + struct list_head priv_queue_list; + + unsigned int queue_count; + unsigned int vmid; + bool is_debug; + /* + * All the memory management data should be here too + */ + uint64_t gds_context_area; + uint32_t sh_mem_config; + uint32_t sh_mem_bases; + uint32_t sh_mem_ape1_base; + uint32_t sh_mem_ape1_limit; + uint32_t page_table_base; + uint32_t gds_size; + uint32_t num_gws; + uint32_t num_oac; +}; + +/* Data that is per-process-per device. */ +struct kfd_process_device { + /* + * List of all per-device data for a process. + * Starts from kfd_process.per_device_data. + */ + struct list_head per_device_list; + + /* The device that owns this data. */ + struct kfd_dev *dev; + + + /* per-process-per device QCM data structure */ + struct qcm_process_device qpd; + + /*Apertures*/ + uint64_t lds_base; + uint64_t lds_limit; + uint64_t gpuvm_base; + uint64_t gpuvm_limit; + uint64_t scratch_base; + uint64_t scratch_limit; + + /* Is this process/pasid bound to this device? (amd_iommu_bind_pasid) */ + bool bound; +}; + +#define qpd_to_pdd(x) container_of(x, struct kfd_process_device, qpd) + +/* Process data */ +struct kfd_process { + /* + * kfd_process are stored in an mm_struct*->kfd_process* + * hash table (kfd_processes in kfd_process.c) + */ + struct hlist_node kfd_processes; + + struct mm_struct *mm; + + struct mutex mutex; + + /* + * In any process, the thread that started main() is the lead + * thread and outlives the rest. + * It is here because amd_iommu_bind_pasid wants a task_struct. + */ + struct task_struct *lead_thread; + + /* We want to receive a notification when the mm_struct is destroyed */ + struct mmu_notifier mmu_notifier; + + /* Use for delayed freeing of kfd_process structure */ + struct rcu_head rcu; + + unsigned int pasid; + + /* + * List of kfd_process_device structures, + * one for each device the process is using. + */ + struct list_head per_device_data; + + struct process_queue_manager pqm; + + /* The process's queues. */ + size_t queue_array_size; + + /* Size is queue_array_size, up to MAX_PROCESS_QUEUES. */ + struct kfd_queue **queues; + + unsigned long allocated_queue_bitmap[DIV_ROUND_UP(KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, BITS_PER_LONG)]; + + /*Is the user space process 32 bit?*/ + bool is_32bit_user_mode; +}; + +void kfd_process_create_wq(void); +void kfd_process_destroy_wq(void); +struct kfd_process *kfd_create_process(const struct task_struct *); +struct kfd_process *kfd_get_process(const struct task_struct *); + +struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, + struct kfd_process *p); +void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid); +struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, + struct kfd_process *p, + int create_pdd); + +/* Process device data iterator */ +struct kfd_process_device *kfd_get_first_process_device_data(struct kfd_process *p); +struct kfd_process_device *kfd_get_next_process_device_data(struct kfd_process *p, + struct kfd_process_device *pdd); +bool kfd_has_process_device_data(struct kfd_process *p); + +/* PASIDs */ +int kfd_pasid_init(void); +void kfd_pasid_exit(void); +bool kfd_set_pasid_limit(unsigned int new_limit); +unsigned int kfd_get_pasid_limit(void); +unsigned int kfd_pasid_alloc(void); +void kfd_pasid_free(unsigned int pasid); + +/* Doorbells */ +void kfd_doorbell_init(struct kfd_dev *kfd); +int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma); +u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, + unsigned int *doorbell_off); +void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr); +u32 read_kernel_doorbell(u32 __iomem *db); +void write_kernel_doorbell(u32 __iomem *db, u32 value); +unsigned int kfd_queue_id_to_doorbell(struct kfd_dev *kfd, + struct kfd_process *process, + unsigned int queue_id); + +extern struct device *kfd_device; + +/* Topology */ +int kfd_topology_init(void); +void kfd_topology_shutdown(void); +int kfd_topology_add_device(struct kfd_dev *gpu); +int kfd_topology_remove_device(struct kfd_dev *gpu); +struct kfd_dev *kfd_device_by_id(uint32_t gpu_id); +struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev); +struct kfd_dev *kfd_topology_enum_kfd_devices(uint8_t idx); + +/* Interrupts */ +int kfd_interrupt_init(struct kfd_dev *dev); +void kfd_interrupt_exit(struct kfd_dev *dev); +void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry); +bool enqueue_ih_ring_entry(struct kfd_dev *kfd, const void *ih_ring_entry); + +/* Power Management */ +void kgd2kfd_suspend(struct kfd_dev *kfd); +int kgd2kfd_resume(struct kfd_dev *kfd); + +/* amdkfd Apertures */ +int kfd_init_apertures(struct kfd_process *process); + +/* Queue Context Management */ +inline uint32_t lower_32(uint64_t x); +inline uint32_t upper_32(uint64_t x); + +int init_queue(struct queue **q, struct queue_properties properties); +void uninit_queue(struct queue *q); +void print_queue_properties(struct queue_properties *q); +void print_queue(struct queue *q); + +struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type, + struct kfd_dev *dev); +struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev); +void device_queue_manager_uninit(struct device_queue_manager *dqm); +struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, + enum kfd_queue_type type); +void kernel_queue_uninit(struct kernel_queue *kq); + +/* Process Queue Manager */ +struct process_queue_node { + struct queue *q; + struct kernel_queue *kq; + struct list_head process_queue_list; +}; + +int pqm_init(struct process_queue_manager *pqm, struct kfd_process *p); +void pqm_uninit(struct process_queue_manager *pqm); +int pqm_create_queue(struct process_queue_manager *pqm, + struct kfd_dev *dev, + struct file *f, + struct queue_properties *properties, + unsigned int flags, + enum kfd_queue_type type, + unsigned int *qid); +int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid); +int pqm_update_queue(struct process_queue_manager *pqm, unsigned int qid, + struct queue_properties *p); + +/* Packet Manager */ + +#define KFD_HIQ_TIMEOUT (500) + +#define KFD_FENCE_COMPLETED (100) +#define KFD_FENCE_INIT (10) +#define KFD_UNMAP_LATENCY (150) + +struct packet_manager { + struct device_queue_manager *dqm; + struct kernel_queue *priv_queue; + struct mutex lock; + bool allocated; + struct kfd_mem_obj *ib_buffer_obj; +}; + +int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm); +void pm_uninit(struct packet_manager *pm); +int pm_send_set_resources(struct packet_manager *pm, + struct scheduling_resources *res); +int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues); +int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address, + uint32_t fence_value); + +int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, + enum kfd_preempt_type_filter mode, + uint32_t filter_param, bool reset, + unsigned int sdma_engine); + +void pm_release_ib(struct packet_manager *pm); + +uint64_t kfd_get_number_elems(struct kfd_dev *kfd); +phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev, + struct kfd_process *process); + +#endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c new file mode 100644 index 000000000000..b85eb0b830b4 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -0,0 +1,410 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <linux/mutex.h> +#include <linux/log2.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/amd-iommu.h> +#include <linux/notifier.h> +struct mm_struct; + +#include "kfd_priv.h" + +/* + * Initial size for the array of queues. + * The allocated size is doubled each time + * it is exceeded up to MAX_PROCESS_QUEUES. + */ +#define INITIAL_QUEUE_ARRAY_SIZE 16 + +/* + * List of struct kfd_process (field kfd_process). + * Unique/indexed by mm_struct* + */ +#define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */ +static DEFINE_HASHTABLE(kfd_processes_table, KFD_PROCESS_TABLE_SIZE); +static DEFINE_MUTEX(kfd_processes_mutex); + +DEFINE_STATIC_SRCU(kfd_processes_srcu); + +static struct workqueue_struct *kfd_process_wq; + +struct kfd_process_release_work { + struct work_struct kfd_work; + struct kfd_process *p; +}; + +static struct kfd_process *find_process(const struct task_struct *thread); +static struct kfd_process *create_process(const struct task_struct *thread); + +void kfd_process_create_wq(void) +{ + if (!kfd_process_wq) + kfd_process_wq = create_workqueue("kfd_process_wq"); +} + +void kfd_process_destroy_wq(void) +{ + if (kfd_process_wq) { + flush_workqueue(kfd_process_wq); + destroy_workqueue(kfd_process_wq); + kfd_process_wq = NULL; + } +} + +struct kfd_process *kfd_create_process(const struct task_struct *thread) +{ + struct kfd_process *process; + + BUG_ON(!kfd_process_wq); + + if (thread->mm == NULL) + return ERR_PTR(-EINVAL); + + /* Only the pthreads threading model is supported. */ + if (thread->group_leader->mm != thread->mm) + return ERR_PTR(-EINVAL); + + /* Take mmap_sem because we call __mmu_notifier_register inside */ + down_write(&thread->mm->mmap_sem); + + /* + * take kfd processes mutex before starting of process creation + * so there won't be a case where two threads of the same process + * create two kfd_process structures + */ + mutex_lock(&kfd_processes_mutex); + + /* A prior open of /dev/kfd could have already created the process. */ + process = find_process(thread); + if (process) + pr_debug("kfd: process already found\n"); + + if (!process) + process = create_process(thread); + + mutex_unlock(&kfd_processes_mutex); + + up_write(&thread->mm->mmap_sem); + + return process; +} + +struct kfd_process *kfd_get_process(const struct task_struct *thread) +{ + struct kfd_process *process; + + if (thread->mm == NULL) + return ERR_PTR(-EINVAL); + + /* Only the pthreads threading model is supported. */ + if (thread->group_leader->mm != thread->mm) + return ERR_PTR(-EINVAL); + + process = find_process(thread); + + return process; +} + +static struct kfd_process *find_process_by_mm(const struct mm_struct *mm) +{ + struct kfd_process *process; + + hash_for_each_possible_rcu(kfd_processes_table, process, + kfd_processes, (uintptr_t)mm) + if (process->mm == mm) + return process; + + return NULL; +} + +static struct kfd_process *find_process(const struct task_struct *thread) +{ + struct kfd_process *p; + int idx; + + idx = srcu_read_lock(&kfd_processes_srcu); + p = find_process_by_mm(thread->mm); + srcu_read_unlock(&kfd_processes_srcu, idx); + + return p; +} + +static void kfd_process_wq_release(struct work_struct *work) +{ + struct kfd_process_release_work *my_work; + struct kfd_process_device *pdd, *temp; + struct kfd_process *p; + + my_work = (struct kfd_process_release_work *) work; + + p = my_work->p; + + mutex_lock(&p->mutex); + + list_for_each_entry_safe(pdd, temp, &p->per_device_data, + per_device_list) { + amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid); + list_del(&pdd->per_device_list); + + kfree(pdd); + } + + kfd_pasid_free(p->pasid); + + mutex_unlock(&p->mutex); + + mutex_destroy(&p->mutex); + + kfree(p->queues); + + kfree(p); + + kfree((void *)work); +} + +static void kfd_process_destroy_delayed(struct rcu_head *rcu) +{ + struct kfd_process_release_work *work; + struct kfd_process *p; + + BUG_ON(!kfd_process_wq); + + p = container_of(rcu, struct kfd_process, rcu); + BUG_ON(atomic_read(&p->mm->mm_count) <= 0); + + mmdrop(p->mm); + + work = (struct kfd_process_release_work *) + kmalloc(sizeof(struct kfd_process_release_work), GFP_ATOMIC); + + if (work) { + INIT_WORK((struct work_struct *) work, kfd_process_wq_release); + work->p = p; + queue_work(kfd_process_wq, (struct work_struct *) work); + } +} + +static void kfd_process_notifier_release(struct mmu_notifier *mn, + struct mm_struct *mm) +{ + struct kfd_process *p; + + /* + * The kfd_process structure can not be free because the + * mmu_notifier srcu is read locked + */ + p = container_of(mn, struct kfd_process, mmu_notifier); + BUG_ON(p->mm != mm); + + mutex_lock(&kfd_processes_mutex); + hash_del_rcu(&p->kfd_processes); + mutex_unlock(&kfd_processes_mutex); + synchronize_srcu(&kfd_processes_srcu); + + mutex_lock(&p->mutex); + + /* In case our notifier is called before IOMMU notifier */ + pqm_uninit(&p->pqm); + + mutex_unlock(&p->mutex); + + /* + * Because we drop mm_count inside kfd_process_destroy_delayed + * and because the mmu_notifier_unregister function also drop + * mm_count we need to take an extra count here. + */ + atomic_inc(&p->mm->mm_count); + mmu_notifier_unregister_no_release(&p->mmu_notifier, p->mm); + mmu_notifier_call_srcu(&p->rcu, &kfd_process_destroy_delayed); +} + +static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = { + .release = kfd_process_notifier_release, +}; + +static struct kfd_process *create_process(const struct task_struct *thread) +{ + struct kfd_process *process; + int err = -ENOMEM; + + process = kzalloc(sizeof(*process), GFP_KERNEL); + + if (!process) + goto err_alloc_process; + + process->queues = kmalloc_array(INITIAL_QUEUE_ARRAY_SIZE, + sizeof(process->queues[0]), GFP_KERNEL); + if (!process->queues) + goto err_alloc_queues; + + process->pasid = kfd_pasid_alloc(); + if (process->pasid == 0) + goto err_alloc_pasid; + + mutex_init(&process->mutex); + + process->mm = thread->mm; + + /* register notifier */ + process->mmu_notifier.ops = &kfd_process_mmu_notifier_ops; + err = __mmu_notifier_register(&process->mmu_notifier, process->mm); + if (err) + goto err_mmu_notifier; + + hash_add_rcu(kfd_processes_table, &process->kfd_processes, + (uintptr_t)process->mm); + + process->lead_thread = thread->group_leader; + + process->queue_array_size = INITIAL_QUEUE_ARRAY_SIZE; + + INIT_LIST_HEAD(&process->per_device_data); + + err = pqm_init(&process->pqm, process); + if (err != 0) + goto err_process_pqm_init; + + return process; + +err_process_pqm_init: + hash_del_rcu(&process->kfd_processes); + synchronize_rcu(); + mmu_notifier_unregister_no_release(&process->mmu_notifier, process->mm); +err_mmu_notifier: + kfd_pasid_free(process->pasid); +err_alloc_pasid: + kfree(process->queues); +err_alloc_queues: + kfree(process); +err_alloc_process: + return ERR_PTR(err); +} + +struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, + struct kfd_process *p, + int create_pdd) +{ + struct kfd_process_device *pdd = NULL; + + list_for_each_entry(pdd, &p->per_device_data, per_device_list) + if (pdd->dev == dev) + return pdd; + + if (create_pdd) { + pdd = kzalloc(sizeof(*pdd), GFP_KERNEL); + if (pdd != NULL) { + pdd->dev = dev; + INIT_LIST_HEAD(&pdd->qpd.queues_list); + INIT_LIST_HEAD(&pdd->qpd.priv_queue_list); + pdd->qpd.dqm = dev->dqm; + list_add(&pdd->per_device_list, &p->per_device_data); + } + } + + return pdd; +} + +/* + * Direct the IOMMU to bind the process (specifically the pasid->mm) + * to the device. + * Unbinding occurs when the process dies or the device is removed. + * + * Assumes that the process lock is held. + */ +struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, + struct kfd_process *p) +{ + struct kfd_process_device *pdd = kfd_get_process_device_data(dev, p, 1); + int err; + + if (pdd == NULL) + return ERR_PTR(-ENOMEM); + + if (pdd->bound) + return pdd; + + err = amd_iommu_bind_pasid(dev->pdev, p->pasid, p->lead_thread); + if (err < 0) + return ERR_PTR(err); + + pdd->bound = true; + + return pdd; +} + +void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid) +{ + struct kfd_process *p; + struct kfd_process_device *pdd; + int idx, i; + + BUG_ON(dev == NULL); + + idx = srcu_read_lock(&kfd_processes_srcu); + + hash_for_each_rcu(kfd_processes_table, i, p, kfd_processes) + if (p->pasid == pasid) + break; + + srcu_read_unlock(&kfd_processes_srcu, idx); + + BUG_ON(p->pasid != pasid); + + mutex_lock(&p->mutex); + + pqm_uninit(&p->pqm); + + pdd = kfd_get_process_device_data(dev, p, 0); + + /* + * Just mark pdd as unbound, because we still need it to call + * amd_iommu_unbind_pasid() in when the process exits. + * We don't call amd_iommu_unbind_pasid() here + * because the IOMMU called us. + */ + if (pdd) + pdd->bound = false; + + mutex_unlock(&p->mutex); +} + +struct kfd_process_device *kfd_get_first_process_device_data(struct kfd_process *p) +{ + return list_first_entry(&p->per_device_data, + struct kfd_process_device, + per_device_list); +} + +struct kfd_process_device *kfd_get_next_process_device_data(struct kfd_process *p, + struct kfd_process_device *pdd) +{ + if (list_is_last(&pdd->per_device_list, &p->per_device_data)) + return NULL; + return list_next_entry(pdd, per_device_list); +} + +bool kfd_has_process_device_data(struct kfd_process *p) +{ + return !(list_empty(&p->per_device_data)); +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c new file mode 100644 index 000000000000..47526780d736 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -0,0 +1,343 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include <linux/slab.h> +#include <linux/list.h> +#include "kfd_device_queue_manager.h" +#include "kfd_priv.h" +#include "kfd_kernel_queue.h" + +static inline struct process_queue_node *get_queue_by_qid( + struct process_queue_manager *pqm, unsigned int qid) +{ + struct process_queue_node *pqn; + + BUG_ON(!pqm); + + list_for_each_entry(pqn, &pqm->queues, process_queue_list) { + if (pqn->q && pqn->q->properties.queue_id == qid) + return pqn; + if (pqn->kq && pqn->kq->queue->properties.queue_id == qid) + return pqn; + } + + return NULL; +} + +static int find_available_queue_slot(struct process_queue_manager *pqm, + unsigned int *qid) +{ + unsigned long found; + + BUG_ON(!pqm || !qid); + + pr_debug("kfd: in %s\n", __func__); + + found = find_first_zero_bit(pqm->queue_slot_bitmap, + max_num_of_queues_per_process); + + pr_debug("kfd: the new slot id %lu\n", found); + + if (found >= max_num_of_queues_per_process) { + pr_info("amdkfd: Can not open more queues for process with pasid %d\n", + pqm->process->pasid); + return -ENOMEM; + } + + set_bit(found, pqm->queue_slot_bitmap); + *qid = found; + + return 0; +} + +int pqm_init(struct process_queue_manager *pqm, struct kfd_process *p) +{ + BUG_ON(!pqm); + + INIT_LIST_HEAD(&pqm->queues); + pqm->queue_slot_bitmap = + kzalloc(DIV_ROUND_UP(max_num_of_queues_per_process, + BITS_PER_BYTE), GFP_KERNEL); + if (pqm->queue_slot_bitmap == NULL) + return -ENOMEM; + pqm->process = p; + + return 0; +} + +void pqm_uninit(struct process_queue_manager *pqm) +{ + int retval; + struct process_queue_node *pqn, *next; + + BUG_ON(!pqm); + + pr_debug("In func %s\n", __func__); + + list_for_each_entry_safe(pqn, next, &pqm->queues, process_queue_list) { + retval = pqm_destroy_queue( + pqm, + (pqn->q != NULL) ? + pqn->q->properties.queue_id : + pqn->kq->queue->properties.queue_id); + + if (retval != 0) { + pr_err("kfd: failed to destroy queue\n"); + return; + } + } + kfree(pqm->queue_slot_bitmap); + pqm->queue_slot_bitmap = NULL; +} + +static int create_cp_queue(struct process_queue_manager *pqm, + struct kfd_dev *dev, struct queue **q, + struct queue_properties *q_properties, + struct file *f, unsigned int qid) +{ + int retval; + + retval = 0; + + /* Doorbell initialized in user space*/ + q_properties->doorbell_ptr = NULL; + + q_properties->doorbell_off = + kfd_queue_id_to_doorbell(dev, pqm->process, qid); + + /* let DQM handle it*/ + q_properties->vmid = 0; + q_properties->queue_id = qid; + q_properties->type = KFD_QUEUE_TYPE_COMPUTE; + + retval = init_queue(q, *q_properties); + if (retval != 0) + goto err_init_queue; + + (*q)->device = dev; + (*q)->process = pqm->process; + + pr_debug("kfd: PQM After init queue"); + + return retval; + +err_init_queue: + return retval; +} + +int pqm_create_queue(struct process_queue_manager *pqm, + struct kfd_dev *dev, + struct file *f, + struct queue_properties *properties, + unsigned int flags, + enum kfd_queue_type type, + unsigned int *qid) +{ + int retval; + struct kfd_process_device *pdd; + struct queue_properties q_properties; + struct queue *q; + struct process_queue_node *pqn; + struct kernel_queue *kq; + + BUG_ON(!pqm || !dev || !properties || !qid); + + memset(&q_properties, 0, sizeof(struct queue_properties)); + memcpy(&q_properties, properties, sizeof(struct queue_properties)); + q = NULL; + kq = NULL; + + pdd = kfd_get_process_device_data(dev, pqm->process, 1); + BUG_ON(!pdd); + + retval = find_available_queue_slot(pqm, qid); + if (retval != 0) + return retval; + + if (list_empty(&pqm->queues)) { + pdd->qpd.pqm = pqm; + dev->dqm->register_process(dev->dqm, &pdd->qpd); + } + + pqn = kzalloc(sizeof(struct process_queue_node), GFP_KERNEL); + if (!pqn) { + retval = -ENOMEM; + goto err_allocate_pqn; + } + + switch (type) { + case KFD_QUEUE_TYPE_COMPUTE: + /* check if there is over subscription */ + if ((sched_policy == KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION) && + ((dev->dqm->processes_count >= VMID_PER_DEVICE) || + (dev->dqm->queue_count >= PIPE_PER_ME_CP_SCHEDULING * QUEUES_PER_PIPE))) { + pr_err("kfd: over-subscription is not allowed in radeon_kfd.sched_policy == 1\n"); + retval = -EPERM; + goto err_create_queue; + } + + retval = create_cp_queue(pqm, dev, &q, &q_properties, f, *qid); + if (retval != 0) + goto err_create_queue; + pqn->q = q; + pqn->kq = NULL; + retval = dev->dqm->create_queue(dev->dqm, q, &pdd->qpd, + &q->properties.vmid); + print_queue(q); + break; + case KFD_QUEUE_TYPE_DIQ: + kq = kernel_queue_init(dev, KFD_QUEUE_TYPE_DIQ); + if (kq == NULL) { + retval = -ENOMEM; + goto err_create_queue; + } + kq->queue->properties.queue_id = *qid; + pqn->kq = kq; + pqn->q = NULL; + retval = dev->dqm->create_kernel_queue(dev->dqm, kq, &pdd->qpd); + break; + default: + BUG(); + break; + } + + if (retval != 0) { + pr_err("kfd: error dqm create queue\n"); + goto err_create_queue; + } + + pr_debug("kfd: PQM After DQM create queue\n"); + + list_add(&pqn->process_queue_list, &pqm->queues); + + if (q) { + *properties = q->properties; + pr_debug("kfd: PQM done creating queue\n"); + print_queue_properties(properties); + } + + return retval; + +err_create_queue: + kfree(pqn); +err_allocate_pqn: + clear_bit(*qid, pqm->queue_slot_bitmap); + return retval; +} + +int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) +{ + struct process_queue_node *pqn; + struct kfd_process_device *pdd; + struct device_queue_manager *dqm; + struct kfd_dev *dev; + int retval; + + dqm = NULL; + + BUG_ON(!pqm); + retval = 0; + + pr_debug("kfd: In Func %s\n", __func__); + + pqn = get_queue_by_qid(pqm, qid); + if (pqn == NULL) { + pr_err("kfd: queue id does not match any known queue\n"); + return -EINVAL; + } + + dev = NULL; + if (pqn->kq) + dev = pqn->kq->dev; + if (pqn->q) + dev = pqn->q->device; + BUG_ON(!dev); + + pdd = kfd_get_process_device_data(dev, pqm->process, 1); + BUG_ON(!pdd); + + if (pqn->kq) { + /* destroy kernel queue (DIQ) */ + dqm = pqn->kq->dev->dqm; + dqm->destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd); + kernel_queue_uninit(pqn->kq); + } + + if (pqn->q) { + dqm = pqn->q->device->dqm; + retval = dqm->destroy_queue(dqm, &pdd->qpd, pqn->q); + if (retval != 0) + return retval; + + uninit_queue(pqn->q); + } + + list_del(&pqn->process_queue_list); + kfree(pqn); + clear_bit(qid, pqm->queue_slot_bitmap); + + if (list_empty(&pqm->queues)) + dqm->unregister_process(dqm, &pdd->qpd); + + return retval; +} + +int pqm_update_queue(struct process_queue_manager *pqm, unsigned int qid, + struct queue_properties *p) +{ + int retval; + struct process_queue_node *pqn; + + BUG_ON(!pqm); + + pqn = get_queue_by_qid(pqm, qid); + BUG_ON(!pqn); + + pqn->q->properties.queue_address = p->queue_address; + pqn->q->properties.queue_size = p->queue_size; + pqn->q->properties.queue_percent = p->queue_percent; + pqn->q->properties.priority = p->priority; + + retval = pqn->q->device->dqm->update_queue(pqn->q->device->dqm, pqn->q); + if (retval != 0) + return retval; + + return 0; +} + +static __attribute__((unused)) struct kernel_queue *pqm_get_kernel_queue( + struct process_queue_manager *pqm, + unsigned int qid) +{ + struct process_queue_node *pqn; + + BUG_ON(!pqm); + + pqn = get_queue_by_qid(pqm, qid); + if (pqn && pqn->kq) + return pqn->kq; + + return NULL; +} + + diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c new file mode 100644 index 000000000000..9a0c90b0702e --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c @@ -0,0 +1,85 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include <linux/slab.h> +#include "kfd_priv.h" + +void print_queue_properties(struct queue_properties *q) +{ + if (!q) + return; + + pr_debug("Printing queue properties:\n"); + pr_debug("Queue Type: %u\n", q->type); + pr_debug("Queue Size: %llu\n", q->queue_size); + pr_debug("Queue percent: %u\n", q->queue_percent); + pr_debug("Queue Address: 0x%llX\n", q->queue_address); + pr_debug("Queue Id: %u\n", q->queue_id); + pr_debug("Queue Process Vmid: %u\n", q->vmid); + pr_debug("Queue Read Pointer: 0x%p\n", q->read_ptr); + pr_debug("Queue Write Pointer: 0x%p\n", q->write_ptr); + pr_debug("Queue Doorbell Pointer: 0x%p\n", q->doorbell_ptr); + pr_debug("Queue Doorbell Offset: %u\n", q->doorbell_off); +} + +void print_queue(struct queue *q) +{ + if (!q) + return; + pr_debug("Printing queue:\n"); + pr_debug("Queue Type: %u\n", q->properties.type); + pr_debug("Queue Size: %llu\n", q->properties.queue_size); + pr_debug("Queue percent: %u\n", q->properties.queue_percent); + pr_debug("Queue Address: 0x%llX\n", q->properties.queue_address); + pr_debug("Queue Id: %u\n", q->properties.queue_id); + pr_debug("Queue Process Vmid: %u\n", q->properties.vmid); + pr_debug("Queue Read Pointer: 0x%p\n", q->properties.read_ptr); + pr_debug("Queue Write Pointer: 0x%p\n", q->properties.write_ptr); + pr_debug("Queue Doorbell Pointer: 0x%p\n", q->properties.doorbell_ptr); + pr_debug("Queue Doorbell Offset: %u\n", q->properties.doorbell_off); + pr_debug("Queue MQD Address: 0x%p\n", q->mqd); + pr_debug("Queue MQD Gart: 0x%llX\n", q->gart_mqd_addr); + pr_debug("Queue Process Address: 0x%p\n", q->process); + pr_debug("Queue Device Address: 0x%p\n", q->device); +} + +int init_queue(struct queue **q, struct queue_properties properties) +{ + struct queue *tmp; + + BUG_ON(!q); + + tmp = kzalloc(sizeof(struct queue), GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + memcpy(&tmp->properties, &properties, sizeof(struct queue_properties)); + + *q = tmp; + return 0; +} + +void uninit_queue(struct queue *q) +{ + kfree(q); +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c new file mode 100644 index 000000000000..5733e2859e8a --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -0,0 +1,1235 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/errno.h> +#include <linux/acpi.h> +#include <linux/hash.h> +#include <linux/cpufreq.h> + +#include "kfd_priv.h" +#include "kfd_crat.h" +#include "kfd_topology.h" + +static struct list_head topology_device_list; +static int topology_crat_parsed; +static struct kfd_system_properties sys_props; + +static DECLARE_RWSEM(topology_lock); + +struct kfd_dev *kfd_device_by_id(uint32_t gpu_id) +{ + struct kfd_topology_device *top_dev; + struct kfd_dev *device = NULL; + + down_read(&topology_lock); + + list_for_each_entry(top_dev, &topology_device_list, list) + if (top_dev->gpu_id == gpu_id) { + device = top_dev->gpu; + break; + } + + up_read(&topology_lock); + + return device; +} + +struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev) +{ + struct kfd_topology_device *top_dev; + struct kfd_dev *device = NULL; + + down_read(&topology_lock); + + list_for_each_entry(top_dev, &topology_device_list, list) + if (top_dev->gpu->pdev == pdev) { + device = top_dev->gpu; + break; + } + + up_read(&topology_lock); + + return device; +} + +static int kfd_topology_get_crat_acpi(void *crat_image, size_t *size) +{ + struct acpi_table_header *crat_table; + acpi_status status; + + if (!size) + return -EINVAL; + + /* + * Fetch the CRAT table from ACPI + */ + status = acpi_get_table(CRAT_SIGNATURE, 0, &crat_table); + if (status == AE_NOT_FOUND) { + pr_warn("CRAT table not found\n"); + return -ENODATA; + } else if (ACPI_FAILURE(status)) { + const char *err = acpi_format_exception(status); + + pr_err("CRAT table error: %s\n", err); + return -EINVAL; + } + + if (*size >= crat_table->length && crat_image != NULL) + memcpy(crat_image, crat_table, crat_table->length); + + *size = crat_table->length; + + return 0; +} + +static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev, + struct crat_subtype_computeunit *cu) +{ + BUG_ON(!dev); + BUG_ON(!cu); + + dev->node_props.cpu_cores_count = cu->num_cpu_cores; + dev->node_props.cpu_core_id_base = cu->processor_id_low; + if (cu->hsa_capability & CRAT_CU_FLAGS_IOMMU_PRESENT) + dev->node_props.capability |= HSA_CAP_ATS_PRESENT; + + pr_info("CU CPU: cores=%d id_base=%d\n", cu->num_cpu_cores, + cu->processor_id_low); +} + +static void kfd_populated_cu_info_gpu(struct kfd_topology_device *dev, + struct crat_subtype_computeunit *cu) +{ + BUG_ON(!dev); + BUG_ON(!cu); + + dev->node_props.simd_id_base = cu->processor_id_low; + dev->node_props.simd_count = cu->num_simd_cores; + dev->node_props.lds_size_in_kb = cu->lds_size_in_kb; + dev->node_props.max_waves_per_simd = cu->max_waves_simd; + dev->node_props.wave_front_size = cu->wave_front_size; + dev->node_props.mem_banks_count = cu->num_banks; + dev->node_props.array_count = cu->num_arrays; + dev->node_props.cu_per_simd_array = cu->num_cu_per_array; + dev->node_props.simd_per_cu = cu->num_simd_per_cu; + dev->node_props.max_slots_scratch_cu = cu->max_slots_scatch_cu; + if (cu->hsa_capability & CRAT_CU_FLAGS_HOT_PLUGGABLE) + dev->node_props.capability |= HSA_CAP_HOT_PLUGGABLE; + pr_info("CU GPU: simds=%d id_base=%d\n", cu->num_simd_cores, + cu->processor_id_low); +} + +/* kfd_parse_subtype_cu is called when the topology mutex is already acquired */ +static int kfd_parse_subtype_cu(struct crat_subtype_computeunit *cu) +{ + struct kfd_topology_device *dev; + int i = 0; + + BUG_ON(!cu); + + pr_info("Found CU entry in CRAT table with proximity_domain=%d caps=%x\n", + cu->proximity_domain, cu->hsa_capability); + list_for_each_entry(dev, &topology_device_list, list) { + if (cu->proximity_domain == i) { + if (cu->flags & CRAT_CU_FLAGS_CPU_PRESENT) + kfd_populated_cu_info_cpu(dev, cu); + + if (cu->flags & CRAT_CU_FLAGS_GPU_PRESENT) + kfd_populated_cu_info_gpu(dev, cu); + break; + } + i++; + } + + return 0; +} + +/* + * kfd_parse_subtype_mem is called when the topology mutex is + * already acquired + */ +static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem) +{ + struct kfd_mem_properties *props; + struct kfd_topology_device *dev; + int i = 0; + + BUG_ON(!mem); + + pr_info("Found memory entry in CRAT table with proximity_domain=%d\n", + mem->promixity_domain); + list_for_each_entry(dev, &topology_device_list, list) { + if (mem->promixity_domain == i) { + props = kfd_alloc_struct(props); + if (props == NULL) + return -ENOMEM; + + if (dev->node_props.cpu_cores_count == 0) + props->heap_type = HSA_MEM_HEAP_TYPE_FB_PRIVATE; + else + props->heap_type = HSA_MEM_HEAP_TYPE_SYSTEM; + + if (mem->flags & CRAT_MEM_FLAGS_HOT_PLUGGABLE) + props->flags |= HSA_MEM_FLAGS_HOT_PLUGGABLE; + if (mem->flags & CRAT_MEM_FLAGS_NON_VOLATILE) + props->flags |= HSA_MEM_FLAGS_NON_VOLATILE; + + props->size_in_bytes = + ((uint64_t)mem->length_high << 32) + + mem->length_low; + props->width = mem->width; + + dev->mem_bank_count++; + list_add_tail(&props->list, &dev->mem_props); + + break; + } + i++; + } + + return 0; +} + +/* + * kfd_parse_subtype_cache is called when the topology mutex + * is already acquired + */ +static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache) +{ + struct kfd_cache_properties *props; + struct kfd_topology_device *dev; + uint32_t id; + + BUG_ON(!cache); + + id = cache->processor_id_low; + + pr_info("Found cache entry in CRAT table with processor_id=%d\n", id); + list_for_each_entry(dev, &topology_device_list, list) + if (id == dev->node_props.cpu_core_id_base || + id == dev->node_props.simd_id_base) { + props = kfd_alloc_struct(props); + if (props == NULL) + return -ENOMEM; + + props->processor_id_low = id; + props->cache_level = cache->cache_level; + props->cache_size = cache->cache_size; + props->cacheline_size = cache->cache_line_size; + props->cachelines_per_tag = cache->lines_per_tag; + props->cache_assoc = cache->associativity; + props->cache_latency = cache->cache_latency; + + if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE) + props->cache_type |= HSA_CACHE_TYPE_DATA; + if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE) + props->cache_type |= HSA_CACHE_TYPE_INSTRUCTION; + if (cache->flags & CRAT_CACHE_FLAGS_CPU_CACHE) + props->cache_type |= HSA_CACHE_TYPE_CPU; + if (cache->flags & CRAT_CACHE_FLAGS_SIMD_CACHE) + props->cache_type |= HSA_CACHE_TYPE_HSACU; + + dev->cache_count++; + dev->node_props.caches_count++; + list_add_tail(&props->list, &dev->cache_props); + + break; + } + + return 0; +} + +/* + * kfd_parse_subtype_iolink is called when the topology mutex + * is already acquired + */ +static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink) +{ + struct kfd_iolink_properties *props; + struct kfd_topology_device *dev; + uint32_t i = 0; + uint32_t id_from; + uint32_t id_to; + + BUG_ON(!iolink); + + id_from = iolink->proximity_domain_from; + id_to = iolink->proximity_domain_to; + + pr_info("Found IO link entry in CRAT table with id_from=%d\n", id_from); + list_for_each_entry(dev, &topology_device_list, list) { + if (id_from == i) { + props = kfd_alloc_struct(props); + if (props == NULL) + return -ENOMEM; + + props->node_from = id_from; + props->node_to = id_to; + props->ver_maj = iolink->version_major; + props->ver_min = iolink->version_minor; + + /* + * weight factor (derived from CDIR), currently always 1 + */ + props->weight = 1; + + props->min_latency = iolink->minimum_latency; + props->max_latency = iolink->maximum_latency; + props->min_bandwidth = iolink->minimum_bandwidth_mbs; + props->max_bandwidth = iolink->maximum_bandwidth_mbs; + props->rec_transfer_size = + iolink->recommended_transfer_size; + + dev->io_link_count++; + dev->node_props.io_links_count++; + list_add_tail(&props->list, &dev->io_link_props); + + break; + } + i++; + } + + return 0; +} + +static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr) +{ + struct crat_subtype_computeunit *cu; + struct crat_subtype_memory *mem; + struct crat_subtype_cache *cache; + struct crat_subtype_iolink *iolink; + int ret = 0; + + BUG_ON(!sub_type_hdr); + + switch (sub_type_hdr->type) { + case CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY: + cu = (struct crat_subtype_computeunit *)sub_type_hdr; + ret = kfd_parse_subtype_cu(cu); + break; + case CRAT_SUBTYPE_MEMORY_AFFINITY: + mem = (struct crat_subtype_memory *)sub_type_hdr; + ret = kfd_parse_subtype_mem(mem); + break; + case CRAT_SUBTYPE_CACHE_AFFINITY: + cache = (struct crat_subtype_cache *)sub_type_hdr; + ret = kfd_parse_subtype_cache(cache); + break; + case CRAT_SUBTYPE_TLB_AFFINITY: + /* + * For now, nothing to do here + */ + pr_info("Found TLB entry in CRAT table (not processing)\n"); + break; + case CRAT_SUBTYPE_CCOMPUTE_AFFINITY: + /* + * For now, nothing to do here + */ + pr_info("Found CCOMPUTE entry in CRAT table (not processing)\n"); + break; + case CRAT_SUBTYPE_IOLINK_AFFINITY: + iolink = (struct crat_subtype_iolink *)sub_type_hdr; + ret = kfd_parse_subtype_iolink(iolink); + break; + default: + pr_warn("Unknown subtype (%d) in CRAT\n", + sub_type_hdr->type); + } + + return ret; +} + +static void kfd_release_topology_device(struct kfd_topology_device *dev) +{ + struct kfd_mem_properties *mem; + struct kfd_cache_properties *cache; + struct kfd_iolink_properties *iolink; + + BUG_ON(!dev); + + list_del(&dev->list); + + while (dev->mem_props.next != &dev->mem_props) { + mem = container_of(dev->mem_props.next, + struct kfd_mem_properties, list); + list_del(&mem->list); + kfree(mem); + } + + while (dev->cache_props.next != &dev->cache_props) { + cache = container_of(dev->cache_props.next, + struct kfd_cache_properties, list); + list_del(&cache->list); + kfree(cache); + } + + while (dev->io_link_props.next != &dev->io_link_props) { + iolink = container_of(dev->io_link_props.next, + struct kfd_iolink_properties, list); + list_del(&iolink->list); + kfree(iolink); + } + + kfree(dev); + + sys_props.num_devices--; +} + +static void kfd_release_live_view(void) +{ + struct kfd_topology_device *dev; + + while (topology_device_list.next != &topology_device_list) { + dev = container_of(topology_device_list.next, + struct kfd_topology_device, list); + kfd_release_topology_device(dev); +} + + memset(&sys_props, 0, sizeof(sys_props)); +} + +static struct kfd_topology_device *kfd_create_topology_device(void) +{ + struct kfd_topology_device *dev; + + dev = kfd_alloc_struct(dev); + if (dev == NULL) { + pr_err("No memory to allocate a topology device"); + return NULL; + } + + INIT_LIST_HEAD(&dev->mem_props); + INIT_LIST_HEAD(&dev->cache_props); + INIT_LIST_HEAD(&dev->io_link_props); + + list_add_tail(&dev->list, &topology_device_list); + sys_props.num_devices++; + + return dev; +} + +static int kfd_parse_crat_table(void *crat_image) +{ + struct kfd_topology_device *top_dev; + struct crat_subtype_generic *sub_type_hdr; + uint16_t node_id; + int ret; + struct crat_header *crat_table = (struct crat_header *)crat_image; + uint16_t num_nodes; + uint32_t image_len; + + if (!crat_image) + return -EINVAL; + + num_nodes = crat_table->num_domains; + image_len = crat_table->length; + + pr_info("Parsing CRAT table with %d nodes\n", num_nodes); + + for (node_id = 0; node_id < num_nodes; node_id++) { + top_dev = kfd_create_topology_device(); + if (!top_dev) { + kfd_release_live_view(); + return -ENOMEM; + } + } + + sys_props.platform_id = + (*((uint64_t *)crat_table->oem_id)) & CRAT_OEMID_64BIT_MASK; + sys_props.platform_oem = *((uint64_t *)crat_table->oem_table_id); + sys_props.platform_rev = crat_table->revision; + + sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1); + while ((char *)sub_type_hdr + sizeof(struct crat_subtype_generic) < + ((char *)crat_image) + image_len) { + if (sub_type_hdr->flags & CRAT_SUBTYPE_FLAGS_ENABLED) { + ret = kfd_parse_subtype(sub_type_hdr); + if (ret != 0) { + kfd_release_live_view(); + return ret; + } + } + + sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + + sub_type_hdr->length); + } + + sys_props.generation_count++; + topology_crat_parsed = 1; + + return 0; +} + + +#define sysfs_show_gen_prop(buffer, fmt, ...) \ + snprintf(buffer, PAGE_SIZE, "%s"fmt, buffer, __VA_ARGS__) +#define sysfs_show_32bit_prop(buffer, name, value) \ + sysfs_show_gen_prop(buffer, "%s %u\n", name, value) +#define sysfs_show_64bit_prop(buffer, name, value) \ + sysfs_show_gen_prop(buffer, "%s %llu\n", name, value) +#define sysfs_show_32bit_val(buffer, value) \ + sysfs_show_gen_prop(buffer, "%u\n", value) +#define sysfs_show_str_val(buffer, value) \ + sysfs_show_gen_prop(buffer, "%s\n", value) + +static ssize_t sysprops_show(struct kobject *kobj, struct attribute *attr, + char *buffer) +{ + ssize_t ret; + + /* Making sure that the buffer is an empty string */ + buffer[0] = 0; + + if (attr == &sys_props.attr_genid) { + ret = sysfs_show_32bit_val(buffer, sys_props.generation_count); + } else if (attr == &sys_props.attr_props) { + sysfs_show_64bit_prop(buffer, "platform_oem", + sys_props.platform_oem); + sysfs_show_64bit_prop(buffer, "platform_id", + sys_props.platform_id); + ret = sysfs_show_64bit_prop(buffer, "platform_rev", + sys_props.platform_rev); + } else { + ret = -EINVAL; + } + + return ret; +} + +static const struct sysfs_ops sysprops_ops = { + .show = sysprops_show, +}; + +static struct kobj_type sysprops_type = { + .sysfs_ops = &sysprops_ops, +}; + +static ssize_t iolink_show(struct kobject *kobj, struct attribute *attr, + char *buffer) +{ + ssize_t ret; + struct kfd_iolink_properties *iolink; + + /* Making sure that the buffer is an empty string */ + buffer[0] = 0; + + iolink = container_of(attr, struct kfd_iolink_properties, attr); + sysfs_show_32bit_prop(buffer, "type", iolink->iolink_type); + sysfs_show_32bit_prop(buffer, "version_major", iolink->ver_maj); + sysfs_show_32bit_prop(buffer, "version_minor", iolink->ver_min); + sysfs_show_32bit_prop(buffer, "node_from", iolink->node_from); + sysfs_show_32bit_prop(buffer, "node_to", iolink->node_to); + sysfs_show_32bit_prop(buffer, "weight", iolink->weight); + sysfs_show_32bit_prop(buffer, "min_latency", iolink->min_latency); + sysfs_show_32bit_prop(buffer, "max_latency", iolink->max_latency); + sysfs_show_32bit_prop(buffer, "min_bandwidth", iolink->min_bandwidth); + sysfs_show_32bit_prop(buffer, "max_bandwidth", iolink->max_bandwidth); + sysfs_show_32bit_prop(buffer, "recommended_transfer_size", + iolink->rec_transfer_size); + ret = sysfs_show_32bit_prop(buffer, "flags", iolink->flags); + + return ret; +} + +static const struct sysfs_ops iolink_ops = { + .show = iolink_show, +}; + +static struct kobj_type iolink_type = { + .sysfs_ops = &iolink_ops, +}; + +static ssize_t mem_show(struct kobject *kobj, struct attribute *attr, + char *buffer) +{ + ssize_t ret; + struct kfd_mem_properties *mem; + + /* Making sure that the buffer is an empty string */ + buffer[0] = 0; + + mem = container_of(attr, struct kfd_mem_properties, attr); + sysfs_show_32bit_prop(buffer, "heap_type", mem->heap_type); + sysfs_show_64bit_prop(buffer, "size_in_bytes", mem->size_in_bytes); + sysfs_show_32bit_prop(buffer, "flags", mem->flags); + sysfs_show_32bit_prop(buffer, "width", mem->width); + ret = sysfs_show_32bit_prop(buffer, "mem_clk_max", mem->mem_clk_max); + + return ret; +} + +static const struct sysfs_ops mem_ops = { + .show = mem_show, +}; + +static struct kobj_type mem_type = { + .sysfs_ops = &mem_ops, +}; + +static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr, + char *buffer) +{ + ssize_t ret; + uint32_t i; + struct kfd_cache_properties *cache; + + /* Making sure that the buffer is an empty string */ + buffer[0] = 0; + + cache = container_of(attr, struct kfd_cache_properties, attr); + sysfs_show_32bit_prop(buffer, "processor_id_low", + cache->processor_id_low); + sysfs_show_32bit_prop(buffer, "level", cache->cache_level); + sysfs_show_32bit_prop(buffer, "size", cache->cache_size); + sysfs_show_32bit_prop(buffer, "cache_line_size", cache->cacheline_size); + sysfs_show_32bit_prop(buffer, "cache_lines_per_tag", + cache->cachelines_per_tag); + sysfs_show_32bit_prop(buffer, "association", cache->cache_assoc); + sysfs_show_32bit_prop(buffer, "latency", cache->cache_latency); + sysfs_show_32bit_prop(buffer, "type", cache->cache_type); + snprintf(buffer, PAGE_SIZE, "%ssibling_map ", buffer); + for (i = 0; i < KFD_TOPOLOGY_CPU_SIBLINGS; i++) + ret = snprintf(buffer, PAGE_SIZE, "%s%d%s", + buffer, cache->sibling_map[i], + (i == KFD_TOPOLOGY_CPU_SIBLINGS-1) ? + "\n" : ","); + + return ret; +} + +static const struct sysfs_ops cache_ops = { + .show = kfd_cache_show, +}; + +static struct kobj_type cache_type = { + .sysfs_ops = &cache_ops, +}; + +static ssize_t node_show(struct kobject *kobj, struct attribute *attr, + char *buffer) +{ + ssize_t ret; + struct kfd_topology_device *dev; + char public_name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE]; + uint32_t i; + + /* Making sure that the buffer is an empty string */ + buffer[0] = 0; + + if (strcmp(attr->name, "gpu_id") == 0) { + dev = container_of(attr, struct kfd_topology_device, + attr_gpuid); + ret = sysfs_show_32bit_val(buffer, dev->gpu_id); + } else if (strcmp(attr->name, "name") == 0) { + dev = container_of(attr, struct kfd_topology_device, + attr_name); + for (i = 0; i < KFD_TOPOLOGY_PUBLIC_NAME_SIZE; i++) { + public_name[i] = + (char)dev->node_props.marketing_name[i]; + if (dev->node_props.marketing_name[i] == 0) + break; + } + public_name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE-1] = 0x0; + ret = sysfs_show_str_val(buffer, public_name); + } else { + dev = container_of(attr, struct kfd_topology_device, + attr_props); + sysfs_show_32bit_prop(buffer, "cpu_cores_count", + dev->node_props.cpu_cores_count); + sysfs_show_32bit_prop(buffer, "simd_count", + dev->node_props.simd_count); + + if (dev->mem_bank_count < dev->node_props.mem_banks_count) { + pr_warn("kfd: mem_banks_count truncated from %d to %d\n", + dev->node_props.mem_banks_count, + dev->mem_bank_count); + sysfs_show_32bit_prop(buffer, "mem_banks_count", + dev->mem_bank_count); + } else { + sysfs_show_32bit_prop(buffer, "mem_banks_count", + dev->node_props.mem_banks_count); + } + + sysfs_show_32bit_prop(buffer, "caches_count", + dev->node_props.caches_count); + sysfs_show_32bit_prop(buffer, "io_links_count", + dev->node_props.io_links_count); + sysfs_show_32bit_prop(buffer, "cpu_core_id_base", + dev->node_props.cpu_core_id_base); + sysfs_show_32bit_prop(buffer, "simd_id_base", + dev->node_props.simd_id_base); + sysfs_show_32bit_prop(buffer, "capability", + dev->node_props.capability); + sysfs_show_32bit_prop(buffer, "max_waves_per_simd", + dev->node_props.max_waves_per_simd); + sysfs_show_32bit_prop(buffer, "lds_size_in_kb", + dev->node_props.lds_size_in_kb); + sysfs_show_32bit_prop(buffer, "gds_size_in_kb", + dev->node_props.gds_size_in_kb); + sysfs_show_32bit_prop(buffer, "wave_front_size", + dev->node_props.wave_front_size); + sysfs_show_32bit_prop(buffer, "array_count", + dev->node_props.array_count); + sysfs_show_32bit_prop(buffer, "simd_arrays_per_engine", + dev->node_props.simd_arrays_per_engine); + sysfs_show_32bit_prop(buffer, "cu_per_simd_array", + dev->node_props.cu_per_simd_array); + sysfs_show_32bit_prop(buffer, "simd_per_cu", + dev->node_props.simd_per_cu); + sysfs_show_32bit_prop(buffer, "max_slots_scratch_cu", + dev->node_props.max_slots_scratch_cu); + sysfs_show_32bit_prop(buffer, "engine_id", + dev->node_props.engine_id); + sysfs_show_32bit_prop(buffer, "vendor_id", + dev->node_props.vendor_id); + sysfs_show_32bit_prop(buffer, "device_id", + dev->node_props.device_id); + sysfs_show_32bit_prop(buffer, "location_id", + dev->node_props.location_id); + + if (dev->gpu) { + sysfs_show_32bit_prop(buffer, "max_engine_clk_fcompute", + kfd2kgd->get_max_engine_clock_in_mhz( + dev->gpu->kgd)); + sysfs_show_64bit_prop(buffer, "local_mem_size", + kfd2kgd->get_vmem_size(dev->gpu->kgd)); + } + + ret = sysfs_show_32bit_prop(buffer, "max_engine_clk_ccompute", + cpufreq_quick_get_max(0)/1000); + } + + return ret; +} + +static const struct sysfs_ops node_ops = { + .show = node_show, +}; + +static struct kobj_type node_type = { + .sysfs_ops = &node_ops, +}; + +static void kfd_remove_sysfs_file(struct kobject *kobj, struct attribute *attr) +{ + sysfs_remove_file(kobj, attr); + kobject_del(kobj); + kobject_put(kobj); +} + +static void kfd_remove_sysfs_node_entry(struct kfd_topology_device *dev) +{ + struct kfd_iolink_properties *iolink; + struct kfd_cache_properties *cache; + struct kfd_mem_properties *mem; + + BUG_ON(!dev); + + if (dev->kobj_iolink) { + list_for_each_entry(iolink, &dev->io_link_props, list) + if (iolink->kobj) { + kfd_remove_sysfs_file(iolink->kobj, + &iolink->attr); + iolink->kobj = NULL; + } + kobject_del(dev->kobj_iolink); + kobject_put(dev->kobj_iolink); + dev->kobj_iolink = NULL; + } + + if (dev->kobj_cache) { + list_for_each_entry(cache, &dev->cache_props, list) + if (cache->kobj) { + kfd_remove_sysfs_file(cache->kobj, + &cache->attr); + cache->kobj = NULL; + } + kobject_del(dev->kobj_cache); + kobject_put(dev->kobj_cache); + dev->kobj_cache = NULL; + } + + if (dev->kobj_mem) { + list_for_each_entry(mem, &dev->mem_props, list) + if (mem->kobj) { + kfd_remove_sysfs_file(mem->kobj, &mem->attr); + mem->kobj = NULL; + } + kobject_del(dev->kobj_mem); + kobject_put(dev->kobj_mem); + dev->kobj_mem = NULL; + } + + if (dev->kobj_node) { + sysfs_remove_file(dev->kobj_node, &dev->attr_gpuid); + sysfs_remove_file(dev->kobj_node, &dev->attr_name); + sysfs_remove_file(dev->kobj_node, &dev->attr_props); + kobject_del(dev->kobj_node); + kobject_put(dev->kobj_node); + dev->kobj_node = NULL; + } +} + +static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, + uint32_t id) +{ + struct kfd_iolink_properties *iolink; + struct kfd_cache_properties *cache; + struct kfd_mem_properties *mem; + int ret; + uint32_t i; + + BUG_ON(!dev); + + /* + * Creating the sysfs folders + */ + BUG_ON(dev->kobj_node); + dev->kobj_node = kfd_alloc_struct(dev->kobj_node); + if (!dev->kobj_node) + return -ENOMEM; + + ret = kobject_init_and_add(dev->kobj_node, &node_type, + sys_props.kobj_nodes, "%d", id); + if (ret < 0) + return ret; + + dev->kobj_mem = kobject_create_and_add("mem_banks", dev->kobj_node); + if (!dev->kobj_mem) + return -ENOMEM; + + dev->kobj_cache = kobject_create_and_add("caches", dev->kobj_node); + if (!dev->kobj_cache) + return -ENOMEM; + + dev->kobj_iolink = kobject_create_and_add("io_links", dev->kobj_node); + if (!dev->kobj_iolink) + return -ENOMEM; + + /* + * Creating sysfs files for node properties + */ + dev->attr_gpuid.name = "gpu_id"; + dev->attr_gpuid.mode = KFD_SYSFS_FILE_MODE; + sysfs_attr_init(&dev->attr_gpuid); + dev->attr_name.name = "name"; + dev->attr_name.mode = KFD_SYSFS_FILE_MODE; + sysfs_attr_init(&dev->attr_name); + dev->attr_props.name = "properties"; + dev->attr_props.mode = KFD_SYSFS_FILE_MODE; + sysfs_attr_init(&dev->attr_props); + ret = sysfs_create_file(dev->kobj_node, &dev->attr_gpuid); + if (ret < 0) + return ret; + ret = sysfs_create_file(dev->kobj_node, &dev->attr_name); + if (ret < 0) + return ret; + ret = sysfs_create_file(dev->kobj_node, &dev->attr_props); + if (ret < 0) + return ret; + + i = 0; + list_for_each_entry(mem, &dev->mem_props, list) { + mem->kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL); + if (!mem->kobj) + return -ENOMEM; + ret = kobject_init_and_add(mem->kobj, &mem_type, + dev->kobj_mem, "%d", i); + if (ret < 0) + return ret; + + mem->attr.name = "properties"; + mem->attr.mode = KFD_SYSFS_FILE_MODE; + sysfs_attr_init(&mem->attr); + ret = sysfs_create_file(mem->kobj, &mem->attr); + if (ret < 0) + return ret; + i++; + } + + i = 0; + list_for_each_entry(cache, &dev->cache_props, list) { + cache->kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL); + if (!cache->kobj) + return -ENOMEM; + ret = kobject_init_and_add(cache->kobj, &cache_type, + dev->kobj_cache, "%d", i); + if (ret < 0) + return ret; + + cache->attr.name = "properties"; + cache->attr.mode = KFD_SYSFS_FILE_MODE; + sysfs_attr_init(&cache->attr); + ret = sysfs_create_file(cache->kobj, &cache->attr); + if (ret < 0) + return ret; + i++; + } + + i = 0; + list_for_each_entry(iolink, &dev->io_link_props, list) { + iolink->kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL); + if (!iolink->kobj) + return -ENOMEM; + ret = kobject_init_and_add(iolink->kobj, &iolink_type, + dev->kobj_iolink, "%d", i); + if (ret < 0) + return ret; + + iolink->attr.name = "properties"; + iolink->attr.mode = KFD_SYSFS_FILE_MODE; + sysfs_attr_init(&iolink->attr); + ret = sysfs_create_file(iolink->kobj, &iolink->attr); + if (ret < 0) + return ret; + i++; +} + + return 0; +} + +static int kfd_build_sysfs_node_tree(void) +{ + struct kfd_topology_device *dev; + int ret; + uint32_t i = 0; + + list_for_each_entry(dev, &topology_device_list, list) { + ret = kfd_build_sysfs_node_entry(dev, 0); + if (ret < 0) + return ret; + i++; + } + + return 0; +} + +static void kfd_remove_sysfs_node_tree(void) +{ + struct kfd_topology_device *dev; + + list_for_each_entry(dev, &topology_device_list, list) + kfd_remove_sysfs_node_entry(dev); +} + +static int kfd_topology_update_sysfs(void) +{ + int ret; + + pr_info("Creating topology SYSFS entries\n"); + if (sys_props.kobj_topology == NULL) { + sys_props.kobj_topology = + kfd_alloc_struct(sys_props.kobj_topology); + if (!sys_props.kobj_topology) + return -ENOMEM; + + ret = kobject_init_and_add(sys_props.kobj_topology, + &sysprops_type, &kfd_device->kobj, + "topology"); + if (ret < 0) + return ret; + + sys_props.kobj_nodes = kobject_create_and_add("nodes", + sys_props.kobj_topology); + if (!sys_props.kobj_nodes) + return -ENOMEM; + + sys_props.attr_genid.name = "generation_id"; + sys_props.attr_genid.mode = KFD_SYSFS_FILE_MODE; + sysfs_attr_init(&sys_props.attr_genid); + ret = sysfs_create_file(sys_props.kobj_topology, + &sys_props.attr_genid); + if (ret < 0) + return ret; + + sys_props.attr_props.name = "system_properties"; + sys_props.attr_props.mode = KFD_SYSFS_FILE_MODE; + sysfs_attr_init(&sys_props.attr_props); + ret = sysfs_create_file(sys_props.kobj_topology, + &sys_props.attr_props); + if (ret < 0) + return ret; + } + + kfd_remove_sysfs_node_tree(); + + return kfd_build_sysfs_node_tree(); +} + +static void kfd_topology_release_sysfs(void) +{ + kfd_remove_sysfs_node_tree(); + if (sys_props.kobj_topology) { + sysfs_remove_file(sys_props.kobj_topology, + &sys_props.attr_genid); + sysfs_remove_file(sys_props.kobj_topology, + &sys_props.attr_props); + if (sys_props.kobj_nodes) { + kobject_del(sys_props.kobj_nodes); + kobject_put(sys_props.kobj_nodes); + sys_props.kobj_nodes = NULL; + } + kobject_del(sys_props.kobj_topology); + kobject_put(sys_props.kobj_topology); + sys_props.kobj_topology = NULL; + } +} + +int kfd_topology_init(void) +{ + void *crat_image = NULL; + size_t image_size = 0; + int ret; + + /* + * Initialize the head for the topology device list + */ + INIT_LIST_HEAD(&topology_device_list); + init_rwsem(&topology_lock); + topology_crat_parsed = 0; + + memset(&sys_props, 0, sizeof(sys_props)); + + /* + * Get the CRAT image from the ACPI + */ + ret = kfd_topology_get_crat_acpi(crat_image, &image_size); + if (ret == 0 && image_size > 0) { + pr_info("Found CRAT image with size=%zd\n", image_size); + crat_image = kmalloc(image_size, GFP_KERNEL); + if (!crat_image) { + ret = -ENOMEM; + pr_err("No memory for allocating CRAT image\n"); + goto err; + } + ret = kfd_topology_get_crat_acpi(crat_image, &image_size); + + if (ret == 0) { + down_write(&topology_lock); + ret = kfd_parse_crat_table(crat_image); + if (ret == 0) + ret = kfd_topology_update_sysfs(); + up_write(&topology_lock); + } else { + pr_err("Couldn't get CRAT table size from ACPI\n"); + } + kfree(crat_image); + } else if (ret == -ENODATA) { + ret = 0; + } else { + pr_err("Couldn't get CRAT table size from ACPI\n"); + } + +err: + pr_info("Finished initializing topology ret=%d\n", ret); + return ret; +} + +void kfd_topology_shutdown(void) +{ + kfd_topology_release_sysfs(); + kfd_release_live_view(); +} + +static void kfd_debug_print_topology(void) +{ + struct kfd_topology_device *dev; + uint32_t i = 0; + + pr_info("DEBUG PRINT OF TOPOLOGY:"); + list_for_each_entry(dev, &topology_device_list, list) { + pr_info("Node: %d\n", i); + pr_info("\tGPU assigned: %s\n", (dev->gpu ? "yes" : "no")); + pr_info("\tCPU count: %d\n", dev->node_props.cpu_cores_count); + pr_info("\tSIMD count: %d", dev->node_props.simd_count); + i++; + } +} + +static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu) +{ + uint32_t hashout; + uint32_t buf[7]; + int i; + + if (!gpu) + return 0; + + buf[0] = gpu->pdev->devfn; + buf[1] = gpu->pdev->subsystem_vendor; + buf[2] = gpu->pdev->subsystem_device; + buf[3] = gpu->pdev->device; + buf[4] = gpu->pdev->bus->number; + buf[5] = (uint32_t)(kfd2kgd->get_vmem_size(gpu->kgd) & 0xffffffff); + buf[6] = (uint32_t)(kfd2kgd->get_vmem_size(gpu->kgd) >> 32); + + for (i = 0, hashout = 0; i < 7; i++) + hashout ^= hash_32(buf[i], KFD_GPU_ID_HASH_WIDTH); + + return hashout; +} + +static struct kfd_topology_device *kfd_assign_gpu(struct kfd_dev *gpu) +{ + struct kfd_topology_device *dev; + struct kfd_topology_device *out_dev = NULL; + + BUG_ON(!gpu); + + list_for_each_entry(dev, &topology_device_list, list) + if (dev->gpu == NULL && dev->node_props.simd_count > 0) { + dev->gpu = gpu; + out_dev = dev; + break; + } + + return out_dev; +} + +static void kfd_notify_gpu_change(uint32_t gpu_id, int arrival) +{ + /* + * TODO: Generate an event for thunk about the arrival/removal + * of the GPU + */ +} + +int kfd_topology_add_device(struct kfd_dev *gpu) +{ + uint32_t gpu_id; + struct kfd_topology_device *dev; + int res; + + BUG_ON(!gpu); + + gpu_id = kfd_generate_gpu_id(gpu); + + pr_debug("kfd: Adding new GPU (ID: 0x%x) to topology\n", gpu_id); + + down_write(&topology_lock); + /* + * Try to assign the GPU to existing topology device (generated from + * CRAT table + */ + dev = kfd_assign_gpu(gpu); + if (!dev) { + pr_info("GPU was not found in the current topology. Extending.\n"); + kfd_debug_print_topology(); + dev = kfd_create_topology_device(); + if (!dev) { + res = -ENOMEM; + goto err; + } + dev->gpu = gpu; + + /* + * TODO: Make a call to retrieve topology information from the + * GPU vBIOS + */ + + /* + * Update the SYSFS tree, since we added another topology device + */ + if (kfd_topology_update_sysfs() < 0) + kfd_topology_release_sysfs(); + + } + + dev->gpu_id = gpu_id; + gpu->id = gpu_id; + dev->node_props.vendor_id = gpu->pdev->vendor; + dev->node_props.device_id = gpu->pdev->device; + dev->node_props.location_id = (gpu->pdev->bus->number << 24) + + (gpu->pdev->devfn & 0xffffff); + /* + * TODO: Retrieve max engine clock values from KGD + */ + + res = 0; + +err: + up_write(&topology_lock); + + if (res == 0) + kfd_notify_gpu_change(gpu_id, 1); + + return res; +} + +int kfd_topology_remove_device(struct kfd_dev *gpu) +{ + struct kfd_topology_device *dev; + uint32_t gpu_id; + int res = -ENODEV; + + BUG_ON(!gpu); + + down_write(&topology_lock); + + list_for_each_entry(dev, &topology_device_list, list) + if (dev->gpu == gpu) { + gpu_id = dev->gpu_id; + kfd_remove_sysfs_node_entry(dev); + kfd_release_topology_device(dev); + res = 0; + if (kfd_topology_update_sysfs() < 0) + kfd_topology_release_sysfs(); + break; + } + + up_write(&topology_lock); + + if (res == 0) + kfd_notify_gpu_change(gpu_id, 0); + + return res; +} + +/* + * When idx is out of bounds, the function will return NULL + */ +struct kfd_dev *kfd_topology_enum_kfd_devices(uint8_t idx) +{ + + struct kfd_topology_device *top_dev; + struct kfd_dev *device = NULL; + uint8_t device_idx = 0; + + down_read(&topology_lock); + + list_for_each_entry(top_dev, &topology_device_list, list) { + if (device_idx == idx) { + device = top_dev->gpu; + break; + } + + device_idx++; + } + + up_read(&topology_lock); + + return device; + +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h new file mode 100644 index 000000000000..989624b3cd14 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h @@ -0,0 +1,168 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef __KFD_TOPOLOGY_H__ +#define __KFD_TOPOLOGY_H__ + +#include <linux/types.h> +#include <linux/list.h> +#include "kfd_priv.h" + +#define KFD_TOPOLOGY_PUBLIC_NAME_SIZE 128 + +#define HSA_CAP_HOT_PLUGGABLE 0x00000001 +#define HSA_CAP_ATS_PRESENT 0x00000002 +#define HSA_CAP_SHARED_WITH_GRAPHICS 0x00000004 +#define HSA_CAP_QUEUE_SIZE_POW2 0x00000008 +#define HSA_CAP_QUEUE_SIZE_32BIT 0x00000010 +#define HSA_CAP_QUEUE_IDLE_EVENT 0x00000020 +#define HSA_CAP_VA_LIMIT 0x00000040 +#define HSA_CAP_WATCH_POINTS_SUPPORTED 0x00000080 +#define HSA_CAP_WATCH_POINTS_TOTALBITS_MASK 0x00000f00 +#define HSA_CAP_WATCH_POINTS_TOTALBITS_SHIFT 8 +#define HSA_CAP_RESERVED 0xfffff000 + +struct kfd_node_properties { + uint32_t cpu_cores_count; + uint32_t simd_count; + uint32_t mem_banks_count; + uint32_t caches_count; + uint32_t io_links_count; + uint32_t cpu_core_id_base; + uint32_t simd_id_base; + uint32_t capability; + uint32_t max_waves_per_simd; + uint32_t lds_size_in_kb; + uint32_t gds_size_in_kb; + uint32_t wave_front_size; + uint32_t array_count; + uint32_t simd_arrays_per_engine; + uint32_t cu_per_simd_array; + uint32_t simd_per_cu; + uint32_t max_slots_scratch_cu; + uint32_t engine_id; + uint32_t vendor_id; + uint32_t device_id; + uint32_t location_id; + uint32_t max_engine_clk_fcompute; + uint32_t max_engine_clk_ccompute; + uint16_t marketing_name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE]; +}; + +#define HSA_MEM_HEAP_TYPE_SYSTEM 0 +#define HSA_MEM_HEAP_TYPE_FB_PUBLIC 1 +#define HSA_MEM_HEAP_TYPE_FB_PRIVATE 2 +#define HSA_MEM_HEAP_TYPE_GPU_GDS 3 +#define HSA_MEM_HEAP_TYPE_GPU_LDS 4 +#define HSA_MEM_HEAP_TYPE_GPU_SCRATCH 5 + +#define HSA_MEM_FLAGS_HOT_PLUGGABLE 0x00000001 +#define HSA_MEM_FLAGS_NON_VOLATILE 0x00000002 +#define HSA_MEM_FLAGS_RESERVED 0xfffffffc + +struct kfd_mem_properties { + struct list_head list; + uint32_t heap_type; + uint64_t size_in_bytes; + uint32_t flags; + uint32_t width; + uint32_t mem_clk_max; + struct kobject *kobj; + struct attribute attr; +}; + +#define KFD_TOPOLOGY_CPU_SIBLINGS 256 + +#define HSA_CACHE_TYPE_DATA 0x00000001 +#define HSA_CACHE_TYPE_INSTRUCTION 0x00000002 +#define HSA_CACHE_TYPE_CPU 0x00000004 +#define HSA_CACHE_TYPE_HSACU 0x00000008 +#define HSA_CACHE_TYPE_RESERVED 0xfffffff0 + +struct kfd_cache_properties { + struct list_head list; + uint32_t processor_id_low; + uint32_t cache_level; + uint32_t cache_size; + uint32_t cacheline_size; + uint32_t cachelines_per_tag; + uint32_t cache_assoc; + uint32_t cache_latency; + uint32_t cache_type; + uint8_t sibling_map[KFD_TOPOLOGY_CPU_SIBLINGS]; + struct kobject *kobj; + struct attribute attr; +}; + +struct kfd_iolink_properties { + struct list_head list; + uint32_t iolink_type; + uint32_t ver_maj; + uint32_t ver_min; + uint32_t node_from; + uint32_t node_to; + uint32_t weight; + uint32_t min_latency; + uint32_t max_latency; + uint32_t min_bandwidth; + uint32_t max_bandwidth; + uint32_t rec_transfer_size; + uint32_t flags; + struct kobject *kobj; + struct attribute attr; +}; + +struct kfd_topology_device { + struct list_head list; + uint32_t gpu_id; + struct kfd_node_properties node_props; + uint32_t mem_bank_count; + struct list_head mem_props; + uint32_t cache_count; + struct list_head cache_props; + uint32_t io_link_count; + struct list_head io_link_props; + struct kfd_dev *gpu; + struct kobject *kobj_node; + struct kobject *kobj_mem; + struct kobject *kobj_cache; + struct kobject *kobj_iolink; + struct attribute attr_gpuid; + struct attribute attr_name; + struct attribute attr_props; +}; + +struct kfd_system_properties { + uint32_t num_devices; /* Number of H-NUMA nodes */ + uint32_t generation_count; + uint64_t platform_oem; + uint64_t platform_id; + uint64_t platform_rev; + struct kobject *kobj_topology; + struct kobject *kobj_nodes; + struct attribute attr_genid; + struct attribute attr_props; +}; + + + +#endif /* __KFD_TOPOLOGY_H__ */ |