diff options
Diffstat (limited to 'drivers/accel/habanalabs/common/habanalabs_drv.c')
| -rw-r--r-- | drivers/accel/habanalabs/common/habanalabs_drv.c | 241 |
1 files changed, 138 insertions, 103 deletions
diff --git a/drivers/accel/habanalabs/common/habanalabs_drv.c b/drivers/accel/habanalabs/common/habanalabs_drv.c index d9df64e75f33..0035748f3228 100644 --- a/drivers/accel/habanalabs/common/habanalabs_drv.c +++ b/drivers/accel/habanalabs/common/habanalabs_drv.c @@ -13,6 +13,12 @@ #include <linux/pci.h> #include <linux/module.h> +#include <linux/vmalloc.h> +#include <linux/version.h> + +#include <drm/drm_accel.h> +#include <drm/drm_drv.h> +#include <drm/drm_ioctl.h> #define CREATE_TRACE_POINTS #include <trace/events/habanalabs.h> @@ -26,7 +32,6 @@ MODULE_DESCRIPTION(HL_DRIVER_DESC); MODULE_LICENSE("GPL v2"); static int hl_major; -static struct class *hl_class; static DEFINE_IDR(hl_devs_idr); static DEFINE_MUTEX(hl_devs_idr_lock); @@ -54,8 +59,6 @@ module_param(boot_error_status_mask, ulong, 0444); MODULE_PARM_DESC(boot_error_status_mask, "Mask of the error status during device CPU boot (If bitX is cleared then error X is masked. Default all 1's)"); -#define PCI_VENDOR_ID_HABANALABS 0x1da3 - #define PCI_IDS_GOYA 0x0001 #define PCI_IDS_GAUDI 0x1000 #define PCI_IDS_GAUDI_SEC 0x1010 @@ -71,6 +74,41 @@ static const struct pci_device_id ids[] = { }; MODULE_DEVICE_TABLE(pci, ids); +static const struct drm_ioctl_desc hl_drm_ioctls[] = { + DRM_IOCTL_DEF_DRV(HL_INFO, hl_info_ioctl, 0), + DRM_IOCTL_DEF_DRV(HL_CB, hl_cb_ioctl, 0), + DRM_IOCTL_DEF_DRV(HL_CS, hl_cs_ioctl, 0), + DRM_IOCTL_DEF_DRV(HL_WAIT_CS, hl_wait_ioctl, 0), + DRM_IOCTL_DEF_DRV(HL_MEMORY, hl_mem_ioctl, 0), + DRM_IOCTL_DEF_DRV(HL_DEBUG, hl_debug_ioctl, 0), +}; + +static const struct file_operations hl_fops = { + .owner = THIS_MODULE, + .open = accel_open, + .release = drm_release, + .unlocked_ioctl = drm_ioctl, + .compat_ioctl = drm_compat_ioctl, + .llseek = noop_llseek, + .mmap = hl_mmap +}; + +static const struct drm_driver hl_driver = { + .driver_features = DRIVER_COMPUTE_ACCEL, + + .name = HL_NAME, + .desc = HL_DRIVER_DESC, + .major = LINUX_VERSION_MAJOR, + .minor = LINUX_VERSION_PATCHLEVEL, + .patchlevel = LINUX_VERSION_SUBLEVEL, + + .fops = &hl_fops, + .open = hl_device_open, + .postclose = hl_device_release, + .ioctls = hl_drm_ioctls, + .num_ioctls = ARRAY_SIZE(hl_drm_ioctls) +}; + /* * get_asic_type - translate device id to asic type * @@ -102,6 +140,12 @@ static enum hl_asic_type get_asic_type(struct hl_device *hdev) case REV_ID_B: asic_type = ASIC_GAUDI2B; break; + case REV_ID_C: + asic_type = ASIC_GAUDI2C; + break; + case REV_ID_D: + asic_type = ASIC_GAUDI2D; + break; default: break; } @@ -124,43 +168,28 @@ static bool is_asic_secured(enum hl_asic_type asic_type) } /* - * hl_device_open - open function for habanalabs device - * - * @inode: pointer to inode structure - * @filp: pointer to file structure + * hl_device_open() - open function for habanalabs device. + * @ddev: pointer to DRM device structure. + * @file: pointer to DRM file private data structure. * * Called when process opens an habanalabs device. */ -int hl_device_open(struct inode *inode, struct file *filp) +int hl_device_open(struct drm_device *ddev, struct drm_file *file_priv) { + struct hl_device *hdev = to_hl_device(ddev); enum hl_device_status status; - struct hl_device *hdev; struct hl_fpriv *hpriv; int rc; - mutex_lock(&hl_devs_idr_lock); - hdev = idr_find(&hl_devs_idr, iminor(inode)); - mutex_unlock(&hl_devs_idr_lock); - - if (!hdev) { - pr_err("Couldn't find device %d:%d\n", - imajor(inode), iminor(inode)); - return -ENXIO; - } - hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL); if (!hpriv) return -ENOMEM; hpriv->hdev = hdev; - filp->private_data = hpriv; - hpriv->filp = filp; - mutex_init(&hpriv->notifier_event.lock); mutex_init(&hpriv->restore_phase_mutex); mutex_init(&hpriv->ctx_lock); kref_init(&hpriv->refcount); - nonseekable_open(inode, filp); hl_ctx_mgr_init(&hpriv->ctx_mgr); hl_mem_mgr_init(hpriv->hdev->dev, &hpriv->mem_mgr); @@ -220,22 +249,22 @@ int hl_device_open(struct inode *inode, struct file *filp) hl_debugfs_add_file(hpriv); - memset(&hdev->captured_err_info, 0, sizeof(hdev->captured_err_info)); - atomic_set(&hdev->captured_err_info.cs_timeout.write_enable, 1); - hdev->captured_err_info.undef_opcode.write_enable = true; + hl_enable_err_info_capture(&hdev->captured_err_info); hdev->open_counter++; hdev->last_successful_open_jif = jiffies; hdev->last_successful_open_ktime = ktime_get(); + file_priv->driver_priv = hpriv; + hpriv->file_priv = file_priv; + return 0; out_err: mutex_unlock(&hdev->fpriv_list_lock); - hl_mem_mgr_fini(&hpriv->mem_mgr); + hl_mem_mgr_fini(&hpriv->mem_mgr, NULL); hl_mem_mgr_idr_destroy(&hpriv->mem_mgr); hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr); - filp->private_data = NULL; mutex_destroy(&hpriv->ctx_lock); mutex_destroy(&hpriv->restore_phase_mutex); mutex_destroy(&hpriv->notifier_event.lock); @@ -271,9 +300,7 @@ int hl_device_open_ctrl(struct inode *inode, struct file *filp) */ hpriv->hdev = hdev; filp->private_data = hpriv; - hpriv->filp = filp; - mutex_init(&hpriv->notifier_event.lock); nonseekable_open(inode, filp); hpriv->taskpid = get_task_pid(current, PIDTYPE_PID); @@ -307,7 +334,6 @@ static void set_driver_behavior_per_device(struct hl_device *hdev) { hdev->nic_ports_mask = 0; hdev->fw_components = FW_TYPE_ALL_TYPES; - hdev->mmu_enable = MMU_EN_ALL; hdev->cpu_queues_enable = 1; hdev->pldm = 0; hdev->hard_reset_on_fw_events = 1; @@ -321,7 +347,6 @@ static void copy_kernel_module_params_to_device(struct hl_device *hdev) hdev->asic_prop.fw_security_enabled = is_asic_secured(hdev->asic_type); hdev->major = hl_major; - hdev->hclass = hl_class; hdev->memory_scrub = memory_scrub; hdev->reset_on_lockup = reset_on_lockup; hdev->boot_error_status_mask = boot_error_status_mask; @@ -336,8 +361,7 @@ static void fixup_device_params_per_asic(struct hl_device *hdev, int timeout) * a different default timeout for Gaudi */ if (timeout == HL_DEFAULT_TIMEOUT_LOCKED) - hdev->timeout_jiffies = msecs_to_jiffies(GAUDI_DEFAULT_TIMEOUT_LOCKED * - MSEC_PER_SEC); + hdev->timeout_jiffies = secs_to_jiffies(GAUDI_DEFAULT_TIMEOUT_LOCKED); hdev->reset_upon_device_release = 0; break; @@ -362,7 +386,7 @@ static int fixup_device_params(struct hl_device *hdev) hdev->fw_comms_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC; if (tmp_timeout) - hdev->timeout_jiffies = msecs_to_jiffies(tmp_timeout * MSEC_PER_SEC); + hdev->timeout_jiffies = secs_to_jiffies(tmp_timeout); else hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT; @@ -382,12 +406,36 @@ static int fixup_device_params(struct hl_device *hdev) /* If CPU queues not enabled, no way to do heartbeat */ if (!hdev->cpu_queues_enable) hdev->heartbeat = 0; - fixup_device_params_per_asic(hdev, tmp_timeout); return 0; } +static int allocate_device_id(struct hl_device *hdev) +{ + int id; + + mutex_lock(&hl_devs_idr_lock); + id = idr_alloc(&hl_devs_idr, hdev, 0, HL_MAX_MINORS, GFP_KERNEL); + mutex_unlock(&hl_devs_idr_lock); + + if (id < 0) { + if (id == -ENOSPC) + pr_err("too many devices in the system\n"); + return -EBUSY; + } + + hdev->id = id; + + /* + * Firstly initialized with the internal device ID. + * Will be updated later after the DRM device registration to hold the minor ID. + */ + hdev->cdev_idx = hdev->id; + + return 0; +} + /** * create_hdev - create habanalabs device instance * @@ -400,27 +448,29 @@ static int fixup_device_params(struct hl_device *hdev) */ static int create_hdev(struct hl_device **dev, struct pci_dev *pdev) { - int main_id, ctrl_id = 0, rc = 0; struct hl_device *hdev; + int rc; *dev = NULL; - hdev = kzalloc(sizeof(*hdev), GFP_KERNEL); - if (!hdev) - return -ENOMEM; + hdev = devm_drm_dev_alloc(&pdev->dev, &hl_driver, struct hl_device, drm); + if (IS_ERR(hdev)) + return PTR_ERR(hdev); + + hdev->dev = hdev->drm.dev; /* Will be NULL in case of simulator device */ hdev->pdev = pdev; /* Assign status description string */ - strncpy(hdev->status[HL_DEVICE_STATUS_OPERATIONAL], "operational", HL_STR_MAX); - strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET], "in reset", HL_STR_MAX); - strncpy(hdev->status[HL_DEVICE_STATUS_MALFUNCTION], "disabled", HL_STR_MAX); - strncpy(hdev->status[HL_DEVICE_STATUS_NEEDS_RESET], "needs reset", HL_STR_MAX); - strncpy(hdev->status[HL_DEVICE_STATUS_IN_DEVICE_CREATION], - "in device creation", HL_STR_MAX); - strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE], - "in reset after device release", HL_STR_MAX); + strscpy(hdev->status[HL_DEVICE_STATUS_OPERATIONAL], "operational", HL_STR_MAX); + strscpy(hdev->status[HL_DEVICE_STATUS_IN_RESET], "in reset", HL_STR_MAX); + strscpy(hdev->status[HL_DEVICE_STATUS_MALFUNCTION], "disabled", HL_STR_MAX); + strscpy(hdev->status[HL_DEVICE_STATUS_NEEDS_RESET], "needs reset", HL_STR_MAX); + strscpy(hdev->status[HL_DEVICE_STATUS_IN_DEVICE_CREATION], + "in device creation", HL_STR_MAX); + strscpy(hdev->status[HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE], + "in reset after device release", HL_STR_MAX); /* First, we must find out which ASIC are we handling. This is needed @@ -430,7 +480,7 @@ static int create_hdev(struct hl_device **dev, struct pci_dev *pdev) if (hdev->asic_type == ASIC_INVALID) { dev_err(&pdev->dev, "Unsupported ASIC\n"); rc = -ENODEV; - goto free_hdev; + goto out_err; } copy_kernel_module_params_to_device(hdev); @@ -439,42 +489,15 @@ static int create_hdev(struct hl_device **dev, struct pci_dev *pdev) fixup_device_params(hdev); - mutex_lock(&hl_devs_idr_lock); - - /* Always save 2 numbers, 1 for main device and 1 for control. - * They must be consecutive - */ - main_id = idr_alloc(&hl_devs_idr, hdev, 0, HL_MAX_MINORS, GFP_KERNEL); - - if (main_id >= 0) - ctrl_id = idr_alloc(&hl_devs_idr, hdev, main_id + 1, - main_id + 2, GFP_KERNEL); - - mutex_unlock(&hl_devs_idr_lock); - - if ((main_id < 0) || (ctrl_id < 0)) { - if ((main_id == -ENOSPC) || (ctrl_id == -ENOSPC)) - pr_err("too many devices in the system\n"); - - if (main_id >= 0) { - mutex_lock(&hl_devs_idr_lock); - idr_remove(&hl_devs_idr, main_id); - mutex_unlock(&hl_devs_idr_lock); - } - - rc = -EBUSY; - goto free_hdev; - } - - hdev->id = main_id; - hdev->id_control = ctrl_id; + rc = allocate_device_id(hdev); + if (rc) + goto out_err; *dev = hdev; return 0; -free_hdev: - kfree(hdev); +out_err: return rc; } @@ -489,10 +512,8 @@ static void destroy_hdev(struct hl_device *hdev) /* Remove device from the device list */ mutex_lock(&hl_devs_idr_lock); idr_remove(&hl_devs_idr, hdev->id); - idr_remove(&hl_devs_idr, hdev->id_control); mutex_unlock(&hl_devs_idr_lock); - kfree(hdev); } static int hl_pmops_suspend(struct device *dev) @@ -653,6 +674,38 @@ static pci_ers_result_t hl_pci_err_slot_reset(struct pci_dev *pdev) return PCI_ERS_RESULT_RECOVERED; } +static void hl_pci_reset_prepare(struct pci_dev *pdev) +{ + struct hl_device *hdev; + + hdev = pci_get_drvdata(pdev); + if (!hdev) + return; + + hdev->disabled = true; +} + +static void hl_pci_reset_done(struct pci_dev *pdev) +{ + struct hl_device *hdev; + u32 flags; + + hdev = pci_get_drvdata(pdev); + if (!hdev) + return; + + /* + * Schedule a thread to trigger hard reset. + * The reason for this handler, is for rare cases where the driver is up + * and FLR occurs. This is valid only when working with no VM, so FW handles FLR + * and resets the device. FW will go back preboot stage, so driver needs to perform + * hard reset in order to load FW fit again. + */ + flags = HL_DRV_RESET_HARD | HL_DRV_RESET_BYPASS_REQ_TO_FW; + + hl_device_reset(hdev, flags); +} + static const struct dev_pm_ops hl_pm_ops = { .suspend = hl_pmops_suspend, .resume = hl_pmops_resume, @@ -662,6 +715,8 @@ static const struct pci_error_handlers hl_pci_err_handler = { .error_detected = hl_pci_err_detected, .slot_reset = hl_pci_err_slot_reset, .resume = hl_pci_err_resume, + .reset_prepare = hl_pci_reset_prepare, + .reset_done = hl_pci_reset_done, }; static struct pci_driver hl_pci_driver = { @@ -696,28 +751,16 @@ static int __init hl_init(void) hl_major = MAJOR(dev); - hl_class = class_create(HL_NAME); - if (IS_ERR(hl_class)) { - pr_err("failed to allocate class\n"); - rc = PTR_ERR(hl_class); - goto remove_major; - } - - hl_debugfs_init(); - rc = pci_register_driver(&hl_pci_driver); if (rc) { pr_err("failed to register pci device\n"); - goto remove_debugfs; + goto remove_major; } pr_debug("driver loaded\n"); return 0; -remove_debugfs: - hl_debugfs_fini(); - class_destroy(hl_class); remove_major: unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS); return rc; @@ -730,14 +773,6 @@ static void __exit hl_exit(void) { pci_unregister_driver(&hl_pci_driver); - /* - * Removing debugfs must be after all devices or simulator devices - * have been removed because otherwise we get a bug in the - * debugfs module for referencing NULL objects - */ - hl_debugfs_fini(); - - class_destroy(hl_class); unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS); idr_destroy(&hl_devs_idr); |
