// SPDX-License-Identifier: GPL-2.0

/*
 * Copyright 2024 HabanaLabs, Ltd.
 * All Rights Reserved.
 */

#include "habanalabs.h"
#include "hldio.h"

#include <linux/blkdev.h>
#include <linux/file.h>
#include <linux/pci-p2pdma.h>
#include <linux/uio.h>
#include <linux/vmalloc.h>

/*
 * NVMe Direct I/O implementation for the habanalabs driver
 *
 * ASSUMPTIONS
 * ===========
 * 1. No IOMMU (well, technically it can work with an IOMMU, but then it is
 *    almost useless).
 * 2. Only READ operations (can be extended in the future).
 * 3. No sparse files (can be overcome in the future).
 * 4. Kernel version >= 6.9.
 * 5. Requiring page alignment is OK (I don't see a solution to this one right
 *    now; how do we read partial pages?).
 * 6. Kernel compiled with CONFIG_PCI_P2PDMA. This requires a CUSTOM kernel.
 *    Theoretically I have a slight idea of how this could be solved, but it
 *    is probably unacceptable for upstream. Also, it may not work in the end.
 * 7. Either make sure our cards and disks are under the same PCI bridge, or
 *    compile a custom kernel to hack around this.
 */

#define IO_STABILIZE_TIMEOUT	10000000	/* 10 seconds in microseconds */

/*
 * This struct contains all the useful data I could milk out of the file
 * handle provided by the user.
 * @TODO: right now it is retrieved on each IO, but it could be done once with
 * a dedicated IOCTL, call it for example HL_REGISTER_HANDLE.
 */
struct hl_dio_fd {
	/* Back pointer in case we need it in async completion */
	struct hl_ctx *ctx;
	/* Associated fd struct */
	struct file *filp;
};

/*
 * This is a single IO descriptor
 */
struct hl_direct_io {
	struct hl_dio_fd f;
	struct kiocb kio;
	struct bio_vec *bv;
	struct iov_iter iter;
	u64 device_va;
	u64 off_bytes;
	u64 len_bytes;
	u32 type;
};

bool hl_device_supports_nvme(struct hl_device *hdev)
{
	return hdev->asic_prop.supports_nvme;
}

static int hl_dio_fd_register(struct hl_ctx *ctx, int fd, struct hl_dio_fd *f)
{
	struct hl_device *hdev = ctx->hdev;
	struct block_device *bd;
	struct super_block *sb;
	struct inode *inode;
	struct gendisk *gd;
	struct device *disk_dev;
	int rc;

	f->filp = fget(fd);
	if (!f->filp) {
		rc = -ENOENT;
		goto out;
	}

	if (!(f->filp->f_flags & O_DIRECT)) {
		dev_err(hdev->dev, "file is not opened with O_DIRECT\n");
		rc = -EINVAL;
		goto fput;
	}

	if (!f->filp->f_op->read_iter) {
		dev_err(hdev->dev, "read_iter is not supported, need to fall back to legacy IO\n");
		rc = -EINVAL;
		goto fput;
	}

	inode = file_inode(f->filp);
	sb = inode->i_sb;
	bd = sb->s_bdev;

	/* i_blocks is counted in 512-byte units */
	if (inode->i_blocks << 9 < i_size_read(inode)) {
		dev_err(hdev->dev, "sparse files are not currently supported\n");
		rc = -EINVAL;
		goto fput;
	}

	if (!bd) {
		dev_err(hdev->dev, "invalid block device\n");
		rc = -ENODEV;
		goto fput;
	}

	gd = bd->bd_disk;
	if (!gd) {
		dev_err(hdev->dev, "invalid block device disk\n");
		rc = -ENODEV;
		goto fput;
	}

	/* Get the underlying device from the block device */
	disk_dev = disk_to_dev(gd);
	if (!dma_pci_p2pdma_supported(disk_dev)) {
		dev_err(hdev->dev, "device does not support PCI P2P DMA\n");
		rc = -EOPNOTSUPP;
		goto fput;
	}

	/*
	 * @TODO: Maybe we need additional checks here
	 */

	f->ctx = ctx;
	rc = 0;
	goto out;

fput:
	fput(f->filp);
out:
	return rc;
}

static void hl_dio_fd_unregister(struct hl_dio_fd *f)
{
	fput(f->filp);
}
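/*
 * Illustrative userspace usage, assuming the checks in hl_dio_fd_register()
 * above: the file has to be opened with O_DIRECT on a block device that
 * supports PCI P2P DMA, must not be sparse, and (see hl_dio_validate_io()
 * below) the device VA, offset and length must all be page aligned. How the
 * resulting fd, device VA, offset and length reach hl_dio_ssd2hl() (the exact
 * ioctl number and argument struct) is not defined in this file, so the
 * submit_ssd2hl() call below is only a placeholder:
 *
 *	int fd = open("/mnt/nvme/dataset.bin", O_RDONLY | O_DIRECT);
 *
 *	if (fd < 0)
 *		return -errno;
 *
 *	// Placeholder for whatever uAPI ends up calling hl_dio_ssd2hl():
 *	// fd, a page aligned device VA, a page aligned offset and length.
 *	rc = submit_ssd2hl(fd, device_va, 0, len_bytes);
 */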
static long hl_dio_count_io(struct hl_device *hdev)
{
	s64 sum = 0;
	int i;

	for_each_possible_cpu(i)
		sum += per_cpu(*hdev->hldio.inflight_ios, i);

	return sum;
}

static bool hl_dio_get_iopath(struct hl_ctx *ctx)
{
	struct hl_device *hdev = ctx->hdev;

	if (hdev->hldio.io_enabled) {
		this_cpu_inc(*hdev->hldio.inflight_ios);

		/* Avoid racing with hl_dio_stop() disabling IO */
		if (!hdev->hldio.io_enabled) {
			this_cpu_dec(*hdev->hldio.inflight_ios);
			return false;
		}

		hl_ctx_get(ctx);

		return true;
	}

	return false;
}

static void hl_dio_put_iopath(struct hl_ctx *ctx)
{
	struct hl_device *hdev = ctx->hdev;

	hl_ctx_put(ctx);
	this_cpu_dec(*hdev->hldio.inflight_ios);
}

static void hl_dio_set_io_enabled(struct hl_device *hdev, bool enabled)
{
	hdev->hldio.io_enabled = enabled;
}

static bool hl_dio_validate_io(struct hl_device *hdev, struct hl_direct_io *io)
{
	if ((u64)io->device_va & ~PAGE_MASK) {
		dev_dbg(hdev->dev, "device address must be 4K aligned\n");
		return false;
	}

	if (io->len_bytes & ~PAGE_MASK) {
		dev_dbg(hdev->dev, "IO length must be 4K aligned\n");
		return false;
	}

	if (io->off_bytes & ~PAGE_MASK) {
		dev_dbg(hdev->dev, "IO offset must be 4K aligned\n");
		return false;
	}

	return true;
}

static struct page *hl_dio_va2page(struct hl_device *hdev, struct hl_ctx *ctx, u64 device_va)
{
	struct hl_dio *hldio = &hdev->hldio;
	u64 device_pa;
	int rc, i;

	rc = hl_mmu_va_to_pa(ctx, device_va, &device_pa);
	if (rc) {
		dev_err(hdev->dev, "device virtual address translation error: %#llx (%d)\n",
			device_va, rc);
		return NULL;
	}

	for (i = 0 ; i < hldio->np2prs ; ++i) {
		if (device_pa >= hldio->p2prs[i].device_pa &&
		    device_pa < hldio->p2prs[i].device_pa + hldio->p2prs[i].size)
			return hldio->p2prs[i].p2ppages[(device_pa - hldio->p2prs[i].device_pa) >>
							PAGE_SHIFT];
	}

	return NULL;
}

static ssize_t hl_direct_io(struct hl_device *hdev, struct hl_direct_io *io)
{
	u64 npages, device_va;
	ssize_t rc;
	int i;

	if (!hl_dio_validate_io(hdev, io))
		return -EINVAL;

	if (!hl_dio_get_iopath(io->f.ctx)) {
		dev_info(hdev->dev, "can't schedule a new IO, IO is disabled\n");
		return -ESHUTDOWN;
	}

	init_sync_kiocb(&io->kio, io->f.filp);
	io->kio.ki_pos = io->off_bytes;

	npages = io->len_bytes >> PAGE_SHIFT;

	/*
	 * @TODO: this can be implemented smarter, vmalloc in the iopath is not
	 * ideal. Maybe some variation of genpool. The number of pages may
	 * differ greatly, so maybe even use pools of different sizes and
	 * choose the closest one.
	 */
	io->bv = vzalloc(npages * sizeof(struct bio_vec));
	if (!io->bv) {
		rc = -ENOMEM;
		goto put_iopath;
	}

	for (i = 0, device_va = io->device_va ; i < npages ; ++i, device_va += PAGE_SIZE) {
		io->bv[i].bv_page = hl_dio_va2page(hdev, io->f.ctx, device_va);
		if (!io->bv[i].bv_page) {
			dev_err(hdev->dev, "error getting page struct for device va %#llx\n",
				device_va);
			rc = -EFAULT;
			goto cleanup;
		}

		io->bv[i].bv_offset = 0;
		io->bv[i].bv_len = PAGE_SIZE;
	}

	iov_iter_bvec(&io->iter, io->type, io->bv, npages, io->len_bytes);

	if (io->f.filp->f_op && io->f.filp->f_op->read_iter)
		rc = io->f.filp->f_op->read_iter(&io->kio, &io->iter);
	else
		rc = -EINVAL;

cleanup:
	vfree(io->bv);
put_iopath:
	hl_dio_put_iopath(io->f.ctx);

	dev_dbg(hdev->dev, "IO ended with %zd\n", rc);

	return rc;
}

/*
 * @TODO: This function can be used as a callback for IO completion under
 * kio->ki_complete in order to implement async IO.
 * Note that older kernels also passed a ret2 argument to this callback.
 */
__maybe_unused static void hl_direct_io_complete(struct kiocb *kio, long ret)
{
	struct hl_direct_io *io = container_of(kio, struct hl_direct_io, kio);

	dev_dbg(io->f.ctx->hdev->dev, "IO completed with %ld\n", ret);

	/* Do something to copy the result to the user / notify completion */

	hl_dio_put_iopath(io->f.ctx);
	hl_dio_fd_unregister(&io->f);
}
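/*
 * A minimal sketch of an async submission path, assuming the same bio_vec
 * and iov_iter setup as in hl_direct_io() above, and that hl_dio_get_iopath()
 * has already succeeded. It is not part of this file and only illustrates how
 * kio->ki_complete could be wired up so that hl_direct_io_complete() (instead
 * of the submitter) releases the iopath and fd references;
 * hl_direct_io_submit_async() is a hypothetical name, and freeing io->bv is
 * left out of the sketch.
 */
#if 0
static ssize_t hl_direct_io_submit_async(struct hl_device *hdev, struct hl_direct_io *io)
{
	ssize_t rc;

	init_sync_kiocb(&io->kio, io->f.filp);
	io->kio.ki_pos = io->off_bytes;
	/* A non-NULL ki_complete marks the kiocb as asynchronous */
	io->kio.ki_complete = hl_direct_io_complete;

	/* ... build io->bv and io->iter exactly as hl_direct_io() does ... */

	rc = io->f.filp->f_op->read_iter(&io->kio, &io->iter);
	if (rc != -EIOCBQUEUED) {
		/* Completed (or failed) synchronously, run the completion inline */
		hl_direct_io_complete(&io->kio, rc);
	}

	return rc;
}
#endif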
/*
 * DMA disk to ASIC, wait for results.
 * Must be invoked from the user context.
 */
int hl_dio_ssd2hl(struct hl_device *hdev, struct hl_ctx *ctx, int fd, u64 device_va,
		  off_t off_bytes, size_t len_bytes, size_t *len_read)
{
	struct hl_direct_io *io;
	ssize_t rc;

	dev_dbg(hdev->dev, "SSD2HL fd=%d va=%#llx len=%#zx\n", fd, device_va, len_bytes);

	io = kzalloc(sizeof(*io), GFP_KERNEL);
	if (!io) {
		rc = -ENOMEM;
		goto out;
	}

	*io = (struct hl_direct_io) {
		.device_va = device_va,
		.len_bytes = len_bytes,
		.off_bytes = off_bytes,
		.type = READ,
	};

	rc = hl_dio_fd_register(ctx, fd, &io->f);
	if (rc)
		goto kfree_io;

	rc = hl_direct_io(hdev, io);
	if (rc >= 0) {
		*len_read = rc;
		rc = 0;
	}

	/* This shall be called only in the case of a sync IO */
	hl_dio_fd_unregister(&io->f);

kfree_io:
	kfree(io);
out:
	return rc;
}

static void hl_p2p_region_fini(struct hl_device *hdev, struct hl_p2p_region *p2pr)
{
	if (p2pr->p2ppages) {
		vfree(p2pr->p2ppages);
		p2pr->p2ppages = NULL;
	}

	if (p2pr->p2pmem) {
		dev_dbg(hdev->dev, "freeing P2P mem from %p, size=%#llx\n",
			p2pr->p2pmem, p2pr->size);
		pci_free_p2pmem(hdev->pdev, p2pr->p2pmem, p2pr->size);
		p2pr->p2pmem = NULL;
	}
}

void hl_p2p_region_fini_all(struct hl_device *hdev)
{
	int i;

	for (i = 0 ; i < hdev->hldio.np2prs ; ++i)
		hl_p2p_region_fini(hdev, &hdev->hldio.p2prs[i]);

	kvfree(hdev->hldio.p2prs);
	hdev->hldio.p2prs = NULL;
	hdev->hldio.np2prs = 0;
}

int hl_p2p_region_init(struct hl_device *hdev, struct hl_p2p_region *p2pr)
{
	void *addr;
	int rc, i;

	/* Start by publishing our p2p memory */
	rc = pci_p2pdma_add_resource(hdev->pdev, p2pr->bar, p2pr->size, p2pr->bar_offset);
	if (rc) {
		dev_err(hdev->dev, "error adding p2p resource: %d\n", rc);
		goto err;
	}

	/* Alloc all p2p mem */
	p2pr->p2pmem = pci_alloc_p2pmem(hdev->pdev, p2pr->size);
	if (!p2pr->p2pmem) {
		dev_err(hdev->dev, "error allocating p2p memory\n");
		rc = -ENOMEM;
		goto err;
	}

	p2pr->p2ppages = vmalloc((p2pr->size >> PAGE_SHIFT) * sizeof(struct page *));
	if (!p2pr->p2ppages) {
		rc = -ENOMEM;
		goto err;
	}

	for (i = 0, addr = p2pr->p2pmem ; i < (p2pr->size >> PAGE_SHIFT) ; ++i, addr += PAGE_SIZE) {
		p2pr->p2ppages[i] = virt_to_page(addr);
		if (!p2pr->p2ppages[i]) {
			rc = -EFAULT;
			goto err;
		}
	}

	return 0;

err:
	hl_p2p_region_fini(hdev, p2pr);
	return rc;
}

int hl_dio_start(struct hl_device *hdev)
{
	dev_dbg(hdev->dev, "initializing HLDIO\n");

	/* Initialize the IO counter and enable IO */
	hdev->hldio.inflight_ios = alloc_percpu(s64);
	if (!hdev->hldio.inflight_ios)
		return -ENOMEM;

	hl_dio_set_io_enabled(hdev, true);

	return 0;
}

void hl_dio_stop(struct hl_device *hdev)
{
	dev_dbg(hdev->dev, "deinitializing HLDIO\n");

	if (hdev->hldio.io_enabled) {
		/* Wait for all in-flight IO to finish */
		hl_dio_set_io_enabled(hdev, false);
		hl_poll_timeout_condition(hdev, !hl_dio_count_io(hdev), 1000,
					  IO_STABILIZE_TIMEOUT);
	}

	if (hdev->hldio.inflight_ios) {
		free_percpu(hdev->hldio.inflight_ios);
		hdev->hldio.inflight_ios = NULL;
	}
}
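/*
 * A minimal sketch of how an ASIC-specific init flow might publish a P2P
 * region and enable direct I/O, assuming only the hl_p2p_region fields used
 * in this file (bar, bar_offset, size, device_pa). The function name and the
 * BAR/size/address values are placeholders, not taken from any real ASIC.
 */
#if 0
static int example_asic_dio_init(struct hl_device *hdev)
{
	struct hl_dio *hldio = &hdev->hldio;
	int rc;

	hldio->p2prs = kvcalloc(1, sizeof(*hldio->p2prs), GFP_KERNEL);
	if (!hldio->p2prs)
		return -ENOMEM;

	hldio->np2prs = 1;
	hldio->p2prs[0] = (struct hl_p2p_region) {
		.bar = 4,		/* placeholder: BAR exposing device memory */
		.bar_offset = 0,	/* placeholder: offset of the P2P window in the BAR */
		.size = 0x40000000ull,	/* placeholder: 1 GB window */
		.device_pa = 0,		/* placeholder: device physical base of the window */
	};

	rc = hl_p2p_region_init(hdev, &hldio->p2prs[0]);
	if (rc)
		goto fini;

	rc = hl_dio_start(hdev);
	if (rc)
		goto fini;

	return 0;

fini:
	hl_p2p_region_fini_all(hdev);
	return rc;
}
#endif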