diff options
Diffstat (limited to 'drivers/nvme/host/ioctl.c')
| -rw-r--r-- | drivers/nvme/host/ioctl.c | 342 |
1 files changed, 118 insertions, 224 deletions
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index d8ff796fd5f2..a9c097dacad6 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -3,9 +3,10 @@ * Copyright (c) 2011-2014, Intel Corporation. * Copyright (c) 2017-2021 Christoph Hellwig. */ +#include <linux/blk-integrity.h> #include <linux/ptrace.h> /* for force_successful_syscall_return */ #include <linux/nvme_ioctl.h> -#include <linux/io_uring.h> +#include <linux/io_uring/cmd.h> #include "nvme.h" enum { @@ -18,15 +19,12 @@ static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c, { u32 effects; - if (capable(CAP_SYS_ADMIN)) - return true; - /* * Do not allow unprivileged passthrough on partitions, as that allows an * escape from the containment of the partition. */ if (flags & NVME_IOCTL_PARTITION) - return false; + goto admin; /* * Do not allow unprivileged processes to send vendor specific or fabrics @@ -34,7 +32,7 @@ static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c, */ if (c->common.opcode >= nvme_cmd_vendor_start || c->common.opcode == nvme_fabrics_command) - return false; + goto admin; /* * Do not allow unprivileged passthrough of admin commands except @@ -53,7 +51,7 @@ static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c, return true; } } - return false; + goto admin; } /* @@ -63,7 +61,7 @@ static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c, */ effects = nvme_command_effects(ns->ctrl, ns, c->common.opcode); if (!(effects & NVME_CMD_EFFECTS_CSUPP)) - return false; + goto admin; /* * Don't allow passthrough for command that have intrusive (or unknown) @@ -72,16 +70,20 @@ static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c, if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_UUID_SEL | NVME_CMD_EFFECTS_SCOPE_MASK)) - return false; + goto admin; /* * Only allow I/O commands that transfer data to the controller or that * change the logical block contents if the file descriptor is open for * writing. */ - if (nvme_is_write(c) || (effects & NVME_CMD_EFFECTS_LBCC)) - return open_for_write; + if ((nvme_is_write(c) || (effects & NVME_CMD_EFFECTS_LBCC)) && + !open_for_write) + goto admin; + return true; +admin: + return capable(CAP_SYS_ADMIN); } /* @@ -96,54 +98,6 @@ static void __user *nvme_to_user_ptr(uintptr_t ptrval) return (void __user *)ptrval; } -static void *nvme_add_user_metadata(struct request *req, void __user *ubuf, - unsigned len, u32 seed) -{ - struct bio_integrity_payload *bip; - int ret = -ENOMEM; - void *buf; - struct bio *bio = req->bio; - - buf = kmalloc(len, GFP_KERNEL); - if (!buf) - goto out; - - ret = -EFAULT; - if ((req_op(req) == REQ_OP_DRV_OUT) && copy_from_user(buf, ubuf, len)) - goto out_free_meta; - - bip = bio_integrity_alloc(bio, GFP_KERNEL, 1); - if (IS_ERR(bip)) { - ret = PTR_ERR(bip); - goto out_free_meta; - } - - bip->bip_iter.bi_sector = seed; - ret = bio_integrity_add_page(bio, virt_to_page(buf), len, - offset_in_page(buf)); - if (ret != len) { - ret = -ENOMEM; - goto out_free_meta; - } - - req->cmd_flags |= REQ_INTEGRITY; - return buf; -out_free_meta: - kfree(buf); -out: - return ERR_PTR(ret); -} - -static int nvme_finish_user_metadata(struct request *req, void __user *ubuf, - void *meta, unsigned len, int ret) -{ - if (!ret && req_op(req) == REQ_OP_DRV_IN && - copy_to_user(ubuf, meta, len)) - ret = -EFAULT; - kfree(meta); - return ret; -} - static struct request *nvme_alloc_user_request(struct request_queue *q, struct nvme_command *cmd, blk_opf_t rq_flags, blk_mq_req_flags_t blk_flags) @@ -160,47 +114,41 @@ static struct request *nvme_alloc_user_request(struct request_queue *q, static int nvme_map_user_request(struct request *req, u64 ubuffer, unsigned bufflen, void __user *meta_buffer, unsigned meta_len, - u32 meta_seed, void **metap, struct io_uring_cmd *ioucmd, - unsigned int flags) + struct iov_iter *iter, unsigned int flags) { struct request_queue *q = req->q; struct nvme_ns *ns = q->queuedata; struct block_device *bdev = ns ? ns->disk->part0 : NULL; + bool supports_metadata = bdev && blk_get_integrity(bdev->bd_disk); + struct nvme_ctrl *ctrl = nvme_req(req)->ctrl; + bool has_metadata = meta_buffer && meta_len; struct bio *bio = NULL; - void *meta = NULL; int ret; - if (ioucmd && (ioucmd->flags & IORING_URING_CMD_FIXED)) { - struct iov_iter iter; - - /* fixedbufs is only for non-vectored io */ - if (WARN_ON_ONCE(flags & NVME_IOCTL_VEC)) + if (!nvme_ctrl_sgl_supported(ctrl)) + dev_warn_once(ctrl->device, "using unchecked data buffer\n"); + if (has_metadata) { + if (!supports_metadata) return -EINVAL; - ret = io_uring_cmd_import_fixed(ubuffer, bufflen, - rq_data_dir(req), &iter, ioucmd); - if (ret < 0) - goto out; - ret = blk_rq_map_user_iov(q, req, NULL, &iter, GFP_KERNEL); - } else { + + if (!nvme_ctrl_meta_sgl_supported(ctrl)) + dev_warn_once(ctrl->device, + "using unchecked metadata buffer\n"); + } + + if (iter) + ret = blk_rq_map_user_iov(q, req, NULL, iter, GFP_KERNEL); + else ret = blk_rq_map_user_io(req, NULL, nvme_to_user_ptr(ubuffer), bufflen, GFP_KERNEL, flags & NVME_IOCTL_VEC, 0, 0, rq_data_dir(req)); - } - if (ret) - goto out; - bio = req->bio; - if (bdev) - bio_set_dev(bio, bdev); - - if (bdev && meta_buffer && meta_len) { - meta = nvme_add_user_metadata(req, meta_buffer, meta_len, - meta_seed); - if (IS_ERR(meta)) { - ret = PTR_ERR(meta); + return ret; + + if (has_metadata) { + ret = blk_rq_integrity_map_user(req, meta_buffer, meta_len); + if (ret) goto out_unmap; - } - *metap = meta; } return ret; @@ -208,20 +156,17 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer, out_unmap: if (bio) blk_rq_unmap_user(bio); -out: - blk_mq_free_request(req); return ret; } static int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, u64 ubuffer, unsigned bufflen, - void __user *meta_buffer, unsigned meta_len, u32 meta_seed, + void __user *meta_buffer, unsigned meta_len, u64 *result, unsigned timeout, unsigned int flags) { struct nvme_ns *ns = q->queuedata; struct nvme_ctrl *ctrl; struct request *req; - void *meta = NULL; struct bio *bio; u32 effects; int ret; @@ -233,9 +178,9 @@ static int nvme_submit_user_cmd(struct request_queue *q, req->timeout = timeout; if (ubuffer && bufflen) { ret = nvme_map_user_request(req, ubuffer, bufflen, meta_buffer, - meta_len, meta_seed, &meta, NULL, flags); + meta_len, NULL, flags); if (ret) - return ret; + goto out_free_req; } bio = req->bio; @@ -245,16 +190,16 @@ static int nvme_submit_user_cmd(struct request_queue *q, ret = nvme_execute_rq(req, false); if (result) *result = le64_to_cpu(nvme_req(req)->result.u64); - if (meta) - ret = nvme_finish_user_metadata(req, meta_buffer, meta, - meta_len, ret); if (bio) blk_rq_unmap_user(bio); blk_mq_free_request(req); if (effects) nvme_passthru_end(ctrl, ns, effects, cmd, ret); + return ret; +out_free_req: + blk_mq_free_request(req); return ret; } @@ -279,10 +224,10 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) return -EINVAL; } - length = (io.nblocks + 1) << ns->lba_shift; + length = (io.nblocks + 1) << ns->head->lba_shift; if ((io.control & NVME_RW_PRINFO_PRACT) && - ns->ms == sizeof(struct t10_pi_tuple)) { + (ns->head->ms == ns->head->pi_size)) { /* * Protection information is stripped/inserted by the * controller. @@ -292,11 +237,11 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) meta_len = 0; metadata = NULL; } else { - meta_len = (io.nblocks + 1) * ns->ms; + meta_len = (io.nblocks + 1) * ns->head->ms; metadata = nvme_to_user_ptr(io.metadata); } - if (ns->features & NVME_NS_EXT_LBAS) { + if (ns->head->features & NVME_NS_EXT_LBAS) { length += meta_len; meta_len = 0; } else if (meta_len) { @@ -313,11 +258,11 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) c.rw.control = cpu_to_le16(io.control); c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); c.rw.reftag = cpu_to_le32(io.reftag); - c.rw.apptag = cpu_to_le16(io.apptag); - c.rw.appmask = cpu_to_le16(io.appmask); + c.rw.lbat = cpu_to_le16(io.apptag); + c.rw.lbatm = cpu_to_le16(io.appmask); return nvme_submit_user_cmd(ns->queue, &c, io.addr, length, metadata, - meta_len, lower_32_bits(io.slba), NULL, 0, 0); + meta_len, NULL, 0, 0); } static bool nvme_validate_passthru_nsid(struct nvme_ctrl *ctrl, @@ -325,8 +270,7 @@ static bool nvme_validate_passthru_nsid(struct nvme_ctrl *ctrl, { if (ns && nsid != ns->head->ns_id) { dev_err(ctrl->device, - "%s: nsid (%u) in cmd does not match nsid (%u)" - "of namespace\n", + "%s: nsid (%u) in cmd does not match nsid (%u) of namespace\n", current->comm, nsid, ns->head->ns_id); return false; } @@ -372,7 +316,7 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, cmd.addr, cmd.data_len, nvme_to_user_ptr(cmd.metadata), - cmd.metadata_len, 0, &result, timeout, 0); + cmd.metadata_len, &result, timeout, 0); if (status >= 0) { if (put_user(result, &ucmd->result)) @@ -419,7 +363,7 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, cmd.addr, cmd.data_len, nvme_to_user_ptr(cmd.metadata), - cmd.metadata_len, 0, &cmd.result, timeout, flags); + cmd.metadata_len, &cmd.result, timeout, flags); if (status >= 0) { if (put_user(cmd.result, &ucmd->result)) @@ -442,61 +386,27 @@ struct nvme_uring_data { * Expect build errors if this grows larger than that. */ struct nvme_uring_cmd_pdu { - union { - struct bio *bio; - struct request *req; - }; - u32 meta_len; - u32 nvme_status; - union { - struct { - void *meta; /* kernel-resident buffer */ - void __user *meta_buffer; - }; - u64 result; - } u; + struct request *req; + struct bio *bio; + u64 result; + int status; }; static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu( struct io_uring_cmd *ioucmd) { - return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu; -} - -static void nvme_uring_task_meta_cb(struct io_uring_cmd *ioucmd, - unsigned issue_flags) -{ - struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); - struct request *req = pdu->req; - int status; - u64 result; - - if (nvme_req(req)->flags & NVME_REQ_CANCELLED) - status = -EINTR; - else - status = nvme_req(req)->status; - - result = le64_to_cpu(nvme_req(req)->result.u64); - - if (pdu->meta_len) - status = nvme_finish_user_metadata(req, pdu->u.meta_buffer, - pdu->u.meta, pdu->meta_len, status); - if (req->bio) - blk_rq_unmap_user(req->bio); - blk_mq_free_request(req); - - io_uring_cmd_done(ioucmd, status, result, issue_flags); + return io_uring_cmd_to_pdu(ioucmd, struct nvme_uring_cmd_pdu); } -static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd, - unsigned issue_flags) +static void nvme_uring_task_cb(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_uring_cmd *ioucmd = io_uring_cmd_from_tw(tw_req); struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); if (pdu->bio) blk_rq_unmap_user(pdu->bio); - - io_uring_cmd_done(ioucmd, pdu->nvme_status, pdu->u.result, issue_flags); + io_uring_cmd_done32(ioucmd, pdu->status, pdu->result, + IO_URING_CMD_TASK_WORK_ISSUE_FLAGS); } static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, @@ -505,48 +415,25 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, struct io_uring_cmd *ioucmd = req->end_io_data; struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); - req->bio = pdu->bio; - if (nvme_req(req)->flags & NVME_REQ_CANCELLED) - pdu->nvme_status = -EINTR; - else - pdu->nvme_status = nvme_req(req)->status; - pdu->u.result = le64_to_cpu(nvme_req(req)->result.u64); - - /* - * For iopoll, complete it directly. - * Otherwise, move the completion to task work. - */ - if (blk_rq_is_poll(req)) { - WRITE_ONCE(ioucmd->cookie, NULL); - nvme_uring_task_cb(ioucmd, IO_URING_F_UNLOCKED); + if (nvme_req(req)->flags & NVME_REQ_CANCELLED) { + pdu->status = -EINTR; } else { - io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb); + pdu->status = nvme_req(req)->status; + if (!pdu->status) + pdu->status = blk_status_to_errno(err); } - - return RQ_END_IO_FREE; -} - -static enum rq_end_io_ret nvme_uring_cmd_end_io_meta(struct request *req, - blk_status_t err) -{ - struct io_uring_cmd *ioucmd = req->end_io_data; - struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); - - req->bio = pdu->bio; - pdu->req = req; + pdu->result = le64_to_cpu(nvme_req(req)->result.u64); /* - * For iopoll, complete it directly. - * Otherwise, move the completion to task work. + * IOPOLL could potentially complete this request directly, but + * if multiple rings are polling on the same queue, then it's possible + * for one ring to find completions for another ring. Punting the + * completion via task_work will always direct it to the right + * location, rather than potentially complete requests for ringA + * under iopoll invocations from ringB. */ - if (blk_rq_is_poll(req)) { - WRITE_ONCE(ioucmd->cookie, NULL); - nvme_uring_task_meta_cb(ioucmd, IO_URING_F_UNLOCKED); - } else { - io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_meta_cb); - } - - return RQ_END_IO_NONE; + io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb); + return RQ_END_IO_FREE; } static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, @@ -557,10 +444,11 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, struct request_queue *q = ns ? ns->queue : ctrl->admin_q; struct nvme_uring_data d; struct nvme_command c; + struct iov_iter iter; + struct iov_iter *map_iter = NULL; struct request *req; - blk_opf_t rq_flags = REQ_ALLOC_CACHE; + blk_opf_t rq_flags = 0; blk_mq_req_flags_t blk_flags = 0; - void *meta = NULL; int ret; c.common.opcode = READ_ONCE(cmd->opcode); @@ -593,6 +481,22 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, d.metadata_len = READ_ONCE(cmd->metadata_len); d.timeout_ms = READ_ONCE(cmd->timeout_ms); + if (d.data_len && (ioucmd->flags & IORING_URING_CMD_FIXED)) { + int ddir = nvme_is_write(&c) ? WRITE : READ; + + if (vec) + ret = io_uring_cmd_import_fixed_vec(ioucmd, + u64_to_user_ptr(d.addr), d.data_len, + ddir, &iter, issue_flags); + else + ret = io_uring_cmd_import_fixed(d.addr, d.data_len, + ddir, &iter, ioucmd, issue_flags); + if (ret < 0) + return ret; + + map_iter = &iter; + } + if (issue_flags & IO_URING_F_NONBLOCK) { rq_flags |= REQ_NOWAIT; blk_flags = BLK_MQ_REQ_NOWAIT; @@ -605,32 +509,25 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, return PTR_ERR(req); req->timeout = d.timeout_ms ? msecs_to_jiffies(d.timeout_ms) : 0; - if (d.addr && d.data_len) { - ret = nvme_map_user_request(req, d.addr, - d.data_len, nvme_to_user_ptr(d.metadata), - d.metadata_len, 0, &meta, ioucmd, vec); + if (d.data_len) { + ret = nvme_map_user_request(req, d.addr, d.data_len, + nvme_to_user_ptr(d.metadata), d.metadata_len, + map_iter, vec ? NVME_IOCTL_VEC : 0); if (ret) - return ret; - } - - if (blk_rq_is_poll(req)) { - ioucmd->flags |= IORING_URING_CMD_POLLED; - WRITE_ONCE(ioucmd->cookie, req); + goto out_free_req; } /* to free bio on completion, as req->bio will be null at that time */ pdu->bio = req->bio; - pdu->meta_len = d.metadata_len; + pdu->req = req; req->end_io_data = ioucmd; - if (pdu->meta_len) { - pdu->u.meta = meta; - pdu->u.meta_buffer = nvme_to_user_ptr(d.metadata); - req->end_io = nvme_uring_cmd_end_io_meta; - } else { - req->end_io = nvme_uring_cmd_end_io; - } + req->end_io = nvme_uring_cmd_end_io; blk_execute_rq_nowait(req, false); return -EIOCBQUEUED; + +out_free_req: + blk_mq_free_request(req); + return ret; } static bool is_ctrl_ioctl(unsigned int cmd) @@ -747,8 +644,6 @@ static int nvme_ns_uring_cmd(struct nvme_ns *ns, struct io_uring_cmd *ioucmd, struct nvme_ctrl *ctrl = ns->ctrl; int ret; - BUILD_BUG_ON(sizeof(struct nvme_uring_cmd_pdu) > sizeof(ioucmd->pdu)); - ret = nvme_uring_cmd_checks(issue_flags); if (ret) return ret; @@ -779,16 +674,12 @@ int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd, struct io_comp_batch *iob, unsigned int poll_flags) { - struct request *req; - int ret = 0; - - if (!(ioucmd->flags & IORING_URING_CMD_POLLED)) - return 0; + struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); + struct request *req = pdu->req; - req = READ_ONCE(ioucmd->cookie); if (req && blk_rq_is_poll(req)) - ret = blk_rq_poll(req, iob, poll_flags); - return ret; + return blk_rq_poll(req, iob, poll_flags); + return 0; } #ifdef CONFIG_NVME_MULTIPATH static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, @@ -827,7 +718,7 @@ int nvme_ns_head_ioctl(struct block_device *bdev, blk_mode_t mode, /* * Handle ioctls that apply to the controller instead of the namespace - * seperately and drop the ns SRCU reference early. This avoids a + * separately and drop the ns SRCU reference early. This avoids a * deadlock when deleting namespaces using the passthrough interface. */ if (is_ctrl_ioctl(cmd)) @@ -913,15 +804,15 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp, bool open_for_write) { struct nvme_ns *ns; - int ret; + int ret, srcu_idx; - down_read(&ctrl->namespaces_rwsem); + srcu_idx = srcu_read_lock(&ctrl->srcu); if (list_empty(&ctrl->namespaces)) { ret = -ENOTTY; goto out_unlock; } - ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list); + ns = list_first_or_null_rcu(&ctrl->namespaces, struct nvme_ns, list); if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) { dev_warn(ctrl->device, "NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n"); @@ -931,15 +822,18 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp, dev_warn(ctrl->device, "using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n"); - kref_get(&ns->kref); - up_read(&ctrl->namespaces_rwsem); + if (!nvme_get_ns(ns)) { + ret = -ENXIO; + goto out_unlock; + } + srcu_read_unlock(&ctrl->srcu, srcu_idx); ret = nvme_user_cmd(ctrl, ns, argp, 0, open_for_write); nvme_put_ns(ns); return ret; out_unlock: - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); return ret; } |
