diff options
Diffstat (limited to 'drivers/block/ublk_drv.c')
| -rw-r--r-- | drivers/block/ublk_drv.c | 3410 |
1 files changed, 2575 insertions, 835 deletions
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index f96cb01e9604..2c715df63f23 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -36,35 +36,76 @@ #include <linux/sched/mm.h> #include <linux/uaccess.h> #include <linux/cdev.h> -#include <linux/io_uring.h> +#include <linux/io_uring/cmd.h> #include <linux/blk-mq.h> #include <linux/delay.h> #include <linux/mm.h> #include <asm/page.h> #include <linux/task_work.h> +#include <linux/namei.h> +#include <linux/kref.h> #include <uapi/linux/ublk_cmd.h> #define UBLK_MINORS (1U << MINORBITS) +#define UBLK_INVALID_BUF_IDX ((u16)-1) + +/* private ioctl command mirror */ +#define UBLK_CMD_DEL_DEV_ASYNC _IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC) +#define UBLK_CMD_UPDATE_SIZE _IOC_NR(UBLK_U_CMD_UPDATE_SIZE) +#define UBLK_CMD_QUIESCE_DEV _IOC_NR(UBLK_U_CMD_QUIESCE_DEV) + +#define UBLK_IO_REGISTER_IO_BUF _IOC_NR(UBLK_U_IO_REGISTER_IO_BUF) +#define UBLK_IO_UNREGISTER_IO_BUF _IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF) + /* All UBLK_F_* have to be included into UBLK_F_ALL */ #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \ | UBLK_F_URING_CMD_COMP_IN_TASK \ | UBLK_F_NEED_GET_DATA \ | UBLK_F_USER_RECOVERY \ - | UBLK_F_USER_RECOVERY_REISSUE) + | UBLK_F_USER_RECOVERY_REISSUE \ + | UBLK_F_UNPRIVILEGED_DEV \ + | UBLK_F_CMD_IOCTL_ENCODE \ + | UBLK_F_USER_COPY \ + | UBLK_F_ZONED \ + | UBLK_F_USER_RECOVERY_FAIL_IO \ + | UBLK_F_UPDATE_SIZE \ + | UBLK_F_AUTO_BUF_REG \ + | UBLK_F_QUIESCE \ + | UBLK_F_PER_IO_DAEMON \ + | UBLK_F_BUF_REG_OFF_DAEMON) + +#define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \ + | UBLK_F_USER_RECOVERY_REISSUE \ + | UBLK_F_USER_RECOVERY_FAIL_IO) /* All UBLK_PARAM_TYPE_* should be included here */ -#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD) +#define UBLK_PARAM_TYPE_ALL \ + (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \ + UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED | \ + UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT) -struct ublk_rq_data { +struct ublk_uring_cmd_pdu { + /* + * Store requests in same batch temporarily for queuing them to + * daemon context. + * + * It should have been stored to request payload, but we do want + * to avoid extra pre-allocation, and uring_cmd payload is always + * free for us + */ union { - struct callback_head work; - struct llist_node node; + struct request *req; + struct request *req_list; }; -}; -struct ublk_uring_cmd_pdu { + /* + * The following two are valid in this cmd whole lifetime, and + * setup in ublk uring_cmd handler + */ struct ublk_queue *ubq; + + u16 tag; }; /* @@ -88,15 +129,6 @@ struct ublk_uring_cmd_pdu { #define UBLK_IO_FLAG_OWNED_BY_SRV 0x02 /* - * IO command is aborted, so this flag is set in case of - * !UBLK_IO_FLAG_ACTIVE. - * - * After this flag is observed, any pending or new incoming request - * associated with this io command will be failed immediately - */ -#define UBLK_IO_FLAG_ABORTED 0x04 - -/* * UBLK_IO_FLAG_NEED_GET_DATA is set because IO command requires * get data buffer address from ublksrv. * @@ -105,41 +137,79 @@ struct ublk_uring_cmd_pdu { */ #define UBLK_IO_FLAG_NEED_GET_DATA 0x08 -struct ublk_io { - /* userspace buffer address from io cmd */ +/* + * request buffer is registered automatically, so we have to unregister it + * before completing this request. + * + * io_uring will unregister buffer automatically for us during exiting. + */ +#define UBLK_IO_FLAG_AUTO_BUF_REG 0x10 + +/* atomic RW with ubq->cancel_lock */ +#define UBLK_IO_FLAG_CANCELED 0x80000000 + +/* + * Initialize refcount to a large number to include any registered buffers. + * UBLK_IO_COMMIT_AND_FETCH_REQ will release these references minus those for + * any buffers registered on the io daemon task. + */ +#define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2) + +union ublk_io_buf { __u64 addr; + struct ublk_auto_buf_reg auto_reg; +}; + +struct ublk_io { + union ublk_io_buf buf; unsigned int flags; int res; - struct io_uring_cmd *cmd; -}; + union { + /* valid if UBLK_IO_FLAG_ACTIVE is set */ + struct io_uring_cmd *cmd; + /* valid if UBLK_IO_FLAG_OWNED_BY_SRV is set */ + struct request *req; + }; + + struct task_struct *task; + + /* + * The number of uses of this I/O by the ublk server + * if user copy or zero copy are enabled: + * - UBLK_REFCOUNT_INIT from dispatch to the server + * until UBLK_IO_COMMIT_AND_FETCH_REQ + * - 1 for each inflight ublk_ch_{read,write}_iter() call + * - 1 for each io_uring registered buffer not registered on task + * The I/O can only be completed once all references are dropped. + * User copy and buffer registration operations are only permitted + * if the reference count is nonzero. + */ + refcount_t ref; + /* Count of buffers registered on task and not yet unregistered */ + unsigned task_registered_buffers; + + void *buf_ctx_handle; +} ____cacheline_aligned_in_smp; struct ublk_queue { int q_id; int q_depth; unsigned long flags; - struct task_struct *ubq_daemon; - char *io_cmd_buf; - - struct llist_head io_cmds; + struct ublksrv_io_desc *io_cmd_buf; - unsigned long io_addr; /* mapped vm address */ - unsigned int max_io_sz; bool force_abort; - unsigned short nr_io_ready; /* how many ios setup */ + bool canceling; + bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */ + spinlock_t cancel_lock; struct ublk_device *dev; - struct ublk_io ios[]; + struct ublk_io ios[] __counted_by(q_depth); }; -#define UBLK_DAEMON_MONITOR_PERIOD (5 * HZ) - struct ublk_device { struct gendisk *ub_disk; - char *__queues; - - unsigned short queue_size; struct ublksrv_ctrl_dev_info dev_info; struct blk_mq_tag_set tag_set; @@ -149,27 +219,26 @@ struct ublk_device { #define UB_STATE_OPEN 0 #define UB_STATE_USED 1 +#define UB_STATE_DELETED 2 unsigned long state; int ub_number; struct mutex mutex; - spinlock_t mm_lock; + spinlock_t lock; struct mm_struct *mm; struct ublk_params params; struct completion completion; - unsigned int nr_queues_ready; - atomic_t nr_aborted_queues; - - /* - * Our ubq->daemon may be killed without any notification, so - * monitor each queue's daemon periodically - */ - struct delayed_work monitor_work; - struct work_struct quiesce_work; - struct work_struct stop_work; + u32 nr_io_ready; + bool unprivileged_daemons; + struct mutex cancel_mutex; + bool canceling; + pid_t ublksrv_tgid; + struct delayed_work exit_work; + + struct ublk_queue *queues[]; }; /* header of ublk_params */ @@ -178,8 +247,296 @@ struct ublk_params_header { __u32 types; }; +static void ublk_io_release(void *priv); +static void ublk_stop_dev_unlocked(struct ublk_device *ub); +static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq); +static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, + u16 q_id, u16 tag, struct ublk_io *io, size_t offset); +static inline unsigned int ublk_req_build_flags(struct request *req); + +static inline struct ublksrv_io_desc * +ublk_get_iod(const struct ublk_queue *ubq, unsigned tag) +{ + return &ubq->io_cmd_buf[tag]; +} + +static inline bool ublk_dev_is_zoned(const struct ublk_device *ub) +{ + return ub->dev_info.flags & UBLK_F_ZONED; +} + +static inline bool ublk_queue_is_zoned(const struct ublk_queue *ubq) +{ + return ubq->flags & UBLK_F_ZONED; +} + +#ifdef CONFIG_BLK_DEV_ZONED + +struct ublk_zoned_report_desc { + __u64 sector; + __u32 operation; + __u32 nr_zones; +}; + +static DEFINE_XARRAY(ublk_zoned_report_descs); + +static int ublk_zoned_insert_report_desc(const struct request *req, + struct ublk_zoned_report_desc *desc) +{ + return xa_insert(&ublk_zoned_report_descs, (unsigned long)req, + desc, GFP_KERNEL); +} + +static struct ublk_zoned_report_desc *ublk_zoned_erase_report_desc( + const struct request *req) +{ + return xa_erase(&ublk_zoned_report_descs, (unsigned long)req); +} + +static struct ublk_zoned_report_desc *ublk_zoned_get_report_desc( + const struct request *req) +{ + return xa_load(&ublk_zoned_report_descs, (unsigned long)req); +} + +static int ublk_get_nr_zones(const struct ublk_device *ub) +{ + const struct ublk_param_basic *p = &ub->params.basic; + + /* Zone size is a power of 2 */ + return p->dev_sectors >> ilog2(p->chunk_sectors); +} + +static int ublk_revalidate_disk_zones(struct ublk_device *ub) +{ + return blk_revalidate_disk_zones(ub->ub_disk); +} + +static int ublk_dev_param_zoned_validate(const struct ublk_device *ub) +{ + const struct ublk_param_zoned *p = &ub->params.zoned; + int nr_zones; + + if (!ublk_dev_is_zoned(ub)) + return -EINVAL; + + if (!p->max_zone_append_sectors) + return -EINVAL; + + nr_zones = ublk_get_nr_zones(ub); + + if (p->max_active_zones > nr_zones) + return -EINVAL; + + if (p->max_open_zones > nr_zones) + return -EINVAL; + + return 0; +} + +static void ublk_dev_param_zoned_apply(struct ublk_device *ub) +{ + ub->ub_disk->nr_zones = ublk_get_nr_zones(ub); +} + +/* Based on virtblk_alloc_report_buffer */ +static void *ublk_alloc_report_buffer(struct ublk_device *ublk, + unsigned int nr_zones, size_t *buflen) +{ + struct request_queue *q = ublk->ub_disk->queue; + size_t bufsize; + void *buf; + + nr_zones = min_t(unsigned int, nr_zones, + ublk->ub_disk->nr_zones); + + bufsize = nr_zones * sizeof(struct blk_zone); + bufsize = + min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT); + + while (bufsize >= sizeof(struct blk_zone)) { + buf = kvmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY); + if (buf) { + *buflen = bufsize; + return buf; + } + bufsize >>= 1; + } + + *buflen = 0; + return NULL; +} + +static int ublk_report_zones(struct gendisk *disk, sector_t sector, + unsigned int nr_zones, struct blk_report_zones_args *args) +{ + struct ublk_device *ub = disk->private_data; + unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors; + unsigned int first_zone = sector >> ilog2(zone_size_sectors); + unsigned int done_zones = 0; + unsigned int max_zones_per_request; + int ret; + struct blk_zone *buffer; + size_t buffer_length; + + nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone, + nr_zones); + + buffer = ublk_alloc_report_buffer(ub, nr_zones, &buffer_length); + if (!buffer) + return -ENOMEM; + + max_zones_per_request = buffer_length / sizeof(struct blk_zone); + + while (done_zones < nr_zones) { + unsigned int remaining_zones = nr_zones - done_zones; + unsigned int zones_in_request = + min_t(unsigned int, remaining_zones, max_zones_per_request); + struct request *req; + struct ublk_zoned_report_desc desc; + blk_status_t status; + + memset(buffer, 0, buffer_length); + + req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + goto out; + } + + desc.operation = UBLK_IO_OP_REPORT_ZONES; + desc.sector = sector; + desc.nr_zones = zones_in_request; + ret = ublk_zoned_insert_report_desc(req, &desc); + if (ret) + goto free_req; + + ret = blk_rq_map_kern(req, buffer, buffer_length, GFP_KERNEL); + if (ret) + goto erase_desc; + + status = blk_execute_rq(req, 0); + ret = blk_status_to_errno(status); +erase_desc: + ublk_zoned_erase_report_desc(req); +free_req: + blk_mq_free_request(req); + if (ret) + goto out; + + for (unsigned int i = 0; i < zones_in_request; i++) { + struct blk_zone *zone = buffer + i; + + /* A zero length zone means no more zones in this response */ + if (!zone->len) + break; + + ret = disk_report_zone(disk, zone, i, args); + if (ret) + goto out; + + done_zones++; + sector += zone_size_sectors; + + } + } + + ret = done_zones; + +out: + kvfree(buffer); + return ret; +} + +static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq, + struct request *req) +{ + struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag); + struct ublk_io *io = &ubq->ios[req->tag]; + struct ublk_zoned_report_desc *desc; + u32 ublk_op; + + switch (req_op(req)) { + case REQ_OP_ZONE_OPEN: + ublk_op = UBLK_IO_OP_ZONE_OPEN; + break; + case REQ_OP_ZONE_CLOSE: + ublk_op = UBLK_IO_OP_ZONE_CLOSE; + break; + case REQ_OP_ZONE_FINISH: + ublk_op = UBLK_IO_OP_ZONE_FINISH; + break; + case REQ_OP_ZONE_RESET: + ublk_op = UBLK_IO_OP_ZONE_RESET; + break; + case REQ_OP_ZONE_APPEND: + ublk_op = UBLK_IO_OP_ZONE_APPEND; + break; + case REQ_OP_ZONE_RESET_ALL: + ublk_op = UBLK_IO_OP_ZONE_RESET_ALL; + break; + case REQ_OP_DRV_IN: + desc = ublk_zoned_get_report_desc(req); + if (!desc) + return BLK_STS_IOERR; + ublk_op = desc->operation; + switch (ublk_op) { + case UBLK_IO_OP_REPORT_ZONES: + iod->op_flags = ublk_op | ublk_req_build_flags(req); + iod->nr_zones = desc->nr_zones; + iod->start_sector = desc->sector; + return BLK_STS_OK; + default: + return BLK_STS_IOERR; + } + case REQ_OP_DRV_OUT: + /* We do not support drv_out */ + return BLK_STS_NOTSUPP; + default: + return BLK_STS_IOERR; + } + + iod->op_flags = ublk_op | ublk_req_build_flags(req); + iod->nr_sectors = blk_rq_sectors(req); + iod->start_sector = blk_rq_pos(req); + iod->addr = io->buf.addr; + + return BLK_STS_OK; +} + +#else + +#define ublk_report_zones (NULL) + +static int ublk_dev_param_zoned_validate(const struct ublk_device *ub) +{ + return -EOPNOTSUPP; +} + +static void ublk_dev_param_zoned_apply(struct ublk_device *ub) +{ +} + +static int ublk_revalidate_disk_zones(struct ublk_device *ub) +{ + return 0; +} + +static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq, + struct request *req) +{ + return BLK_STS_NOTSUPP; +} + +#endif + +static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io, + bool need_map); + static dev_t ublk_chr_devt; -static struct class *ublk_chr_class; +static const struct class ublk_chr_class = { + .name = "ublk-char", +}; static DEFINE_IDR(ublk_index_idr); static DEFINE_SPINLOCK(ublk_idr_lock); @@ -187,28 +544,40 @@ static wait_queue_head_t ublk_idr_wq; /* wait until one idr is freed */ static DEFINE_MUTEX(ublk_ctl_mutex); + +#define UBLK_MAX_UBLKS UBLK_MINORS + +/* + * Max unprivileged ublk devices allowed to add + * + * It can be extended to one per-user limit in future or even controlled + * by cgroup. + */ +static unsigned int unprivileged_ublks_max = 64; +static unsigned int unprivileged_ublks_added; /* protected by ublk_ctl_mutex */ + static struct miscdevice ublk_misc; -static void ublk_dev_param_basic_apply(struct ublk_device *ub) +static inline unsigned ublk_pos_to_hwq(loff_t pos) { - struct request_queue *q = ub->ub_disk->queue; - const struct ublk_param_basic *p = &ub->params.basic; + return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_QID_OFF) & + UBLK_QID_BITS_MASK; +} - blk_queue_logical_block_size(q, 1 << p->logical_bs_shift); - blk_queue_physical_block_size(q, 1 << p->physical_bs_shift); - blk_queue_io_min(q, 1 << p->io_min_shift); - blk_queue_io_opt(q, 1 << p->io_opt_shift); +static inline unsigned ublk_pos_to_buf_off(loff_t pos) +{ + return (pos - UBLKSRV_IO_BUF_OFFSET) & UBLK_IO_BUF_BITS_MASK; +} - blk_queue_write_cache(q, p->attrs & UBLK_ATTR_VOLATILE_CACHE, - p->attrs & UBLK_ATTR_FUA); - if (p->attrs & UBLK_ATTR_ROTATIONAL) - blk_queue_flag_clear(QUEUE_FLAG_NONROT, q); - else - blk_queue_flag_set(QUEUE_FLAG_NONROT, q); +static inline unsigned ublk_pos_to_tag(loff_t pos) +{ + return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_TAG_OFF) & + UBLK_TAG_BITS_MASK; +} - blk_queue_max_hw_sectors(q, p->max_sectors); - blk_queue_chunk_sectors(q, p->chunk_sectors); - blk_queue_virt_boundary(q, p->virt_boundary_mask); +static void ublk_dev_param_basic_apply(struct ublk_device *ub) +{ + const struct ublk_param_basic *p = &ub->params.basic; if (p->attrs & UBLK_ATTR_READ_ONLY) set_disk_ro(ub->ub_disk, true); @@ -216,26 +585,13 @@ static void ublk_dev_param_basic_apply(struct ublk_device *ub) set_capacity(ub->ub_disk, p->dev_sectors); } -static void ublk_dev_param_discard_apply(struct ublk_device *ub) -{ - struct request_queue *q = ub->ub_disk->queue; - const struct ublk_param_discard *p = &ub->params.discard; - - q->limits.discard_alignment = p->discard_alignment; - q->limits.discard_granularity = p->discard_granularity; - blk_queue_max_discard_sectors(q, p->max_discard_sectors); - blk_queue_max_write_zeroes_sectors(q, - p->max_write_zeroes_sectors); - blk_queue_max_discard_segments(q, p->max_discard_segments); -} - static int ublk_validate_params(const struct ublk_device *ub) { /* basic param is the only one which must be set */ if (ub->params.types & UBLK_PARAM_TYPE_BASIC) { const struct ublk_param_basic *p = &ub->params.basic; - if (p->logical_bs_shift > PAGE_SHIFT) + if (p->logical_bs_shift > PAGE_SHIFT || p->logical_bs_shift < 9) return -EINVAL; if (p->logical_bs_shift > p->physical_bs_shift) @@ -243,6 +599,9 @@ static int ublk_validate_params(const struct ublk_device *ub) if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9)) return -EINVAL; + + if (ublk_dev_is_zoned(ub) && !p->chunk_sectors) + return -EINVAL; } else return -EINVAL; @@ -257,45 +616,164 @@ static int ublk_validate_params(const struct ublk_device *ub) return -EINVAL; } + /* dev_t is read-only */ + if (ub->params.types & UBLK_PARAM_TYPE_DEVT) + return -EINVAL; + + if (ub->params.types & UBLK_PARAM_TYPE_ZONED) + return ublk_dev_param_zoned_validate(ub); + else if (ublk_dev_is_zoned(ub)) + return -EINVAL; + + if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN) { + const struct ublk_param_dma_align *p = &ub->params.dma; + + if (p->alignment >= PAGE_SIZE) + return -EINVAL; + + if (!is_power_of_2(p->alignment + 1)) + return -EINVAL; + } + + if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) { + const struct ublk_param_segment *p = &ub->params.seg; + + if (!is_power_of_2(p->seg_boundary_mask + 1)) + return -EINVAL; + + if (p->seg_boundary_mask + 1 < UBLK_MIN_SEGMENT_SIZE) + return -EINVAL; + if (p->max_segment_size < UBLK_MIN_SEGMENT_SIZE) + return -EINVAL; + } + return 0; } -static int ublk_apply_params(struct ublk_device *ub) +static void ublk_apply_params(struct ublk_device *ub) { - if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC)) - return -EINVAL; - ublk_dev_param_basic_apply(ub); - if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) - ublk_dev_param_discard_apply(ub); + if (ub->params.types & UBLK_PARAM_TYPE_ZONED) + ublk_dev_param_zoned_apply(ub); +} - return 0; +static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq) +{ + return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY; } -static inline bool ublk_can_use_task_work(const struct ublk_queue *ubq) +static inline bool ublk_dev_support_zero_copy(const struct ublk_device *ub) { - if (IS_BUILTIN(CONFIG_BLK_DEV_UBLK) && - !(ubq->flags & UBLK_F_URING_CMD_COMP_IN_TASK)) - return true; - return false; + return ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY; +} + +static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq) +{ + return ubq->flags & UBLK_F_AUTO_BUF_REG; +} + +static inline bool ublk_dev_support_auto_buf_reg(const struct ublk_device *ub) +{ + return ub->dev_info.flags & UBLK_F_AUTO_BUF_REG; +} + +static inline bool ublk_support_user_copy(const struct ublk_queue *ubq) +{ + return ubq->flags & UBLK_F_USER_COPY; +} + +static inline bool ublk_dev_support_user_copy(const struct ublk_device *ub) +{ + return ub->dev_info.flags & UBLK_F_USER_COPY; +} + +static inline bool ublk_need_map_io(const struct ublk_queue *ubq) +{ + return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq) && + !ublk_support_auto_buf_reg(ubq); +} + +static inline bool ublk_dev_need_map_io(const struct ublk_device *ub) +{ + return !ublk_dev_support_user_copy(ub) && + !ublk_dev_support_zero_copy(ub) && + !ublk_dev_support_auto_buf_reg(ub); +} + +static inline bool ublk_need_req_ref(const struct ublk_queue *ubq) +{ + /* + * read()/write() is involved in user copy, so request reference + * has to be grabbed + * + * for zero copy, request buffer need to be registered to io_uring + * buffer table, so reference is needed + * + * For auto buffer register, ublk server still may issue + * UBLK_IO_COMMIT_AND_FETCH_REQ before one registered buffer is used up, + * so reference is required too. + */ + return ublk_support_user_copy(ubq) || ublk_support_zero_copy(ubq) || + ublk_support_auto_buf_reg(ubq); +} + +static inline bool ublk_dev_need_req_ref(const struct ublk_device *ub) +{ + return ublk_dev_support_user_copy(ub) || + ublk_dev_support_zero_copy(ub) || + ublk_dev_support_auto_buf_reg(ub); +} + +static inline void ublk_init_req_ref(const struct ublk_queue *ubq, + struct ublk_io *io) +{ + if (ublk_need_req_ref(ubq)) + refcount_set(&io->ref, UBLK_REFCOUNT_INIT); +} + +static inline bool ublk_get_req_ref(struct ublk_io *io) +{ + return refcount_inc_not_zero(&io->ref); +} + +static inline void ublk_put_req_ref(struct ublk_io *io, struct request *req) +{ + if (!refcount_dec_and_test(&io->ref)) + return; + + /* ublk_need_map_io() and ublk_need_req_ref() are mutually exclusive */ + __ublk_complete_rq(req, io, false); +} + +static inline bool ublk_sub_req_ref(struct ublk_io *io) +{ + unsigned sub_refs = UBLK_REFCOUNT_INIT - io->task_registered_buffers; + + io->task_registered_buffers = 0; + return refcount_sub_and_test(sub_refs, &io->ref); } static inline bool ublk_need_get_data(const struct ublk_queue *ubq) { - if (ubq->flags & UBLK_F_NEED_GET_DATA) - return true; - return false; + return ubq->flags & UBLK_F_NEED_GET_DATA; } -static struct ublk_device *ublk_get_device(struct ublk_device *ub) +static inline bool ublk_dev_need_get_data(const struct ublk_device *ub) +{ + return ub->dev_info.flags & UBLK_F_NEED_GET_DATA; +} + +/* Called in slow path only, keep it noinline for trace purpose */ +static noinline struct ublk_device *ublk_get_device(struct ublk_device *ub) { if (kobject_get_unless_zero(&ub->cdev_dev.kobj)) return ub; return NULL; } -static void ublk_put_device(struct ublk_device *ub) +/* Called in slow path only, keep it noinline for trace purpose */ +static noinline void ublk_put_device(struct ublk_device *ub) { put_device(&ub->cdev_dev); } @@ -303,56 +781,81 @@ static void ublk_put_device(struct ublk_device *ub) static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev, int qid) { - return (struct ublk_queue *)&(dev->__queues[qid * dev->queue_size]); + return dev->queues[qid]; } static inline bool ublk_rq_has_data(const struct request *rq) { - return rq->bio && bio_has_data(rq->bio); + return bio_has_data(rq->bio); } -static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq, - int tag) +static inline struct ublksrv_io_desc * +ublk_queue_cmd_buf(struct ublk_device *ub, int q_id) { - return (struct ublksrv_io_desc *) - &(ubq->io_cmd_buf[tag * sizeof(struct ublksrv_io_desc)]); + return ublk_get_queue(ub, q_id)->io_cmd_buf; } -static inline char *ublk_queue_cmd_buf(struct ublk_device *ub, int q_id) +static inline int __ublk_queue_cmd_buf_size(int depth) { - return ublk_get_queue(ub, q_id)->io_cmd_buf; + return round_up(depth * sizeof(struct ublksrv_io_desc), PAGE_SIZE); } -static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id) +static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub) { - struct ublk_queue *ubq = ublk_get_queue(ub, q_id); + return __ublk_queue_cmd_buf_size(ub->dev_info.queue_depth); +} - return round_up(ubq->q_depth * sizeof(struct ublksrv_io_desc), - PAGE_SIZE); +static int ublk_max_cmd_buf_size(void) +{ + return __ublk_queue_cmd_buf_size(UBLK_MAX_QUEUE_DEPTH); } -static inline bool ublk_queue_can_use_recovery_reissue( - struct ublk_queue *ubq) +/* + * Should I/O outstanding to the ublk server when it exits be reissued? + * If not, outstanding I/O will get errors. + */ +static inline bool ublk_nosrv_should_reissue_outstanding(struct ublk_device *ub) { - if ((ubq->flags & UBLK_F_USER_RECOVERY) && - (ubq->flags & UBLK_F_USER_RECOVERY_REISSUE)) - return true; - return false; + return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) && + (ub->dev_info.flags & UBLK_F_USER_RECOVERY_REISSUE); } -static inline bool ublk_queue_can_use_recovery( - struct ublk_queue *ubq) +/* + * Should I/O issued while there is no ublk server queue? If not, I/O + * issued while there is no ublk server will get errors. + */ +static inline bool ublk_nosrv_dev_should_queue_io(struct ublk_device *ub) { - if (ubq->flags & UBLK_F_USER_RECOVERY) - return true; - return false; + return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) && + !(ub->dev_info.flags & UBLK_F_USER_RECOVERY_FAIL_IO); } -static inline bool ublk_can_use_recovery(struct ublk_device *ub) +/* + * Same as ublk_nosrv_dev_should_queue_io, but uses a queue-local copy + * of the device flags for smaller cache footprint - better for fast + * paths. + */ +static inline bool ublk_nosrv_should_queue_io(struct ublk_queue *ubq) { - if (ub->dev_info.flags & UBLK_F_USER_RECOVERY) - return true; - return false; + return (ubq->flags & UBLK_F_USER_RECOVERY) && + !(ubq->flags & UBLK_F_USER_RECOVERY_FAIL_IO); +} + +/* + * Should ublk devices be stopped (i.e. no recovery possible) when the + * ublk server exits? If not, devices can be used again by a future + * incarnation of a ublk server via the start_recovery/end_recovery + * commands. + */ +static inline bool ublk_nosrv_should_stop_dev(struct ublk_device *ub) +{ + return !(ub->dev_info.flags & UBLK_F_USER_RECOVERY); +} + +static inline bool ublk_dev_in_recoverable_state(struct ublk_device *ub) +{ + return ub->dev_info.state == UBLK_S_DEV_QUIESCED || + ub->dev_info.state == UBLK_S_DEV_FAIL_IO; } static void ublk_free_disk(struct gendisk *disk) @@ -360,160 +863,148 @@ static void ublk_free_disk(struct gendisk *disk) struct ublk_device *ub = disk->private_data; clear_bit(UB_STATE_USED, &ub->state); - put_device(&ub->cdev_dev); + ublk_put_device(ub); } -static const struct block_device_operations ub_fops = { - .owner = THIS_MODULE, - .free_disk = ublk_free_disk, -}; +static void ublk_store_owner_uid_gid(unsigned int *owner_uid, + unsigned int *owner_gid) +{ + kuid_t uid; + kgid_t gid; -#define UBLK_MAX_PIN_PAGES 32 + current_uid_gid(&uid, &gid); -struct ublk_map_data { - const struct ublk_queue *ubq; - const struct request *rq; - const struct ublk_io *io; - unsigned max_bytes; -}; + *owner_uid = from_kuid(&init_user_ns, uid); + *owner_gid = from_kgid(&init_user_ns, gid); +} -struct ublk_io_iter { - struct page *pages[UBLK_MAX_PIN_PAGES]; - unsigned pg_off; /* offset in the 1st page in pages */ - int nr_pages; /* how many page pointers in pages */ - struct bio *bio; - struct bvec_iter iter; +static int ublk_open(struct gendisk *disk, blk_mode_t mode) +{ + struct ublk_device *ub = disk->private_data; + + if (capable(CAP_SYS_ADMIN)) + return 0; + + /* + * If it is one unprivileged device, only owner can open + * the disk. Otherwise it could be one trap made by one + * evil user who grants this disk's privileges to other + * users deliberately. + * + * This way is reasonable too given anyone can create + * unprivileged device, and no need other's grant. + */ + if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV) { + unsigned int curr_uid, curr_gid; + + ublk_store_owner_uid_gid(&curr_uid, &curr_gid); + + if (curr_uid != ub->dev_info.owner_uid || curr_gid != + ub->dev_info.owner_gid) + return -EPERM; + } + + return 0; +} + +static const struct block_device_operations ub_fops = { + .owner = THIS_MODULE, + .open = ublk_open, + .free_disk = ublk_free_disk, + .report_zones = ublk_report_zones, }; -static inline unsigned ublk_copy_io_pages(struct ublk_io_iter *data, - unsigned max_bytes, bool to_vm) +/* + * Copy data between request pages and io_iter, and 'offset' + * is the start point of linear offset of request. + */ +static size_t ublk_copy_user_pages(const struct request *req, + unsigned offset, struct iov_iter *uiter, int dir) { - const unsigned total = min_t(unsigned, max_bytes, - PAGE_SIZE - data->pg_off + - ((data->nr_pages - 1) << PAGE_SHIFT)); - unsigned done = 0; - unsigned pg_idx = 0; + struct req_iterator iter; + struct bio_vec bv; + size_t done = 0; + + rq_for_each_segment(bv, req, iter) { + void *bv_buf; + size_t copied; - while (done < total) { - struct bio_vec bv = bio_iter_iovec(data->bio, data->iter); - const unsigned int bytes = min3(bv.bv_len, total - done, - (unsigned)(PAGE_SIZE - data->pg_off)); - void *bv_buf = bvec_kmap_local(&bv); - void *pg_buf = kmap_local_page(data->pages[pg_idx]); + if (offset >= bv.bv_len) { + offset -= bv.bv_len; + continue; + } - if (to_vm) - memcpy(pg_buf + data->pg_off, bv_buf, bytes); + bv.bv_offset += offset; + bv.bv_len -= offset; + bv_buf = bvec_kmap_local(&bv); + if (dir == ITER_DEST) + copied = copy_to_iter(bv_buf, bv.bv_len, uiter); else - memcpy(bv_buf, pg_buf + data->pg_off, bytes); + copied = copy_from_iter(bv_buf, bv.bv_len, uiter); - kunmap_local(pg_buf); kunmap_local(bv_buf); - /* advance page array */ - data->pg_off += bytes; - if (data->pg_off == PAGE_SIZE) { - pg_idx += 1; - data->pg_off = 0; - } - - done += bytes; + done += copied; + if (copied < bv.bv_len) + break; - /* advance bio */ - bio_advance_iter_single(data->bio, &data->iter, bytes); - if (!data->iter.bi_size) { - data->bio = data->bio->bi_next; - if (data->bio == NULL) - break; - data->iter = data->bio->bi_iter; - } + offset = 0; } - return done; } -static inline int ublk_copy_user_pages(struct ublk_map_data *data, - bool to_vm) +static inline bool ublk_need_map_req(const struct request *req) { - const unsigned int gup_flags = to_vm ? FOLL_WRITE : 0; - const unsigned long start_vm = data->io->addr; - unsigned int done = 0; - struct ublk_io_iter iter = { - .pg_off = start_vm & (PAGE_SIZE - 1), - .bio = data->rq->bio, - .iter = data->rq->bio->bi_iter, - }; - const unsigned int nr_pages = round_up(data->max_bytes + - (start_vm & (PAGE_SIZE - 1)), PAGE_SIZE) >> PAGE_SHIFT; - - while (done < nr_pages) { - const unsigned to_pin = min_t(unsigned, UBLK_MAX_PIN_PAGES, - nr_pages - done); - unsigned i, len; - - iter.nr_pages = get_user_pages_fast(start_vm + - (done << PAGE_SHIFT), to_pin, gup_flags, - iter.pages); - if (iter.nr_pages <= 0) - return done == 0 ? iter.nr_pages : done; - len = ublk_copy_io_pages(&iter, data->max_bytes, to_vm); - for (i = 0; i < iter.nr_pages; i++) { - if (to_vm) - set_page_dirty(iter.pages[i]); - put_page(iter.pages[i]); - } - data->max_bytes -= len; - done += iter.nr_pages; - } + return ublk_rq_has_data(req) && req_op(req) == REQ_OP_WRITE; +} - return done; +static inline bool ublk_need_unmap_req(const struct request *req) +{ + return ublk_rq_has_data(req) && + (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN); } -static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req, - struct ublk_io *io) +static unsigned int ublk_map_io(const struct ublk_queue *ubq, + const struct request *req, + const struct ublk_io *io) { const unsigned int rq_bytes = blk_rq_bytes(req); + + if (!ublk_need_map_io(ubq)) + return rq_bytes; + /* * no zero copy, we delay copy WRITE request data into ublksrv * context and the big benefit is that pinning pages in current * context is pretty fast, see ublk_pin_user_pages */ - if (req_op(req) != REQ_OP_WRITE && req_op(req) != REQ_OP_FLUSH) - return rq_bytes; - - if (ublk_rq_has_data(req)) { - struct ublk_map_data data = { - .ubq = ubq, - .rq = req, - .io = io, - .max_bytes = rq_bytes, - }; + if (ublk_need_map_req(req)) { + struct iov_iter iter; + const int dir = ITER_DEST; - ublk_copy_user_pages(&data, true); - - return rq_bytes - data.max_bytes; + import_ubuf(dir, u64_to_user_ptr(io->buf.addr), rq_bytes, &iter); + return ublk_copy_user_pages(req, 0, &iter, dir); } return rq_bytes; } -static int ublk_unmap_io(const struct ublk_queue *ubq, +static unsigned int ublk_unmap_io(bool need_map, const struct request *req, - struct ublk_io *io) + const struct ublk_io *io) { const unsigned int rq_bytes = blk_rq_bytes(req); - if (req_op(req) == REQ_OP_READ && ublk_rq_has_data(req)) { - struct ublk_map_data data = { - .ubq = ubq, - .rq = req, - .io = io, - .max_bytes = io->res, - }; + if (!need_map) + return rq_bytes; - WARN_ON_ONCE(io->res > rq_bytes); + if (ublk_need_unmap_req(req)) { + struct iov_iter iter; + const int dir = ITER_SOURCE; - ublk_copy_user_pages(&data, false); + WARN_ON_ONCE(io->res > rq_bytes); - return io->res - data.max_bytes; + import_ubuf(dir, u64_to_user_ptr(io->buf.addr), io->res, &iter); + return ublk_copy_user_pages(req, 0, &iter, dir); } return rq_bytes; } @@ -569,6 +1060,8 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req) ublk_op = UBLK_IO_OP_WRITE_ZEROES; break; default: + if (ublk_queue_is_zoned(ubq)) + return ublk_setup_iod_zoned(ubq, req); return BLK_STS_IOERR; } @@ -576,7 +1069,7 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req) iod->op_flags = ublk_op | ublk_req_build_flags(req); iod->nr_sectors = blk_rq_sectors(req); iod->start_sector = blk_rq_pos(req); - iod->addr = io->addr; + iod->addr = io->buf.addr; return BLK_STS_OK; } @@ -584,43 +1077,37 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req) static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu( struct io_uring_cmd *ioucmd) { - return (struct ublk_uring_cmd_pdu *)&ioucmd->pdu; -} - -static inline bool ubq_daemon_is_dying(struct ublk_queue *ubq) -{ - return ubq->ubq_daemon->flags & PF_EXITING; + return io_uring_cmd_to_pdu(ioucmd, struct ublk_uring_cmd_pdu); } /* todo: handle partial completion */ -static void ublk_complete_rq(struct request *req) +static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io, + bool need_map) { - struct ublk_queue *ubq = req->mq_hctx->driver_data; - struct ublk_io *io = &ubq->ios[req->tag]; unsigned int unmapped_bytes; + blk_status_t res = BLK_STS_OK; /* failed read IO if nothing is read */ if (!io->res && req_op(req) == REQ_OP_READ) io->res = -EIO; if (io->res < 0) { - blk_mq_end_request(req, errno_to_blk_status(io->res)); - return; + res = errno_to_blk_status(io->res); + goto exit; } /* - * FLUSH or DISCARD usually won't return bytes returned, so end them + * FLUSH, DISCARD or WRITE_ZEROES usually won't return bytes returned, so end them * directly. * * Both the two needn't unmap. */ - if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE) { - blk_mq_end_request(req, BLK_STS_OK); - return; - } + if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE && + req_op(req) != REQ_OP_DRV_IN) + goto exit; /* for READ request, writing data in iod->addr to rq buffers */ - unmapped_bytes = ublk_unmap_io(ubq, req, io); + unmapped_bytes = ublk_unmap_io(need_map, req, io); /* * Extremely impossible since we got data filled in just before @@ -632,34 +1119,20 @@ static void ublk_complete_rq(struct request *req) if (blk_update_request(req, BLK_STS_OK, io->res)) blk_mq_requeue_request(req, true); - else + else if (likely(!blk_should_fake_timeout(req->q))) __blk_mq_end_request(req, BLK_STS_OK); -} - -/* - * Since __ublk_rq_task_work always fails requests immediately during - * exiting, __ublk_fail_req() is only called from abort context during - * exiting. So lock is unnecessary. - * - * Also aborting may not be started yet, keep in mind that one failed - * request may be issued by block layer again. - */ -static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io, - struct request *req) -{ - WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE); - if (!(io->flags & UBLK_IO_FLAG_ABORTED)) { - io->flags |= UBLK_IO_FLAG_ABORTED; - if (ublk_queue_can_use_recovery_reissue(ubq)) - blk_mq_requeue_request(req, false); - else - blk_mq_end_request(req, BLK_STS_IOERR); - } + return; +exit: + blk_mq_end_request(req, res); } -static void ubq_complete_io_cmd(struct ublk_io *io, int res) +static struct io_uring_cmd *__ublk_prep_compl_io_cmd(struct ublk_io *io, + struct request *req) { + /* read cmd first because req will overwrite it */ + struct io_uring_cmd *cmd = io->cmd; + /* mark this cmd owned by ublksrv */ io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV; @@ -669,8 +1142,17 @@ static void ubq_complete_io_cmd(struct ublk_io *io, int res) */ io->flags &= ~UBLK_IO_FLAG_ACTIVE; + io->req = req; + return cmd; +} + +static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req, + int res, unsigned issue_flags) +{ + struct io_uring_cmd *cmd = __ublk_prep_compl_io_cmd(io, req); + /* tell ublksrv one io request is coming */ - io_uring_cmd_done(io->cmd, res, 0); + io_uring_cmd_done(cmd, res, issue_flags); } #define UBLK_REQUEUE_DELAY_MS 3 @@ -679,69 +1161,78 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq, struct request *rq) { /* We cannot process this rq so just requeue it. */ - if (ublk_queue_can_use_recovery(ubq)) + if (ublk_nosrv_dev_should_queue_io(ubq->dev)) blk_mq_requeue_request(rq, false); else blk_mq_end_request(rq, BLK_STS_IOERR); - - mod_delayed_work(system_wq, &ubq->dev->monitor_work, 0); } -static inline void __ublk_rq_task_work(struct request *req) +static void +ublk_auto_buf_reg_fallback(const struct ublk_queue *ubq, unsigned tag) { - struct ublk_queue *ubq = req->mq_hctx->driver_data; - int tag = req->tag; - struct ublk_io *io = &ubq->ios[tag]; - unsigned int mapped_bytes; + struct ublksrv_io_desc *iod = ublk_get_iod(ubq, tag); - pr_devel("%s: complete: op %d, qid %d tag %d io_flags %x addr %llx\n", - __func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags, - ublk_get_iod(ubq, req->tag)->addr); + iod->op_flags |= UBLK_IO_F_NEED_REG_BUF; +} - /* - * Task is exiting if either: - * - * (1) current != ubq_daemon. - * io_uring_cmd_complete_in_task() tries to run task_work - * in a workqueue if ubq_daemon(cmd's task) is PF_EXITING. - * - * (2) current->flags & PF_EXITING. - */ - if (unlikely(current != ubq->ubq_daemon || current->flags & PF_EXITING)) { - __ublk_abort_rq(ubq, req); - return; +enum auto_buf_reg_res { + AUTO_BUF_REG_FAIL, + AUTO_BUF_REG_FALLBACK, + AUTO_BUF_REG_OK, +}; + +static void ublk_prep_auto_buf_reg_io(const struct ublk_queue *ubq, + struct request *req, struct ublk_io *io, + struct io_uring_cmd *cmd, + enum auto_buf_reg_res res) +{ + if (res == AUTO_BUF_REG_OK) { + io->task_registered_buffers = 1; + io->buf_ctx_handle = io_uring_cmd_ctx_handle(cmd); + io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG; } + ublk_init_req_ref(ubq, io); + __ublk_prep_compl_io_cmd(io, req); +} - if (ublk_need_get_data(ubq) && - (req_op(req) == REQ_OP_WRITE || - req_op(req) == REQ_OP_FLUSH)) { - /* - * We have not handled UBLK_IO_NEED_GET_DATA command yet, - * so immepdately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv - * and notify it. - */ - if (!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA)) { - io->flags |= UBLK_IO_FLAG_NEED_GET_DATA; - pr_devel("%s: need get data. op %d, qid %d tag %d io_flags %x\n", - __func__, io->cmd->cmd_op, ubq->q_id, - req->tag, io->flags); - ubq_complete_io_cmd(io, UBLK_IO_RES_NEED_GET_DATA); - return; +static enum auto_buf_reg_res +__ublk_do_auto_buf_reg(const struct ublk_queue *ubq, struct request *req, + struct ublk_io *io, struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + int ret; + + ret = io_buffer_register_bvec(cmd, req, ublk_io_release, + io->buf.auto_reg.index, issue_flags); + if (ret) { + if (io->buf.auto_reg.flags & UBLK_AUTO_BUF_REG_FALLBACK) { + ublk_auto_buf_reg_fallback(ubq, req->tag); + return AUTO_BUF_REG_FALLBACK; } - /* - * We have handled UBLK_IO_NEED_GET_DATA command, - * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just - * do the copy work. - */ - io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA; - /* update iod->addr because ublksrv may have passed a new io buffer */ - ublk_get_iod(ubq, req->tag)->addr = io->addr; - pr_devel("%s: update iod->addr: op %d, qid %d tag %d io_flags %x addr %llx\n", - __func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags, - ublk_get_iod(ubq, req->tag)->addr); + blk_mq_end_request(req, BLK_STS_IOERR); + return AUTO_BUF_REG_FAIL; + } + + return AUTO_BUF_REG_OK; +} + +static void ublk_do_auto_buf_reg(const struct ublk_queue *ubq, struct request *req, + struct ublk_io *io, struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + enum auto_buf_reg_res res = __ublk_do_auto_buf_reg(ubq, req, io, cmd, + issue_flags); + + if (res != AUTO_BUF_REG_FAIL) { + ublk_prep_auto_buf_reg_io(ubq, req, io, cmd, res); + io_uring_cmd_done(cmd, UBLK_IO_RES_OK, issue_flags); } +} - mapped_bytes = ublk_map_io(ubq, req, io); +static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req, + struct ublk_io *io) +{ + unsigned mapped_bytes = ublk_map_io(ubq, req, io); /* partially mapped, update io descriptor */ if (unlikely(mapped_bytes != blk_rq_bytes(req))) { @@ -756,95 +1247,137 @@ static inline void __ublk_rq_task_work(struct request *req) blk_mq_requeue_request(req, false); blk_mq_delay_kick_requeue_list(req->q, UBLK_REQUEUE_DELAY_MS); - return; + return false; } ublk_get_iod(ubq, req->tag)->nr_sectors = mapped_bytes >> 9; } - ubq_complete_io_cmd(io, UBLK_IO_RES_OK); + return true; +} + +static void ublk_dispatch_req(struct ublk_queue *ubq, struct request *req) +{ + unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS; + int tag = req->tag; + struct ublk_io *io = &ubq->ios[tag]; + + pr_devel("%s: complete: qid %d tag %d io_flags %x addr %llx\n", + __func__, ubq->q_id, req->tag, io->flags, + ublk_get_iod(ubq, req->tag)->addr); + + /* + * Task is exiting if either: + * + * (1) current != io->task. + * io_uring_cmd_complete_in_task() tries to run task_work + * in a workqueue if cmd's task is PF_EXITING. + * + * (2) current->flags & PF_EXITING. + */ + if (unlikely(current != io->task || current->flags & PF_EXITING)) { + __ublk_abort_rq(ubq, req); + return; + } + + if (ublk_need_get_data(ubq) && ublk_need_map_req(req)) { + /* + * We have not handled UBLK_IO_NEED_GET_DATA command yet, + * so immediately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv + * and notify it. + */ + io->flags |= UBLK_IO_FLAG_NEED_GET_DATA; + pr_devel("%s: need get data. qid %d tag %d io_flags %x\n", + __func__, ubq->q_id, req->tag, io->flags); + ublk_complete_io_cmd(io, req, UBLK_IO_RES_NEED_GET_DATA, + issue_flags); + return; + } + + if (!ublk_start_io(ubq, req, io)) + return; + + if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) { + ublk_do_auto_buf_reg(ubq, req, io, io->cmd, issue_flags); + } else { + ublk_init_req_ref(ubq, io); + ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags); + } } -static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd) +static void ublk_cmd_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); struct ublk_queue *ubq = pdu->ubq; - struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds); - struct ublk_rq_data *data; - llist_for_each_entry(data, io_cmds, node) - __ublk_rq_task_work(blk_mq_rq_from_pdu(data)); + ublk_dispatch_req(ubq, pdu->req); } -static void ublk_rq_task_work_fn(struct callback_head *work) +static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq) { - struct ublk_rq_data *data = container_of(work, - struct ublk_rq_data, work); - struct request *req = blk_mq_rq_from_pdu(data); + struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd; + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); - __ublk_rq_task_work(req); + pdu->req = rq; + io_uring_cmd_complete_in_task(cmd, ublk_cmd_tw_cb); } -static void ublk_submit_cmd(struct ublk_queue *ubq, const struct request *rq) +static void ublk_cmd_list_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw) { - struct ublk_io *io = &ubq->ios[rq->tag]; + struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + struct request *rq = pdu->req_list; + struct request *next; - /* - * If the check pass, we know that this is a re-issued request aborted - * previously in monitor_work because the ubq_daemon(cmd's task) is - * PF_EXITING. We cannot call io_uring_cmd_complete_in_task() anymore - * because this ioucmd's io_uring context may be freed now if no inflight - * ioucmd exists. Otherwise we may cause null-deref in ctx->fallback_work. - * - * Note: monitor_work sets UBLK_IO_FLAG_ABORTED and ends this request(releasing - * the tag). Then the request is re-started(allocating the tag) and we are here. - * Since releasing/allocating a tag implies smp_mb(), finding UBLK_IO_FLAG_ABORTED - * guarantees that here is a re-issued request aborted previously. - */ - if (unlikely(io->flags & UBLK_IO_FLAG_ABORTED)) { - struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds); - struct ublk_rq_data *data; + do { + next = rq->rq_next; + rq->rq_next = NULL; + ublk_dispatch_req(rq->mq_hctx->driver_data, rq); + rq = next; + } while (rq); +} - llist_for_each_entry(data, io_cmds, node) - __ublk_abort_rq(ubq, blk_mq_rq_from_pdu(data)); - } else { - struct io_uring_cmd *cmd = io->cmd; - struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); +static void ublk_queue_cmd_list(struct ublk_io *io, struct rq_list *l) +{ + struct io_uring_cmd *cmd = io->cmd; + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); - pdu->ubq = ubq; - io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb); - } + pdu->req_list = rq_list_peek(l); + rq_list_init(l); + io_uring_cmd_complete_in_task(cmd, ublk_cmd_list_tw_cb); } -static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq, - bool last) +static enum blk_eh_timer_return ublk_timeout(struct request *rq) { - struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq); + struct ublk_queue *ubq = rq->mq_hctx->driver_data; + pid_t tgid = ubq->dev->ublksrv_tgid; + struct task_struct *p; + struct pid *pid; - if (ublk_can_use_task_work(ubq)) { - enum task_work_notify_mode notify_mode = last ? - TWA_SIGNAL_NO_IPI : TWA_NONE; + if (!(ubq->flags & UBLK_F_UNPRIVILEGED_DEV)) + return BLK_EH_RESET_TIMER; - if (task_work_add(ubq->ubq_daemon, &data->work, notify_mode)) - __ublk_abort_rq(ubq, rq); - } else { - if (llist_add(&data->node, &ubq->io_cmds)) - ublk_submit_cmd(ubq, rq); - } + if (unlikely(!tgid)) + return BLK_EH_RESET_TIMER; + + rcu_read_lock(); + pid = find_vpid(tgid); + p = pid_task(pid, PIDTYPE_PID); + if (p) + send_sig(SIGKILL, p, 0); + rcu_read_unlock(); + return BLK_EH_DONE; } -static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, - const struct blk_mq_queue_data *bd) +static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq, + bool check_cancel) { - struct ublk_queue *ubq = hctx->driver_data; - struct request *rq = bd->rq; blk_status_t res; - /* fill iod to slot in io cmd buffer */ - res = ublk_setup_iod(ubq, rq); - if (unlikely(res != BLK_STS_OK)) - return BLK_STS_IOERR; + if (unlikely(READ_ONCE(ubq->fail_io))) + return BLK_STS_TARGET; /* With recovery feature enabled, force_abort is set in * ublk_stop_dev() before calling del_gendisk(). We have to @@ -855,27 +1388,81 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, * Note: force_abort is guaranteed to be seen because it is set * before request queue is unqiuesced. */ - if (ublk_queue_can_use_recovery(ubq) && unlikely(ubq->force_abort)) + if (ublk_nosrv_should_queue_io(ubq) && + unlikely(READ_ONCE(ubq->force_abort))) + return BLK_STS_IOERR; + + if (check_cancel && unlikely(ubq->canceling)) + return BLK_STS_IOERR; + + /* fill iod to slot in io cmd buffer */ + res = ublk_setup_iod(ubq, rq); + if (unlikely(res != BLK_STS_OK)) return BLK_STS_IOERR; - blk_mq_start_request(bd->rq); + blk_mq_start_request(rq); + return BLK_STS_OK; +} + +static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct ublk_queue *ubq = hctx->driver_data; + struct request *rq = bd->rq; + blk_status_t res; - if (unlikely(ubq_daemon_is_dying(ubq))) { + res = ublk_prep_req(ubq, rq, false); + if (res != BLK_STS_OK) + return res; + + /* + * ->canceling has to be handled after ->force_abort and ->fail_io + * is dealt with, otherwise this request may not be failed in case + * of recovery, and cause hang when deleting disk + */ + if (unlikely(ubq->canceling)) { __ublk_abort_rq(ubq, rq); return BLK_STS_OK; } - ublk_queue_cmd(ubq, rq, bd->last); - + ublk_queue_cmd(ubq, rq); return BLK_STS_OK; } -static void ublk_commit_rqs(struct blk_mq_hw_ctx *hctx) +static inline bool ublk_belong_to_same_batch(const struct ublk_io *io, + const struct ublk_io *io2) { - struct ublk_queue *ubq = hctx->driver_data; + return (io_uring_cmd_ctx_handle(io->cmd) == + io_uring_cmd_ctx_handle(io2->cmd)) && + (io->task == io2->task); +} + +static void ublk_queue_rqs(struct rq_list *rqlist) +{ + struct rq_list requeue_list = { }; + struct rq_list submit_list = { }; + struct ublk_io *io = NULL; + struct request *req; + + while ((req = rq_list_pop(rqlist))) { + struct ublk_queue *this_q = req->mq_hctx->driver_data; + struct ublk_io *this_io = &this_q->ios[req->tag]; - if (ublk_can_use_task_work(ubq)) - __set_notify_signal(ubq->ubq_daemon); + if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) { + rq_list_add_tail(&requeue_list, req); + continue; + } + + if (io && !ublk_belong_to_same_batch(io, this_io) && + !rq_list_empty(&submit_list)) + ublk_queue_cmd_list(io, &submit_list); + io = this_io; + rq_list_add_tail(&submit_list, req); + } + + if (!rq_list_empty(&submit_list)) + ublk_queue_cmd_list(io, &submit_list); + *rqlist = requeue_list; } static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, @@ -888,22 +1475,44 @@ static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, return 0; } -static int ublk_init_rq(struct blk_mq_tag_set *set, struct request *req, - unsigned int hctx_idx, unsigned int numa_node) -{ - struct ublk_rq_data *data = blk_mq_rq_to_pdu(req); - - init_task_work(&data->work, ublk_rq_task_work_fn); - return 0; -} - static const struct blk_mq_ops ublk_mq_ops = { .queue_rq = ublk_queue_rq, - .commit_rqs = ublk_commit_rqs, + .queue_rqs = ublk_queue_rqs, .init_hctx = ublk_init_hctx, - .init_request = ublk_init_rq, + .timeout = ublk_timeout, }; +static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq) +{ + int i; + + for (i = 0; i < ubq->q_depth; i++) { + struct ublk_io *io = &ubq->ios[i]; + + /* + * UBLK_IO_FLAG_CANCELED is kept for avoiding to touch + * io->cmd + */ + io->flags &= UBLK_IO_FLAG_CANCELED; + io->cmd = NULL; + io->buf.addr = 0; + + /* + * old task is PF_EXITING, put it now + * + * It could be NULL in case of closing one quiesced + * device. + */ + if (io->task) { + put_task_struct(io->task); + io->task = NULL; + } + + WARN_ON_ONCE(refcount_read(&io->ref)); + WARN_ON_ONCE(io->task_registered_buffers); + } +} + static int ublk_ch_open(struct inode *inode, struct file *filp) { struct ublk_device *ub = container_of(inode->i_cdev, @@ -912,14 +1521,209 @@ static int ublk_ch_open(struct inode *inode, struct file *filp) if (test_and_set_bit(UB_STATE_OPEN, &ub->state)) return -EBUSY; filp->private_data = ub; + ub->ublksrv_tgid = current->tgid; return 0; } +static void ublk_reset_ch_dev(struct ublk_device *ub) +{ + int i; + + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) + ublk_queue_reinit(ub, ublk_get_queue(ub, i)); + + /* set to NULL, otherwise new tasks cannot mmap io_cmd_buf */ + ub->mm = NULL; + ub->nr_io_ready = 0; + ub->unprivileged_daemons = false; + ub->ublksrv_tgid = -1; +} + +static struct gendisk *ublk_get_disk(struct ublk_device *ub) +{ + struct gendisk *disk; + + spin_lock(&ub->lock); + disk = ub->ub_disk; + if (disk) + get_device(disk_to_dev(disk)); + spin_unlock(&ub->lock); + + return disk; +} + +static void ublk_put_disk(struct gendisk *disk) +{ + if (disk) + put_device(disk_to_dev(disk)); +} + +/* + * Use this function to ensure that ->canceling is consistently set for + * the device and all queues. Do not set these flags directly. + * + * Caller must ensure that: + * - cancel_mutex is held. This ensures that there is no concurrent + * access to ub->canceling and no concurrent writes to ubq->canceling. + * - there are no concurrent reads of ubq->canceling from the queue_rq + * path. This can be done by quiescing the queue, or through other + * means. + */ +static void ublk_set_canceling(struct ublk_device *ub, bool canceling) + __must_hold(&ub->cancel_mutex) +{ + int i; + + ub->canceling = canceling; + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) + ublk_get_queue(ub, i)->canceling = canceling; +} + +static bool ublk_check_and_reset_active_ref(struct ublk_device *ub) +{ + int i, j; + + if (!(ub->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY | + UBLK_F_AUTO_BUF_REG))) + return false; + + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { + struct ublk_queue *ubq = ublk_get_queue(ub, i); + + for (j = 0; j < ubq->q_depth; j++) { + struct ublk_io *io = &ubq->ios[j]; + unsigned int refs = refcount_read(&io->ref) + + io->task_registered_buffers; + + /* + * UBLK_REFCOUNT_INIT or zero means no active + * reference + */ + if (refs != UBLK_REFCOUNT_INIT && refs != 0) + return true; + + /* reset to zero if the io hasn't active references */ + refcount_set(&io->ref, 0); + io->task_registered_buffers = 0; + } + } + return false; +} + +static void ublk_ch_release_work_fn(struct work_struct *work) +{ + struct ublk_device *ub = + container_of(work, struct ublk_device, exit_work.work); + struct gendisk *disk; + int i; + + /* + * For zero-copy and auto buffer register modes, I/O references + * might not be dropped naturally when the daemon is killed, but + * io_uring guarantees that registered bvec kernel buffers are + * unregistered finally when freeing io_uring context, then the + * active references are dropped. + * + * Wait until active references are dropped for avoiding use-after-free + * + * registered buffer may be unregistered in io_ring's release hander, + * so have to wait by scheduling work function for avoiding the two + * file release dependency. + */ + if (ublk_check_and_reset_active_ref(ub)) { + schedule_delayed_work(&ub->exit_work, 1); + return; + } + + /* + * disk isn't attached yet, either device isn't live, or it has + * been removed already, so we needn't to do anything + */ + disk = ublk_get_disk(ub); + if (!disk) + goto out; + + /* + * All uring_cmd are done now, so abort any request outstanding to + * the ublk server + * + * This can be done in lockless way because ublk server has been + * gone + * + * More importantly, we have to provide forward progress guarantee + * without holding ub->mutex, otherwise control task grabbing + * ub->mutex triggers deadlock + * + * All requests may be inflight, so ->canceling may not be set, set + * it now. + */ + mutex_lock(&ub->cancel_mutex); + ublk_set_canceling(ub, true); + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) + ublk_abort_queue(ub, ublk_get_queue(ub, i)); + mutex_unlock(&ub->cancel_mutex); + blk_mq_kick_requeue_list(disk->queue); + + /* + * All infligh requests have been completed or requeued and any new + * request will be failed or requeued via `->canceling` now, so it is + * fine to grab ub->mutex now. + */ + mutex_lock(&ub->mutex); + + /* double check after grabbing lock */ + if (!ub->ub_disk) + goto unlock; + + /* + * Transition the device to the nosrv state. What exactly this + * means depends on the recovery flags + */ + if (ublk_nosrv_should_stop_dev(ub)) { + /* + * Allow any pending/future I/O to pass through quickly + * with an error. This is needed because del_gendisk + * waits for all pending I/O to complete + */ + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) + WRITE_ONCE(ublk_get_queue(ub, i)->force_abort, true); + + ublk_stop_dev_unlocked(ub); + } else { + if (ublk_nosrv_dev_should_queue_io(ub)) { + /* ->canceling is set and all requests are aborted */ + ub->dev_info.state = UBLK_S_DEV_QUIESCED; + } else { + ub->dev_info.state = UBLK_S_DEV_FAIL_IO; + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) + WRITE_ONCE(ublk_get_queue(ub, i)->fail_io, true); + } + } +unlock: + mutex_unlock(&ub->mutex); + ublk_put_disk(disk); + + /* all uring_cmd has been done now, reset device & ubq */ + ublk_reset_ch_dev(ub); +out: + clear_bit(UB_STATE_OPEN, &ub->state); + + /* put the reference grabbed in ublk_ch_release() */ + ublk_put_device(ub); +} + static int ublk_ch_release(struct inode *inode, struct file *filp) { struct ublk_device *ub = filp->private_data; - clear_bit(UB_STATE_OPEN, &ub->state); + /* + * Grab ublk device reference, so it won't be gone until we are + * really released from work function. + */ + ublk_get_device(ub); + + INIT_DELAYED_WORK(&ub->exit_work, ublk_ch_release_work_fn); + schedule_delayed_work(&ub->exit_work, 0); return 0; } @@ -928,16 +1732,16 @@ static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma) { struct ublk_device *ub = filp->private_data; size_t sz = vma->vm_end - vma->vm_start; - unsigned max_sz = UBLK_MAX_QUEUE_DEPTH * sizeof(struct ublksrv_io_desc); + unsigned max_sz = ublk_max_cmd_buf_size(); unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT; int q_id, ret = 0; - spin_lock(&ub->mm_lock); + spin_lock(&ub->lock); if (!ub->mm) ub->mm = current->mm; if (current->mm != ub->mm) ret = -EINVAL; - spin_unlock(&ub->mm_lock); + spin_unlock(&ub->lock); if (ret) return ret; @@ -954,115 +1758,160 @@ static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma) __func__, q_id, current->pid, vma->vm_start, phys_off, (unsigned long)sz); - if (sz != ublk_queue_cmd_buf_size(ub, q_id)) + if (sz != ublk_queue_cmd_buf_size(ub)) return -EINVAL; pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT; return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); } -static void ublk_commit_completion(struct ublk_device *ub, - struct ublksrv_io_cmd *ub_cmd) +static void __ublk_fail_req(struct ublk_device *ub, struct ublk_io *io, + struct request *req) { - u32 qid = ub_cmd->q_id, tag = ub_cmd->tag; - struct ublk_queue *ubq = ublk_get_queue(ub, qid); - struct ublk_io *io = &ubq->ios[tag]; - struct request *req; - - /* now this cmd slot is owned by nbd driver */ - io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV; - io->res = ub_cmd->result; - - /* find the io request and complete */ - req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag); + WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE); - if (req && likely(!blk_should_fake_timeout(req->q))) - ublk_complete_rq(req); + if (ublk_nosrv_should_reissue_outstanding(ub)) + blk_mq_requeue_request(req, false); + else { + io->res = -EIO; + __ublk_complete_rq(req, io, ublk_dev_need_map_io(ub)); + } } /* - * When ->ubq_daemon is exiting, either new request is ended immediately, - * or any queued io command is drained, so it is safe to abort queue - * lockless + * Called from ublk char device release handler, when any uring_cmd is + * done, meantime request queue is "quiesced" since all inflight requests + * can't be completed because ublk server is dead. + * + * So no one can hold our request IO reference any more, simply ignore the + * reference, and complete the request immediately */ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq) { int i; - if (!ublk_get_device(ub)) - return; - for (i = 0; i < ubq->q_depth; i++) { struct ublk_io *io = &ubq->ios[i]; - if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) { - struct request *rq; - - /* - * Either we fail the request or ublk_rq_task_work_fn - * will do it - */ - rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i); - if (rq) - __ublk_fail_req(ubq, io, rq); - } + if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV) + __ublk_fail_req(ub, io, io->req); } - ublk_put_device(ub); } -static void ublk_daemon_monitor_work(struct work_struct *work) +static void ublk_start_cancel(struct ublk_device *ub) { - struct ublk_device *ub = - container_of(work, struct ublk_device, monitor_work.work); - int i; - - for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { - struct ublk_queue *ubq = ublk_get_queue(ub, i); + struct gendisk *disk = ublk_get_disk(ub); - if (ubq_daemon_is_dying(ubq)) { - if (ublk_queue_can_use_recovery(ubq)) - schedule_work(&ub->quiesce_work); - else - schedule_work(&ub->stop_work); - - /* abort queue is for making forward progress */ - ublk_abort_queue(ub, ubq); - } - } + /* Our disk has been dead */ + if (!disk) + return; + mutex_lock(&ub->cancel_mutex); + if (ub->canceling) + goto out; /* - * We can't schedule monitor work after ub's state is not UBLK_S_DEV_LIVE. - * after ublk_remove() or __ublk_quiesce_dev() is started. + * Now we are serialized with ublk_queue_rq() * - * No need ub->mutex, monitor work are canceled after state is marked - * as not LIVE, so new state is observed reliably. + * Make sure that ubq->canceling is set when queue is frozen, + * because ublk_queue_rq() has to rely on this flag for avoiding to + * touch completed uring_cmd */ - if (ub->dev_info.state == UBLK_S_DEV_LIVE) - schedule_delayed_work(&ub->monitor_work, - UBLK_DAEMON_MONITOR_PERIOD); + blk_mq_quiesce_queue(disk->queue); + ublk_set_canceling(ub, true); + blk_mq_unquiesce_queue(disk->queue); +out: + mutex_unlock(&ub->cancel_mutex); + ublk_put_disk(disk); } -static inline bool ublk_queue_ready(struct ublk_queue *ubq) +static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag, + unsigned int issue_flags) { - return ubq->nr_io_ready == ubq->q_depth; + struct ublk_io *io = &ubq->ios[tag]; + struct ublk_device *ub = ubq->dev; + struct request *req; + bool done; + + if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) + return; + + /* + * Don't try to cancel this command if the request is started for + * avoiding race between io_uring_cmd_done() and + * io_uring_cmd_complete_in_task(). + * + * Either the started request will be aborted via __ublk_abort_rq(), + * then this uring_cmd is canceled next time, or it will be done in + * task work function ublk_dispatch_req() because io_uring guarantees + * that ublk_dispatch_req() is always called + */ + req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag); + if (req && blk_mq_request_started(req) && req->tag == tag) + return; + + spin_lock(&ubq->cancel_lock); + done = !!(io->flags & UBLK_IO_FLAG_CANCELED); + if (!done) + io->flags |= UBLK_IO_FLAG_CANCELED; + spin_unlock(&ubq->cancel_lock); + + if (!done) + io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, issue_flags); } -static void ublk_cancel_queue(struct ublk_queue *ubq) +/* + * The ublk char device won't be closed when calling cancel fn, so both + * ublk device and queue are guaranteed to be live + * + * Two-stage cancel: + * + * - make every active uring_cmd done in ->cancel_fn() + * + * - aborting inflight ublk IO requests in ublk char device release handler, + * which depends on 1st stage because device can only be closed iff all + * uring_cmd are done + * + * Do _not_ try to acquire ub->mutex before all inflight requests are + * aborted, otherwise deadlock may be caused. + */ +static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd, + unsigned int issue_flags) { - int i; + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + struct ublk_queue *ubq = pdu->ubq; + struct task_struct *task; + struct ublk_io *io; - if (!ublk_queue_ready(ubq)) + if (WARN_ON_ONCE(!ubq)) return; - for (i = 0; i < ubq->q_depth; i++) { - struct ublk_io *io = &ubq->ios[i]; + if (WARN_ON_ONCE(pdu->tag >= ubq->q_depth)) + return; - if (io->flags & UBLK_IO_FLAG_ACTIVE) - io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0); - } + task = io_uring_cmd_get_task(cmd); + io = &ubq->ios[pdu->tag]; + if (WARN_ON_ONCE(task && task != io->task)) + return; + + ublk_start_cancel(ubq->dev); + + WARN_ON_ONCE(io->cmd != cmd); + ublk_cancel_cmd(ubq, pdu->tag, issue_flags); +} + +static inline bool ublk_dev_ready(const struct ublk_device *ub) +{ + u32 total = (u32)ub->dev_info.nr_hw_queues * ub->dev_info.queue_depth; + + return ub->nr_io_ready == total; +} + +static void ublk_cancel_queue(struct ublk_queue *ubq) +{ + int i; - /* all io commands are canceled */ - ubq->nr_io_ready = 0; + for (i = 0; i < ubq->q_depth; i++) + ublk_cancel_cmd(ubq, i, IO_URING_F_UNLOCKED); } /* Cancel all pending commands, must be called after del_gendisk() returns */ @@ -1100,140 +1949,450 @@ static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub) } } -static void __ublk_quiesce_dev(struct ublk_device *ub) +static void ublk_force_abort_dev(struct ublk_device *ub) { - pr_devel("%s: quiesce ub: dev_id %d state %s\n", + int i; + + pr_devel("%s: force abort ub: dev_id %d state %s\n", __func__, ub->dev_info.dev_id, ub->dev_info.state == UBLK_S_DEV_LIVE ? "LIVE" : "QUIESCED"); blk_mq_quiesce_queue(ub->ub_disk->queue); - ublk_wait_tagset_rqs_idle(ub); - ub->dev_info.state = UBLK_S_DEV_QUIESCED; - ublk_cancel_dev(ub); - /* we are going to release task_struct of ubq_daemon and resets - * ->ubq_daemon to NULL. So in monitor_work, check on ubq_daemon causes UAF. - * Besides, monitor_work is not necessary in QUIESCED state since we have - * already scheduled quiesce_work and quiesced all ubqs. - * - * Do not let monitor_work schedule itself if state it QUIESCED. And we cancel - * it here and re-schedule it in END_USER_RECOVERY to avoid UAF. - */ - cancel_delayed_work_sync(&ub->monitor_work); + if (ub->dev_info.state == UBLK_S_DEV_LIVE) + ublk_wait_tagset_rqs_idle(ub); + + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) + ublk_get_queue(ub, i)->force_abort = true; + blk_mq_unquiesce_queue(ub->ub_disk->queue); + /* We may have requeued some rqs in ublk_quiesce_queue() */ + blk_mq_kick_requeue_list(ub->ub_disk->queue); } -static void ublk_quiesce_work_fn(struct work_struct *work) +static struct gendisk *ublk_detach_disk(struct ublk_device *ub) { - struct ublk_device *ub = - container_of(work, struct ublk_device, quiesce_work); + struct gendisk *disk; - mutex_lock(&ub->mutex); - if (ub->dev_info.state != UBLK_S_DEV_LIVE) - goto unlock; - __ublk_quiesce_dev(ub); - unlock: - mutex_unlock(&ub->mutex); + /* Sync with ublk_abort_queue() by holding the lock */ + spin_lock(&ub->lock); + disk = ub->ub_disk; + ub->dev_info.state = UBLK_S_DEV_DEAD; + ub->dev_info.ublksrv_pid = -1; + ub->ub_disk = NULL; + spin_unlock(&ub->lock); + + return disk; } -static void ublk_unquiesce_dev(struct ublk_device *ub) +static void ublk_stop_dev_unlocked(struct ublk_device *ub) + __must_hold(&ub->mutex) { - int i; + struct gendisk *disk; - pr_devel("%s: unquiesce ub: dev_id %d state %s\n", - __func__, ub->dev_info.dev_id, - ub->dev_info.state == UBLK_S_DEV_LIVE ? - "LIVE" : "QUIESCED"); - /* quiesce_work has run. We let requeued rqs be aborted - * before running fallback_wq. "force_abort" must be seen - * after request queue is unqiuesced. Then del_gendisk() - * can move on. - */ - for (i = 0; i < ub->dev_info.nr_hw_queues; i++) - ublk_get_queue(ub, i)->force_abort = true; + if (ub->dev_info.state == UBLK_S_DEV_DEAD) + return; - blk_mq_unquiesce_queue(ub->ub_disk->queue); - /* We may have requeued some rqs in ublk_quiesce_queue() */ - blk_mq_kick_requeue_list(ub->ub_disk->queue); + if (ublk_nosrv_dev_should_queue_io(ub)) + ublk_force_abort_dev(ub); + del_gendisk(ub->ub_disk); + disk = ublk_detach_disk(ub); + put_disk(disk); } static void ublk_stop_dev(struct ublk_device *ub) { mutex_lock(&ub->mutex); - if (ub->dev_info.state == UBLK_S_DEV_DEAD) - goto unlock; - if (ublk_can_use_recovery(ub)) { - if (ub->dev_info.state == UBLK_S_DEV_LIVE) - __ublk_quiesce_dev(ub); - ublk_unquiesce_dev(ub); - } - del_gendisk(ub->ub_disk); - ub->dev_info.state = UBLK_S_DEV_DEAD; - ub->dev_info.ublksrv_pid = -1; - put_disk(ub->ub_disk); - ub->ub_disk = NULL; - unlock: - ublk_cancel_dev(ub); + ublk_stop_dev_unlocked(ub); mutex_unlock(&ub->mutex); - cancel_delayed_work_sync(&ub->monitor_work); + ublk_cancel_dev(ub); } -/* device can only be started after all IOs are ready */ -static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq) +/* reset ublk io_uring queue & io flags */ +static void ublk_reset_io_flags(struct ublk_device *ub) { - mutex_lock(&ub->mutex); - ubq->nr_io_ready++; - if (ublk_queue_ready(ubq)) { - ubq->ubq_daemon = current; - get_task_struct(ubq->ubq_daemon); - ub->nr_queues_ready++; + int i, j; + + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { + struct ublk_queue *ubq = ublk_get_queue(ub, i); + + /* UBLK_IO_FLAG_CANCELED can be cleared now */ + spin_lock(&ubq->cancel_lock); + for (j = 0; j < ubq->q_depth; j++) + ubq->ios[j].flags &= ~UBLK_IO_FLAG_CANCELED; + spin_unlock(&ubq->cancel_lock); + ubq->fail_io = false; } - if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues) + mutex_lock(&ub->cancel_mutex); + ublk_set_canceling(ub, false); + mutex_unlock(&ub->cancel_mutex); +} + +/* device can only be started after all IOs are ready */ +static void ublk_mark_io_ready(struct ublk_device *ub) + __must_hold(&ub->mutex) +{ + if (!ub->unprivileged_daemons && !capable(CAP_SYS_ADMIN)) + ub->unprivileged_daemons = true; + + ub->nr_io_ready++; + if (ublk_dev_ready(ub)) { + /* now we are ready for handling ublk io request */ + ublk_reset_io_flags(ub); complete_all(&ub->completion); + } +} + +static inline int ublk_check_cmd_op(u32 cmd_op) +{ + u32 ioc_type = _IOC_TYPE(cmd_op); + + if (!IS_ENABLED(CONFIG_BLKDEV_UBLK_LEGACY_OPCODES) && ioc_type != 'u') + return -EOPNOTSUPP; + + if (ioc_type != 'u' && ioc_type != 0) + return -EOPNOTSUPP; + + return 0; +} + +static inline int ublk_set_auto_buf_reg(struct ublk_io *io, struct io_uring_cmd *cmd) +{ + struct ublk_auto_buf_reg buf; + + buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr)); + + if (buf.reserved0 || buf.reserved1) + return -EINVAL; + + if (buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK) + return -EINVAL; + io->buf.auto_reg = buf; + return 0; +} + +static int ublk_handle_auto_buf_reg(struct ublk_io *io, + struct io_uring_cmd *cmd, + u16 *buf_idx) +{ + if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG) { + io->flags &= ~UBLK_IO_FLAG_AUTO_BUF_REG; + + /* + * `UBLK_F_AUTO_BUF_REG` only works iff `UBLK_IO_FETCH_REQ` + * and `UBLK_IO_COMMIT_AND_FETCH_REQ` are issued from same + * `io_ring_ctx`. + * + * If this uring_cmd's io_ring_ctx isn't same with the + * one for registering the buffer, it is ublk server's + * responsibility for unregistering the buffer, otherwise + * this ublk request gets stuck. + */ + if (io->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd)) + *buf_idx = io->buf.auto_reg.index; + } + + return ublk_set_auto_buf_reg(io, cmd); +} + +/* Once we return, `io->req` can't be used any more */ +static inline struct request * +ublk_fill_io_cmd(struct ublk_io *io, struct io_uring_cmd *cmd) +{ + struct request *req = io->req; + + io->cmd = cmd; + io->flags |= UBLK_IO_FLAG_ACTIVE; + /* now this cmd slot is owned by ublk driver */ + io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV; + + return req; +} + +static inline int +ublk_config_io_buf(const struct ublk_device *ub, struct ublk_io *io, + struct io_uring_cmd *cmd, unsigned long buf_addr, + u16 *buf_idx) +{ + if (ublk_dev_support_auto_buf_reg(ub)) + return ublk_handle_auto_buf_reg(io, cmd, buf_idx); + + io->buf.addr = buf_addr; + return 0; +} + +static inline void ublk_prep_cancel(struct io_uring_cmd *cmd, + unsigned int issue_flags, + struct ublk_queue *ubq, unsigned int tag) +{ + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + + /* + * Safe to refer to @ubq since ublk_queue won't be died until its + * commands are completed + */ + pdu->ubq = ubq; + pdu->tag = tag; + io_uring_cmd_mark_cancelable(cmd, issue_flags); +} + +static void ublk_io_release(void *priv) +{ + struct request *rq = priv; + struct ublk_queue *ubq = rq->mq_hctx->driver_data; + struct ublk_io *io = &ubq->ios[rq->tag]; + + /* + * task_registered_buffers may be 0 if buffers were registered off task + * but unregistered on task. Or after UBLK_IO_COMMIT_AND_FETCH_REQ. + */ + if (current == io->task && io->task_registered_buffers) + io->task_registered_buffers--; + else + ublk_put_req_ref(io, rq); +} + +static int ublk_register_io_buf(struct io_uring_cmd *cmd, + struct ublk_device *ub, + u16 q_id, u16 tag, + struct ublk_io *io, + unsigned int index, unsigned int issue_flags) +{ + struct request *req; + int ret; + + if (!ublk_dev_support_zero_copy(ub)) + return -EINVAL; + + req = __ublk_check_and_get_req(ub, q_id, tag, io, 0); + if (!req) + return -EINVAL; + + ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index, + issue_flags); + if (ret) { + ublk_put_req_ref(io, req); + return ret; + } + + return 0; +} + +static int +ublk_daemon_register_io_buf(struct io_uring_cmd *cmd, + struct ublk_device *ub, + u16 q_id, u16 tag, struct ublk_io *io, + unsigned index, unsigned issue_flags) +{ + unsigned new_registered_buffers; + struct request *req = io->req; + int ret; + + /* + * Ensure there are still references for ublk_sub_req_ref() to release. + * If not, fall back on the thread-safe buffer registration. + */ + new_registered_buffers = io->task_registered_buffers + 1; + if (unlikely(new_registered_buffers >= UBLK_REFCOUNT_INIT)) + return ublk_register_io_buf(cmd, ub, q_id, tag, io, index, + issue_flags); + + if (!ublk_dev_support_zero_copy(ub) || !ublk_rq_has_data(req)) + return -EINVAL; + + ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index, + issue_flags); + if (ret) + return ret; + + io->task_registered_buffers = new_registered_buffers; + return 0; +} + +static int ublk_unregister_io_buf(struct io_uring_cmd *cmd, + const struct ublk_device *ub, + unsigned int index, unsigned int issue_flags) +{ + if (!(ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY)) + return -EINVAL; + + return io_buffer_unregister_bvec(cmd, index, issue_flags); +} + +static int ublk_check_fetch_buf(const struct ublk_device *ub, __u64 buf_addr) +{ + if (ublk_dev_need_map_io(ub)) { + /* + * FETCH_RQ has to provide IO buffer if NEED GET + * DATA is not enabled + */ + if (!buf_addr && !ublk_dev_need_get_data(ub)) + return -EINVAL; + } else if (buf_addr) { + /* User copy requires addr to be unset */ + return -EINVAL; + } + return 0; +} + +static int __ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub, + struct ublk_io *io) +{ + /* UBLK_IO_FETCH_REQ is only allowed before dev is setup */ + if (ublk_dev_ready(ub)) + return -EBUSY; + + /* allow each command to be FETCHed at most once */ + if (io->flags & UBLK_IO_FLAG_ACTIVE) + return -EINVAL; + + WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV); + + ublk_fill_io_cmd(io, cmd); + + WRITE_ONCE(io->task, get_task_struct(current)); + ublk_mark_io_ready(ub); + + return 0; +} + +static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub, + struct ublk_io *io, __u64 buf_addr) +{ + int ret; + + /* + * When handling FETCH command for setting up ublk uring queue, + * ub->mutex is the innermost lock, and we won't block for handling + * FETCH, so it is fine even for IO_URING_F_NONBLOCK. + */ + mutex_lock(&ub->mutex); + ret = __ublk_fetch(cmd, ub, io); + if (!ret) + ret = ublk_config_io_buf(ub, io, cmd, buf_addr, NULL); mutex_unlock(&ub->mutex); + return ret; } -static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id, - int tag) +static int ublk_check_commit_and_fetch(const struct ublk_device *ub, + struct ublk_io *io, __u64 buf_addr) { - struct ublk_queue *ubq = ublk_get_queue(ub, q_id); - struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag); + struct request *req = io->req; - ublk_queue_cmd(ubq, req, true); + if (ublk_dev_need_map_io(ub)) { + /* + * COMMIT_AND_FETCH_REQ has to provide IO buffer if + * NEED GET DATA is not enabled or it is Read IO. + */ + if (!buf_addr && (!ublk_dev_need_get_data(ub) || + req_op(req) == REQ_OP_READ)) + return -EINVAL; + } else if (req_op(req) != REQ_OP_ZONE_APPEND && buf_addr) { + /* + * User copy requires addr to be unset when command is + * not zone append + */ + return -EINVAL; + } + + return 0; } -static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) +static bool ublk_need_complete_req(const struct ublk_device *ub, + struct ublk_io *io) +{ + if (ublk_dev_need_req_ref(ub)) + return ublk_sub_req_ref(io); + return true; +} + +static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io, + struct request *req) +{ + /* + * We have handled UBLK_IO_NEED_GET_DATA command, + * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just + * do the copy work. + */ + io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA; + /* update iod->addr because ublksrv may have passed a new io buffer */ + ublk_get_iod(ubq, req->tag)->addr = io->buf.addr; + pr_devel("%s: update iod->addr: qid %d tag %d io_flags %x addr %llx\n", + __func__, ubq->q_id, req->tag, io->flags, + ublk_get_iod(ubq, req->tag)->addr); + + return ublk_start_io(ubq, req, io); +} + +static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, + unsigned int issue_flags) { - struct ublksrv_io_cmd *ub_cmd = (struct ublksrv_io_cmd *)cmd->cmd; + /* May point to userspace-mapped memory */ + const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe); + u16 buf_idx = UBLK_INVALID_BUF_IDX; struct ublk_device *ub = cmd->file->private_data; struct ublk_queue *ubq; - struct ublk_io *io; + struct ublk_io *io = NULL; u32 cmd_op = cmd->cmd_op; - unsigned tag = ub_cmd->tag; - int ret = -EINVAL; + u16 q_id = READ_ONCE(ub_src->q_id); + u16 tag = READ_ONCE(ub_src->tag); + s32 result = READ_ONCE(ub_src->result); + u64 addr = READ_ONCE(ub_src->addr); /* unioned with zone_append_lba */ + struct request *req; + int ret; + bool compl; + + WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED); pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n", - __func__, cmd->cmd_op, ub_cmd->q_id, tag, - ub_cmd->result); + __func__, cmd->cmd_op, q_id, tag, result); - if (!(issue_flags & IO_URING_F_SQE128)) + ret = ublk_check_cmd_op(cmd_op); + if (ret) goto out; - if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues) - goto out; + /* + * io_buffer_unregister_bvec() doesn't access the ubq or io, + * so no need to validate the q_id, tag, or task + */ + if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF) + return ublk_unregister_io_buf(cmd, ub, addr, issue_flags); - ubq = ublk_get_queue(ub, ub_cmd->q_id); - if (!ubq || ub_cmd->q_id != ubq->q_id) + ret = -EINVAL; + if (q_id >= ub->dev_info.nr_hw_queues) goto out; - if (ubq->ubq_daemon && ubq->ubq_daemon != current) - goto out; + ubq = ublk_get_queue(ub, q_id); - if (tag >= ubq->q_depth) + if (tag >= ub->dev_info.queue_depth) goto out; io = &ubq->ios[tag]; + /* UBLK_IO_FETCH_REQ can be handled on any task, which sets io->task */ + if (unlikely(_IOC_NR(cmd_op) == UBLK_IO_FETCH_REQ)) { + ret = ublk_check_fetch_buf(ub, addr); + if (ret) + goto out; + ret = ublk_fetch(cmd, ub, io, addr); + if (ret) + goto out; + + ublk_prep_cancel(cmd, issue_flags, ubq, tag); + return -EIOCBQUEUED; + } + + if (READ_ONCE(io->task) != current) { + /* + * ublk_register_io_buf() accesses only the io's refcount, + * so can be handled on any task + */ + if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF) + return ublk_register_io_buf(cmd, ub, q_id, tag, io, + addr, issue_flags); + + goto out; + } /* there is pending io cmd, something must be wrong */ - if (io->flags & UBLK_IO_FLAG_ACTIVE) { + if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) { ret = -EBUSY; goto out; } @@ -1243,130 +2402,312 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) * iff the driver have set the UBLK_IO_FLAG_NEED_GET_DATA. */ if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA)) - ^ (cmd_op == UBLK_IO_NEED_GET_DATA)) + ^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA)) goto out; - switch (cmd_op) { - case UBLK_IO_FETCH_REQ: - /* UBLK_IO_FETCH_REQ is only allowed before queue is setup */ - if (ublk_queue_ready(ubq)) { - ret = -EBUSY; - goto out; - } - /* - * The io is being handled by server, so COMMIT_RQ is expected - * instead of FETCH_REQ - */ - if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV) - goto out; - /* FETCH_RQ has to provide IO buffer */ - if (!ub_cmd->addr) - goto out; - io->cmd = cmd; - io->flags |= UBLK_IO_FLAG_ACTIVE; - io->addr = ub_cmd->addr; - - ublk_mark_io_ready(ub, ubq); - break; + switch (_IOC_NR(cmd_op)) { + case UBLK_IO_REGISTER_IO_BUF: + return ublk_daemon_register_io_buf(cmd, ub, q_id, tag, io, addr, + issue_flags); case UBLK_IO_COMMIT_AND_FETCH_REQ: - /* FETCH_RQ has to provide IO buffer */ - if (!ub_cmd->addr) + ret = ublk_check_commit_and_fetch(ub, io, addr); + if (ret) goto out; - if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) + io->res = result; + req = ublk_fill_io_cmd(io, cmd); + ret = ublk_config_io_buf(ub, io, cmd, addr, &buf_idx); + compl = ublk_need_complete_req(ub, io); + + /* can't touch 'ublk_io' any more */ + if (buf_idx != UBLK_INVALID_BUF_IDX) + io_buffer_unregister_bvec(cmd, buf_idx, issue_flags); + if (req_op(req) == REQ_OP_ZONE_APPEND) + req->__sector = addr; + if (compl) + __ublk_complete_rq(req, io, ublk_dev_need_map_io(ub)); + + if (ret) goto out; - io->addr = ub_cmd->addr; - io->flags |= UBLK_IO_FLAG_ACTIVE; - io->cmd = cmd; - ublk_commit_completion(ub, ub_cmd); break; case UBLK_IO_NEED_GET_DATA: - if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) - goto out; - io->addr = ub_cmd->addr; - io->cmd = cmd; - io->flags |= UBLK_IO_FLAG_ACTIVE; - ublk_handle_need_get_data(ub, ub_cmd->q_id, ub_cmd->tag); + /* + * ublk_get_data() may fail and fallback to requeue, so keep + * uring_cmd active first and prepare for handling new requeued + * request + */ + req = ublk_fill_io_cmd(io, cmd); + ret = ublk_config_io_buf(ub, io, cmd, addr, NULL); + WARN_ON_ONCE(ret); + if (likely(ublk_get_data(ubq, io, req))) { + __ublk_prep_compl_io_cmd(io, req); + return UBLK_IO_RES_OK; + } break; default: goto out; } + ublk_prep_cancel(cmd, issue_flags, ubq, tag); return -EIOCBQUEUED; out: - io_uring_cmd_done(cmd, ret, 0); pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n", - __func__, cmd_op, tag, ret, io->flags); - return -EIOCBQUEUED; + __func__, cmd_op, tag, ret, io ? io->flags : 0); + return ret; +} + +static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, + u16 q_id, u16 tag, struct ublk_io *io, size_t offset) +{ + struct request *req; + + /* + * can't use io->req in case of concurrent UBLK_IO_COMMIT_AND_FETCH_REQ, + * which would overwrite it with io->cmd + */ + req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag); + if (!req) + return NULL; + + if (!ublk_get_req_ref(io)) + return NULL; + + if (unlikely(!blk_mq_request_started(req) || req->tag != tag)) + goto fail_put; + + if (!ublk_rq_has_data(req)) + goto fail_put; + + if (offset > blk_rq_bytes(req)) + goto fail_put; + + return req; +fail_put: + ublk_put_req_ref(io, req); + return NULL; +} + +static void ublk_ch_uring_cmd_cb(struct io_tw_req tw_req, io_tw_token_t tw) +{ + unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS; + struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); + int ret = ublk_ch_uring_cmd_local(cmd, issue_flags); + + if (ret != -EIOCBQUEUED) + io_uring_cmd_done(cmd, ret, issue_flags); +} + +static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + if (unlikely(issue_flags & IO_URING_F_CANCEL)) { + ublk_uring_cmd_cancel_fn(cmd, issue_flags); + return 0; + } + + /* well-implemented server won't run into unlocked */ + if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) { + io_uring_cmd_complete_in_task(cmd, ublk_ch_uring_cmd_cb); + return -EIOCBQUEUED; + } + + return ublk_ch_uring_cmd_local(cmd, issue_flags); +} + +static inline bool ublk_check_ubuf_dir(const struct request *req, + int ubuf_dir) +{ + /* copy ubuf to request pages */ + if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) && + ubuf_dir == ITER_SOURCE) + return true; + + /* copy request pages to ubuf */ + if ((req_op(req) == REQ_OP_WRITE || + req_op(req) == REQ_OP_ZONE_APPEND) && + ubuf_dir == ITER_DEST) + return true; + + return false; +} + +static struct request *ublk_check_and_get_req(struct kiocb *iocb, + struct iov_iter *iter, size_t *off, int dir, + struct ublk_io **io) +{ + struct ublk_device *ub = iocb->ki_filp->private_data; + struct ublk_queue *ubq; + struct request *req; + size_t buf_off; + u16 tag, q_id; + + if (!user_backed_iter(iter)) + return ERR_PTR(-EACCES); + + if (ub->dev_info.state == UBLK_S_DEV_DEAD) + return ERR_PTR(-EACCES); + + tag = ublk_pos_to_tag(iocb->ki_pos); + q_id = ublk_pos_to_hwq(iocb->ki_pos); + buf_off = ublk_pos_to_buf_off(iocb->ki_pos); + + if (q_id >= ub->dev_info.nr_hw_queues) + return ERR_PTR(-EINVAL); + + ubq = ublk_get_queue(ub, q_id); + if (!ublk_dev_support_user_copy(ub)) + return ERR_PTR(-EACCES); + + if (tag >= ub->dev_info.queue_depth) + return ERR_PTR(-EINVAL); + + *io = &ubq->ios[tag]; + req = __ublk_check_and_get_req(ub, q_id, tag, *io, buf_off); + if (!req) + return ERR_PTR(-EINVAL); + + if (!ublk_check_ubuf_dir(req, dir)) + goto fail; + + *off = buf_off; + return req; +fail: + ublk_put_req_ref(*io, req); + return ERR_PTR(-EACCES); +} + +static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct request *req; + struct ublk_io *io; + size_t buf_off; + size_t ret; + + req = ublk_check_and_get_req(iocb, to, &buf_off, ITER_DEST, &io); + if (IS_ERR(req)) + return PTR_ERR(req); + + ret = ublk_copy_user_pages(req, buf_off, to, ITER_DEST); + ublk_put_req_ref(io, req); + + return ret; +} + +static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct request *req; + struct ublk_io *io; + size_t buf_off; + size_t ret; + + req = ublk_check_and_get_req(iocb, from, &buf_off, ITER_SOURCE, &io); + if (IS_ERR(req)) + return PTR_ERR(req); + + ret = ublk_copy_user_pages(req, buf_off, from, ITER_SOURCE); + ublk_put_req_ref(io, req); + + return ret; } static const struct file_operations ublk_ch_fops = { .owner = THIS_MODULE, .open = ublk_ch_open, .release = ublk_ch_release, - .llseek = no_llseek, + .read_iter = ublk_ch_read_iter, + .write_iter = ublk_ch_write_iter, .uring_cmd = ublk_ch_uring_cmd, .mmap = ublk_ch_mmap, }; static void ublk_deinit_queue(struct ublk_device *ub, int q_id) { - int size = ublk_queue_cmd_buf_size(ub, q_id); - struct ublk_queue *ubq = ublk_get_queue(ub, q_id); + struct ublk_queue *ubq = ub->queues[q_id]; + int size, i; + + if (!ubq) + return; + + size = ublk_queue_cmd_buf_size(ub); + + for (i = 0; i < ubq->q_depth; i++) { + struct ublk_io *io = &ubq->ios[i]; + if (io->task) + put_task_struct(io->task); + WARN_ON_ONCE(refcount_read(&io->ref)); + WARN_ON_ONCE(io->task_registered_buffers); + } - if (ubq->ubq_daemon) - put_task_struct(ubq->ubq_daemon); if (ubq->io_cmd_buf) free_pages((unsigned long)ubq->io_cmd_buf, get_order(size)); + + kvfree(ubq); + ub->queues[q_id] = NULL; +} + +static int ublk_get_queue_numa_node(struct ublk_device *ub, int q_id) +{ + unsigned int cpu; + + /* Find first CPU mapped to this queue */ + for_each_possible_cpu(cpu) { + if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[cpu] == q_id) + return cpu_to_node(cpu); + } + + return NUMA_NO_NODE; } static int ublk_init_queue(struct ublk_device *ub, int q_id) { - struct ublk_queue *ubq = ublk_get_queue(ub, q_id); + int depth = ub->dev_info.queue_depth; gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO; - void *ptr; + struct ublk_queue *ubq; + struct page *page; + int numa_node; int size; + /* Determine NUMA node based on queue's CPU affinity */ + numa_node = ublk_get_queue_numa_node(ub, q_id); + + /* Allocate queue structure on local NUMA node */ + ubq = kvzalloc_node(struct_size(ubq, ios, depth), GFP_KERNEL, + numa_node); + if (!ubq) + return -ENOMEM; + + spin_lock_init(&ubq->cancel_lock); ubq->flags = ub->dev_info.flags; ubq->q_id = q_id; - ubq->q_depth = ub->dev_info.queue_depth; - size = ublk_queue_cmd_buf_size(ub, q_id); + ubq->q_depth = depth; + size = ublk_queue_cmd_buf_size(ub); - ptr = (void *) __get_free_pages(gfp_flags, get_order(size)); - if (!ptr) + /* Allocate I/O command buffer on local NUMA node */ + page = alloc_pages_node(numa_node, gfp_flags, get_order(size)); + if (!page) { + kvfree(ubq); return -ENOMEM; + } + ubq->io_cmd_buf = page_address(page); - ubq->io_cmd_buf = ptr; + ub->queues[q_id] = ubq; ubq->dev = ub; return 0; } static void ublk_deinit_queues(struct ublk_device *ub) { - int nr_queues = ub->dev_info.nr_hw_queues; int i; - if (!ub->__queues) - return; - - for (i = 0; i < nr_queues; i++) + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) ublk_deinit_queue(ub, i); - kfree(ub->__queues); } static int ublk_init_queues(struct ublk_device *ub) { - int nr_queues = ub->dev_info.nr_hw_queues; - int depth = ub->dev_info.queue_depth; - int ubq_size = sizeof(struct ublk_queue) + depth * sizeof(struct ublk_io); - int i, ret = -ENOMEM; - - ub->queue_size = ubq_size; - ub->__queues = kcalloc(nr_queues, ubq_size, GFP_KERNEL); - if (!ub->__queues) - return ret; + int i, ret; - for (i = 0; i < nr_queues; i++) { - if (ublk_init_queue(ub, i)) + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { + ret = ublk_init_queue(ub, i); + if (ret) goto fail; } @@ -1390,7 +2731,8 @@ static int ublk_alloc_dev_number(struct ublk_device *ub, int idx) if (err == -ENOSPC) err = -EEXIST; } else { - err = idr_alloc(&ublk_index_idr, ub, 0, 0, GFP_NOWAIT); + err = idr_alloc(&ublk_index_idr, ub, 0, UBLK_MAX_UBLKS, + GFP_NOWAIT); } spin_unlock(&ublk_idr_lock); @@ -1416,6 +2758,7 @@ static void ublk_cdev_rel(struct device *dev) ublk_deinit_queues(ub); ublk_free_dev_number(ub); mutex_destroy(&ub->mutex); + mutex_destroy(&ub->cancel_mutex); kfree(ub); } @@ -1427,7 +2770,7 @@ static int ublk_add_chdev(struct ublk_device *ub) dev->parent = ublk_misc.this_device; dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor); - dev->class = ublk_chr_class; + dev->class = &ublk_chr_class; dev->release = ublk_cdev_rel; device_initialize(dev); @@ -1439,20 +2782,15 @@ static int ublk_add_chdev(struct ublk_device *ub) ret = cdev_device_add(&ub->cdev, dev); if (ret) goto fail; + + if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV) + unprivileged_ublks_added++; return 0; fail: put_device(dev); return ret; } -static void ublk_stop_work_fn(struct work_struct *work) -{ - struct ublk_device *ub = - container_of(work, struct ublk_device, stop_work); - - ublk_stop_dev(ub); -} - /* align max io buffer size with PAGE_SIZE */ static void ublk_align_max_io_size(struct ublk_device *ub) { @@ -1468,19 +2806,21 @@ static int ublk_add_tag_set(struct ublk_device *ub) ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues; ub->tag_set.queue_depth = ub->dev_info.queue_depth; ub->tag_set.numa_node = NUMA_NO_NODE; - ub->tag_set.cmd_size = sizeof(struct ublk_rq_data); - ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; ub->tag_set.driver_data = ub; return blk_mq_alloc_tag_set(&ub->tag_set); } static void ublk_remove(struct ublk_device *ub) { + bool unprivileged; + ublk_stop_dev(ub); - cancel_work_sync(&ub->stop_work); - cancel_work_sync(&ub->quiesce_work); cdev_device_del(&ub->cdev, &ub->cdev_dev); - put_device(&ub->cdev_dev); + unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV; + ublk_put_device(ub); + + if (unprivileged) + unprivileged_ublks_added--; } static struct ublk_device *ublk_get_device_from_id(int idx) @@ -1499,24 +2839,76 @@ static struct ublk_device *ublk_get_device_from_id(int idx) return ub; } -static int ublk_ctrl_start_dev(struct io_uring_cmd *cmd) +static int ublk_ctrl_start_dev(struct ublk_device *ub, + const struct ublksrv_ctrl_cmd *header) { - struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + const struct ublk_param_basic *p = &ub->params.basic; int ublksrv_pid = (int)header->data[0]; - struct ublk_device *ub; + struct queue_limits lim = { + .logical_block_size = 1 << p->logical_bs_shift, + .physical_block_size = 1 << p->physical_bs_shift, + .io_min = 1 << p->io_min_shift, + .io_opt = 1 << p->io_opt_shift, + .max_hw_sectors = p->max_sectors, + .chunk_sectors = p->chunk_sectors, + .virt_boundary_mask = p->virt_boundary_mask, + .max_segments = USHRT_MAX, + .max_segment_size = UINT_MAX, + .dma_alignment = 3, + }; struct gendisk *disk; int ret = -EINVAL; if (ublksrv_pid <= 0) return -EINVAL; - - ub = ublk_get_device_from_id(header->dev_id); - if (!ub) + if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC)) return -EINVAL; - wait_for_completion_interruptible(&ub->completion); + if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) { + const struct ublk_param_discard *pd = &ub->params.discard; + + lim.discard_alignment = pd->discard_alignment; + lim.discard_granularity = pd->discard_granularity; + lim.max_hw_discard_sectors = pd->max_discard_sectors; + lim.max_write_zeroes_sectors = pd->max_write_zeroes_sectors; + lim.max_discard_segments = pd->max_discard_segments; + } + + if (ub->params.types & UBLK_PARAM_TYPE_ZONED) { + const struct ublk_param_zoned *p = &ub->params.zoned; + + if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) + return -EOPNOTSUPP; + + lim.features |= BLK_FEAT_ZONED; + lim.max_active_zones = p->max_active_zones; + lim.max_open_zones = p->max_open_zones; + lim.max_hw_zone_append_sectors = p->max_zone_append_sectors; + } + + if (ub->params.basic.attrs & UBLK_ATTR_VOLATILE_CACHE) { + lim.features |= BLK_FEAT_WRITE_CACHE; + if (ub->params.basic.attrs & UBLK_ATTR_FUA) + lim.features |= BLK_FEAT_FUA; + } + + if (ub->params.basic.attrs & UBLK_ATTR_ROTATIONAL) + lim.features |= BLK_FEAT_ROTATIONAL; + + if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN) + lim.dma_alignment = ub->params.dma.alignment; + + if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) { + lim.seg_boundary_mask = ub->params.seg.seg_boundary_mask; + lim.max_segment_size = ub->params.seg.max_segment_size; + lim.max_segments = ub->params.seg.max_segments; + } + + if (wait_for_completion_interruptible(&ub->completion) != 0) + return -EINTR; - schedule_delayed_work(&ub->monitor_work, UBLK_DAEMON_MONITOR_PERIOD); + if (ub->ublksrv_tgid != ublksrv_pid) + return -EINVAL; mutex_lock(&ub->mutex); if (ub->dev_info.state == UBLK_S_DEV_LIVE || @@ -1525,7 +2917,7 @@ static int ublk_ctrl_start_dev(struct io_uring_cmd *cmd) goto out_unlock; } - disk = blk_mq_alloc_disk(&ub->tag_set, ub); + disk = blk_mq_alloc_disk(&ub->tag_set, &lim, NULL); if (IS_ERR(disk)) { ret = PTR_ERR(disk); goto out_unlock; @@ -1537,42 +2929,49 @@ static int ublk_ctrl_start_dev(struct io_uring_cmd *cmd) ub->dev_info.ublksrv_pid = ublksrv_pid; ub->ub_disk = disk; - ret = ublk_apply_params(ub); - if (ret) - goto out_put_disk; + ublk_apply_params(ub); + + /* don't probe partitions if any daemon task is un-trusted */ + if (ub->unprivileged_daemons) + set_bit(GD_SUPPRESS_PART_SCAN, &disk->state); + + ublk_get_device(ub); + ub->dev_info.state = UBLK_S_DEV_LIVE; + + if (ublk_dev_is_zoned(ub)) { + ret = ublk_revalidate_disk_zones(ub); + if (ret) + goto out_put_cdev; + } - get_device(&ub->cdev_dev); ret = add_disk(disk); + if (ret) + goto out_put_cdev; + + set_bit(UB_STATE_USED, &ub->state); + +out_put_cdev: if (ret) { - /* - * Has to drop the reference since ->free_disk won't be - * called in case of add_disk failure. - */ + ublk_detach_disk(ub); ublk_put_device(ub); - goto out_put_disk; } - set_bit(UB_STATE_USED, &ub->state); - ub->dev_info.state = UBLK_S_DEV_LIVE; -out_put_disk: if (ret) put_disk(disk); out_unlock: mutex_unlock(&ub->mutex); - ublk_put_device(ub); return ret; } -static int ublk_ctrl_get_queue_affinity(struct io_uring_cmd *cmd) +static int ublk_ctrl_get_queue_affinity(struct ublk_device *ub, + const struct ublksrv_ctrl_cmd *header) { - struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; void __user *argp = (void __user *)(unsigned long)header->addr; - struct ublk_device *ub; cpumask_var_t cpumask; unsigned long queue; unsigned int retlen; unsigned int i; - int ret = -EINVAL; - + int ret; + if (header->len * BITS_PER_BYTE < nr_cpu_ids) return -EINVAL; if (header->len & (sizeof(unsigned long)-1)) @@ -1580,17 +2979,12 @@ static int ublk_ctrl_get_queue_affinity(struct io_uring_cmd *cmd) if (!header->addr) return -EINVAL; - ub = ublk_get_device_from_id(header->dev_id); - if (!ub) - return -EINVAL; - queue = header->data[0]; if (queue >= ub->dev_info.nr_hw_queues) - goto out_put_device; + return -EINVAL; - ret = -ENOMEM; if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL)) - goto out_put_device; + return -ENOMEM; for_each_possible_cpu(i) { if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[i] == queue) @@ -1608,8 +3002,6 @@ static int ublk_ctrl_get_queue_affinity(struct io_uring_cmd *cmd) ret = 0; out_free_cpumask: free_cpumask_var(cpumask); -out_put_device: - ublk_put_device(ub); return ret; } @@ -1621,9 +3013,8 @@ static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info) info->nr_hw_queues, info->queue_depth); } -static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) +static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) { - struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; void __user *argp = (void __user *)(unsigned long)header->addr; struct ublksrv_ctrl_dev_info info; struct ublk_device *ub; @@ -1636,28 +3027,94 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) __func__, header->queue_id); return -EINVAL; } + if (copy_from_user(&info, argp, sizeof(info))) return -EFAULT; - ublk_dump_dev_info(&info); + + if (info.queue_depth > UBLK_MAX_QUEUE_DEPTH || !info.queue_depth || + info.nr_hw_queues > UBLK_MAX_NR_QUEUES || !info.nr_hw_queues) + return -EINVAL; + + if (capable(CAP_SYS_ADMIN)) + info.flags &= ~UBLK_F_UNPRIVILEGED_DEV; + else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV)) + return -EPERM; + + /* forbid nonsense combinations of recovery flags */ + switch (info.flags & UBLK_F_ALL_RECOVERY_FLAGS) { + case 0: + case UBLK_F_USER_RECOVERY: + case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE): + case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO): + break; + default: + pr_warn("%s: invalid recovery flags %llx\n", __func__, + info.flags & UBLK_F_ALL_RECOVERY_FLAGS); + return -EINVAL; + } + + if ((info.flags & UBLK_F_QUIESCE) && !(info.flags & UBLK_F_USER_RECOVERY)) { + pr_warn("UBLK_F_QUIESCE requires UBLK_F_USER_RECOVERY\n"); + return -EINVAL; + } + + /* + * unprivileged device can't be trusted, but RECOVERY and + * RECOVERY_REISSUE still may hang error handling, so can't + * support recovery features for unprivileged ublk now + * + * TODO: provide forward progress for RECOVERY handler, so that + * unprivileged device can benefit from it + */ + if (info.flags & UBLK_F_UNPRIVILEGED_DEV) { + info.flags &= ~(UBLK_F_USER_RECOVERY_REISSUE | + UBLK_F_USER_RECOVERY); + + /* + * For USER_COPY, we depends on userspace to fill request + * buffer by pwrite() to ublk char device, which can't be + * used for unprivileged device + * + * Same with zero copy or auto buffer register. + */ + if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY | + UBLK_F_AUTO_BUF_REG)) + return -EINVAL; + } + + /* the created device is always owned by current user */ + ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid); + if (header->dev_id != info.dev_id) { pr_warn("%s: dev id not match %u %u\n", __func__, header->dev_id, info.dev_id); return -EINVAL; } + if (header->dev_id != U32_MAX && header->dev_id >= UBLK_MAX_UBLKS) { + pr_warn("%s: dev id is too large. Max supported is %d\n", + __func__, UBLK_MAX_UBLKS - 1); + return -EINVAL; + } + + ublk_dump_dev_info(&info); + ret = mutex_lock_killable(&ublk_ctl_mutex); if (ret) return ret; + ret = -EACCES; + if ((info.flags & UBLK_F_UNPRIVILEGED_DEV) && + unprivileged_ublks_added >= unprivileged_ublks_max) + goto out_unlock; + ret = -ENOMEM; - ub = kzalloc(sizeof(*ub), GFP_KERNEL); + ub = kzalloc(struct_size(ub, queues, info.nr_hw_queues), GFP_KERNEL); if (!ub) goto out_unlock; mutex_init(&ub->mutex); - spin_lock_init(&ub->mm_lock); - INIT_WORK(&ub->quiesce_work, ublk_quiesce_work_fn); - INIT_WORK(&ub->stop_work, ublk_stop_work_fn); - INIT_DELAYED_WORK(&ub->monitor_work, ublk_daemon_monitor_work); + spin_lock_init(&ub->lock); + mutex_init(&ub->cancel_mutex); ret = ublk_alloc_dev_number(ub, header->dev_id); if (ret < 0) @@ -1676,27 +3133,43 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) */ ub->dev_info.flags &= UBLK_F_ALL; - if (!IS_BUILTIN(CONFIG_BLK_DEV_UBLK)) - ub->dev_info.flags |= UBLK_F_URING_CMD_COMP_IN_TASK; + ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE | + UBLK_F_URING_CMD_COMP_IN_TASK | + UBLK_F_PER_IO_DAEMON | + UBLK_F_BUF_REG_OFF_DAEMON; - /* We are not ready to support zero copy */ - ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY; + /* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */ + if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY | + UBLK_F_AUTO_BUF_REG)) + ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA; + + /* + * Zoned storage support requires reuse `ublksrv_io_cmd->addr` for + * returning write_append_lba, which is only allowed in case of + * user copy or zero copy + */ + if (ublk_dev_is_zoned(ub) && + (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !(ub->dev_info.flags & + (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY)))) { + ret = -EINVAL; + goto out_free_dev_number; + } ub->dev_info.nr_hw_queues = min_t(unsigned int, ub->dev_info.nr_hw_queues, nr_cpu_ids); ublk_align_max_io_size(ub); - ret = ublk_init_queues(ub); + ret = ublk_add_tag_set(ub); if (ret) goto out_free_dev_number; - ret = ublk_add_tag_set(ub); + ret = ublk_init_queues(ub); if (ret) - goto out_deinit_queues; + goto out_free_tag_set; ret = -EFAULT; if (copy_to_user(argp, &ub->dev_info, sizeof(info))) - goto out_free_tag_set; + goto out_deinit_queues; /* * Add the char dev so that ublksrv daemon can be setup. @@ -1705,14 +3178,15 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) ret = ublk_add_chdev(ub); goto out_unlock; -out_free_tag_set: - blk_mq_free_tag_set(&ub->tag_set); out_deinit_queues: ublk_deinit_queues(ub); +out_free_tag_set: + blk_mq_free_tag_set(&ub->tag_set); out_free_dev_number: ublk_free_dev_number(ub); out_free_ub: mutex_destroy(&ub->mutex); + mutex_destroy(&ub->cancel_mutex); kfree(ub); out_unlock: mutex_unlock(&ublk_ctl_mutex); @@ -1730,88 +3204,95 @@ static inline bool ublk_idr_freed(int id) return ptr == NULL; } -static int ublk_ctrl_del_dev(int idx) +static int ublk_ctrl_del_dev(struct ublk_device **p_ub, bool wait) { - struct ublk_device *ub; + struct ublk_device *ub = *p_ub; + int idx = ub->ub_number; int ret; ret = mutex_lock_killable(&ublk_ctl_mutex); if (ret) return ret; - ub = ublk_get_device_from_id(idx); - if (ub) { + if (!test_bit(UB_STATE_DELETED, &ub->state)) { ublk_remove(ub); - ublk_put_device(ub); - ret = 0; - } else { - ret = -ENODEV; + set_bit(UB_STATE_DELETED, &ub->state); } + /* Mark the reference as consumed */ + *p_ub = NULL; + ublk_put_device(ub); + mutex_unlock(&ublk_ctl_mutex); + /* * Wait until the idr is removed, then it can be reused after * DEL_DEV command is returned. + * + * If we returns because of user interrupt, future delete command + * may come: + * + * - the device number isn't freed, this device won't or needn't + * be deleted again, since UB_STATE_DELETED is set, and device + * will be released after the last reference is dropped + * + * - the device number is freed already, we will not find this + * device via ublk_get_device_from_id() */ - if (!ret) - wait_event(ublk_idr_wq, ublk_idr_freed(idx)); - mutex_unlock(&ublk_ctl_mutex); - - return ret; + if (wait && wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx))) + return -EINTR; + return 0; } static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd) { - struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n", __func__, cmd->cmd_op, header->dev_id, header->queue_id, header->data[0], header->addr, header->len); } -static int ublk_ctrl_stop_dev(struct io_uring_cmd *cmd) +static int ublk_ctrl_stop_dev(struct ublk_device *ub) { - struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; - struct ublk_device *ub; - - ub = ublk_get_device_from_id(header->dev_id); - if (!ub) - return -EINVAL; - ublk_stop_dev(ub); - cancel_work_sync(&ub->stop_work); - cancel_work_sync(&ub->quiesce_work); - - ublk_put_device(ub); return 0; } -static int ublk_ctrl_get_dev_info(struct io_uring_cmd *cmd) +static int ublk_ctrl_get_dev_info(struct ublk_device *ub, + const struct ublksrv_ctrl_cmd *header) { - struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; void __user *argp = (void __user *)(unsigned long)header->addr; - struct ublk_device *ub; - int ret = 0; if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr) return -EINVAL; - ub = ublk_get_device_from_id(header->dev_id); - if (!ub) - return -EINVAL; - if (copy_to_user(argp, &ub->dev_info, sizeof(ub->dev_info))) - ret = -EFAULT; - ublk_put_device(ub); + return -EFAULT; - return ret; + return 0; } -static int ublk_ctrl_get_params(struct io_uring_cmd *cmd) +/* TYPE_DEVT is readonly, so fill it up before returning to userspace */ +static void ublk_ctrl_fill_params_devt(struct ublk_device *ub) +{ + ub->params.devt.char_major = MAJOR(ub->cdev_dev.devt); + ub->params.devt.char_minor = MINOR(ub->cdev_dev.devt); + + if (ub->ub_disk) { + ub->params.devt.disk_major = MAJOR(disk_devt(ub->ub_disk)); + ub->params.devt.disk_minor = MINOR(disk_devt(ub->ub_disk)); + } else { + ub->params.devt.disk_major = 0; + ub->params.devt.disk_minor = 0; + } + ub->params.types |= UBLK_PARAM_TYPE_DEVT; +} + +static int ublk_ctrl_get_params(struct ublk_device *ub, + const struct ublksrv_ctrl_cmd *header) { - struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; void __user *argp = (void __user *)(unsigned long)header->addr; struct ublk_params_header ph; - struct ublk_device *ub; int ret; if (header->len <= sizeof(ph) || !header->addr) @@ -1826,27 +3307,22 @@ static int ublk_ctrl_get_params(struct io_uring_cmd *cmd) if (ph.len > sizeof(struct ublk_params)) ph.len = sizeof(struct ublk_params); - ub = ublk_get_device_from_id(header->dev_id); - if (!ub) - return -EINVAL; - mutex_lock(&ub->mutex); + ublk_ctrl_fill_params_devt(ub); if (copy_to_user(argp, &ub->params, ph.len)) ret = -EFAULT; else ret = 0; mutex_unlock(&ub->mutex); - ublk_put_device(ub); return ret; } -static int ublk_ctrl_set_params(struct io_uring_cmd *cmd) +static int ublk_ctrl_set_params(struct ublk_device *ub, + const struct ublksrv_ctrl_cmd *header) { - struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; void __user *argp = (void __user *)(unsigned long)header->addr; struct ublk_params_header ph; - struct ublk_device *ub; int ret = -EFAULT; if (header->len <= sizeof(ph) || !header->addr) @@ -1861,13 +3337,12 @@ static int ublk_ctrl_set_params(struct io_uring_cmd *cmd) if (ph.len > sizeof(struct ublk_params)) ph.len = sizeof(struct ublk_params); - ub = ublk_get_device_from_id(header->dev_id); - if (!ub) - return -EINVAL; - - /* parameters can only be changed when device isn't live */ mutex_lock(&ub->mutex); - if (ub->dev_info.state == UBLK_S_DEV_LIVE) { + if (test_bit(UB_STATE_USED, &ub->state)) { + /* + * Parameters can only be changed when device hasn't + * been started yet + */ ret = -EACCES; } else if (copy_from_user(&ub->params, argp, ph.len)) { ret = -EFAULT; @@ -1875,48 +3350,21 @@ static int ublk_ctrl_set_params(struct io_uring_cmd *cmd) /* clear all we don't support yet */ ub->params.types &= UBLK_PARAM_TYPE_ALL; ret = ublk_validate_params(ub); + if (ret) + ub->params.types = 0; } mutex_unlock(&ub->mutex); - ublk_put_device(ub); return ret; } -static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq) +static int ublk_ctrl_start_recovery(struct ublk_device *ub, + const struct ublksrv_ctrl_cmd *header) { - int i; - - WARN_ON_ONCE(!(ubq->ubq_daemon && ubq_daemon_is_dying(ubq))); - /* All old ioucmds have to be completed */ - WARN_ON_ONCE(ubq->nr_io_ready); - /* old daemon is PF_EXITING, put it now */ - put_task_struct(ubq->ubq_daemon); - /* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */ - ubq->ubq_daemon = NULL; - - for (i = 0; i < ubq->q_depth; i++) { - struct ublk_io *io = &ubq->ios[i]; - - /* forget everything now and be ready for new FETCH_REQ */ - io->flags = 0; - io->cmd = NULL; - io->addr = 0; - } -} - -static int ublk_ctrl_start_recovery(struct io_uring_cmd *cmd) -{ - struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; - struct ublk_device *ub; int ret = -EINVAL; - int i; - - ub = ublk_get_device_from_id(header->dev_id); - if (!ub) - return ret; mutex_lock(&ub->mutex); - if (!ublk_can_use_recovery(ub)) + if (ublk_nosrv_should_stop_dev(ub)) goto out_unlock; /* * START_RECOVERY is only allowd after: @@ -1925,128 +3373,396 @@ static int ublk_ctrl_start_recovery(struct io_uring_cmd *cmd) * and related io_uring ctx is freed so file struct of /dev/ublkcX is * released. * + * and one of the following holds + * * (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work: * (a)has quiesced request queue * (b)has requeued every inflight rqs whose io_flags is ACTIVE * (c)has requeued/aborted every inflight rqs whose io_flags is NOT ACTIVE * (d)has completed/camceled all ioucmds owned by ther dying process + * + * (3) UBLK_S_DEV_FAIL_IO is set, which means the queue is not + * quiesced, but all I/O is being immediately errored */ - if (test_bit(UB_STATE_OPEN, &ub->state) || - ub->dev_info.state != UBLK_S_DEV_QUIESCED) { + if (test_bit(UB_STATE_OPEN, &ub->state) || !ublk_dev_in_recoverable_state(ub)) { ret = -EBUSY; goto out_unlock; } pr_devel("%s: start recovery for dev id %d.\n", __func__, header->dev_id); - for (i = 0; i < ub->dev_info.nr_hw_queues; i++) - ublk_queue_reinit(ub, ublk_get_queue(ub, i)); - /* set to NULL, otherwise new ubq_daemon cannot mmap the io_cmd_buf */ - ub->mm = NULL; - ub->nr_queues_ready = 0; init_completion(&ub->completion); ret = 0; out_unlock: mutex_unlock(&ub->mutex); - ublk_put_device(ub); return ret; } -static int ublk_ctrl_end_recovery(struct io_uring_cmd *cmd) +static int ublk_ctrl_end_recovery(struct ublk_device *ub, + const struct ublksrv_ctrl_cmd *header) { - struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; int ublksrv_pid = (int)header->data[0]; - struct ublk_device *ub; int ret = -EINVAL; - ub = ublk_get_device_from_id(header->dev_id); - if (!ub) - return ret; + pr_devel("%s: Waiting for all FETCH_REQs, dev id %d...\n", __func__, + header->dev_id); - pr_devel("%s: Waiting for new ubq_daemons(nr: %d) are ready, dev id %d...\n", - __func__, ub->dev_info.nr_hw_queues, header->dev_id); - /* wait until new ubq_daemon sending all FETCH_REQ */ - wait_for_completion_interruptible(&ub->completion); - pr_devel("%s: All new ubq_daemons(nr: %d) are ready, dev id %d\n", - __func__, ub->dev_info.nr_hw_queues, header->dev_id); + if (wait_for_completion_interruptible(&ub->completion)) + return -EINTR; + + pr_devel("%s: All FETCH_REQs received, dev id %d\n", __func__, + header->dev_id); + + if (ub->ublksrv_tgid != ublksrv_pid) + return -EINVAL; mutex_lock(&ub->mutex); - if (!ublk_can_use_recovery(ub)) + if (ublk_nosrv_should_stop_dev(ub)) goto out_unlock; - if (ub->dev_info.state != UBLK_S_DEV_QUIESCED) { + if (!ublk_dev_in_recoverable_state(ub)) { ret = -EBUSY; goto out_unlock; } ub->dev_info.ublksrv_pid = ublksrv_pid; + ub->dev_info.state = UBLK_S_DEV_LIVE; pr_devel("%s: new ublksrv_pid %d, dev id %d\n", __func__, ublksrv_pid, header->dev_id); - blk_mq_unquiesce_queue(ub->ub_disk->queue); - pr_devel("%s: queue unquiesced, dev id %d.\n", - __func__, header->dev_id); blk_mq_kick_requeue_list(ub->ub_disk->queue); - ub->dev_info.state = UBLK_S_DEV_LIVE; - schedule_delayed_work(&ub->monitor_work, UBLK_DAEMON_MONITOR_PERIOD); ret = 0; out_unlock: mutex_unlock(&ub->mutex); - ublk_put_device(ub); + return ret; +} + +static int ublk_ctrl_get_features(const struct ublksrv_ctrl_cmd *header) +{ + void __user *argp = (void __user *)(unsigned long)header->addr; + u64 features = UBLK_F_ALL; + + if (header->len != UBLK_FEATURES_LEN || !header->addr) + return -EINVAL; + + if (copy_to_user(argp, &features, UBLK_FEATURES_LEN)) + return -EFAULT; + + return 0; +} + +static void ublk_ctrl_set_size(struct ublk_device *ub, const struct ublksrv_ctrl_cmd *header) +{ + struct ublk_param_basic *p = &ub->params.basic; + u64 new_size = header->data[0]; + + mutex_lock(&ub->mutex); + p->dev_sectors = new_size; + set_capacity_and_notify(ub->ub_disk, p->dev_sectors); + mutex_unlock(&ub->mutex); +} + +struct count_busy { + const struct ublk_queue *ubq; + unsigned int nr_busy; +}; + +static bool ublk_count_busy_req(struct request *rq, void *data) +{ + struct count_busy *idle = data; + + if (!blk_mq_request_started(rq) && rq->mq_hctx->driver_data == idle->ubq) + idle->nr_busy += 1; + return true; +} + +/* uring_cmd is guaranteed to be active if the associated request is idle */ +static bool ubq_has_idle_io(const struct ublk_queue *ubq) +{ + struct count_busy data = { + .ubq = ubq, + }; + + blk_mq_tagset_busy_iter(&ubq->dev->tag_set, ublk_count_busy_req, &data); + return data.nr_busy < ubq->q_depth; +} + +/* Wait until each hw queue has at least one idle IO */ +static int ublk_wait_for_idle_io(struct ublk_device *ub, + unsigned int timeout_ms) +{ + unsigned int elapsed = 0; + int ret; + + while (elapsed < timeout_ms && !signal_pending(current)) { + unsigned int queues_cancelable = 0; + int i; + + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { + struct ublk_queue *ubq = ublk_get_queue(ub, i); + + queues_cancelable += !!ubq_has_idle_io(ubq); + } + + /* + * Each queue needs at least one active command for + * notifying ublk server + */ + if (queues_cancelable == ub->dev_info.nr_hw_queues) + break; + + msleep(UBLK_REQUEUE_DELAY_MS); + elapsed += UBLK_REQUEUE_DELAY_MS; + } + + if (signal_pending(current)) + ret = -EINTR; + else if (elapsed >= timeout_ms) + ret = -EBUSY; + else + ret = 0; + + return ret; +} + +static int ublk_ctrl_quiesce_dev(struct ublk_device *ub, + const struct ublksrv_ctrl_cmd *header) +{ + /* zero means wait forever */ + u64 timeout_ms = header->data[0]; + struct gendisk *disk; + int ret = -ENODEV; + + if (!(ub->dev_info.flags & UBLK_F_QUIESCE)) + return -EOPNOTSUPP; + + mutex_lock(&ub->mutex); + disk = ublk_get_disk(ub); + if (!disk) + goto unlock; + if (ub->dev_info.state == UBLK_S_DEV_DEAD) + goto put_disk; + + ret = 0; + /* already in expected state */ + if (ub->dev_info.state != UBLK_S_DEV_LIVE) + goto put_disk; + + /* Mark the device as canceling */ + mutex_lock(&ub->cancel_mutex); + blk_mq_quiesce_queue(disk->queue); + ublk_set_canceling(ub, true); + blk_mq_unquiesce_queue(disk->queue); + mutex_unlock(&ub->cancel_mutex); + + if (!timeout_ms) + timeout_ms = UINT_MAX; + ret = ublk_wait_for_idle_io(ub, timeout_ms); + +put_disk: + ublk_put_disk(disk); +unlock: + mutex_unlock(&ub->mutex); + + /* Cancel pending uring_cmd */ + if (!ret) + ublk_cancel_dev(ub); + return ret; +} + +/* + * All control commands are sent via /dev/ublk-control, so we have to check + * the destination device's permission + */ +static int ublk_char_dev_permission(struct ublk_device *ub, + const char *dev_path, int mask) +{ + int err; + struct path path; + struct kstat stat; + + err = kern_path(dev_path, LOOKUP_FOLLOW, &path); + if (err) + return err; + + err = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT); + if (err) + goto exit; + + err = -EPERM; + if (stat.rdev != ub->cdev_dev.devt || !S_ISCHR(stat.mode)) + goto exit; + + err = inode_permission(&nop_mnt_idmap, + d_backing_inode(path.dentry), mask); +exit: + path_put(&path); + return err; +} + +static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub, + struct io_uring_cmd *cmd) +{ + struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)io_uring_sqe_cmd(cmd->sqe); + bool unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV; + void __user *argp = (void __user *)(unsigned long)header->addr; + char *dev_path = NULL; + int ret = 0; + int mask; + + if (!unprivileged) { + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + /* + * The new added command of UBLK_CMD_GET_DEV_INFO2 includes + * char_dev_path in payload too, since userspace may not + * know if the specified device is created as unprivileged + * mode. + */ + if (_IOC_NR(cmd->cmd_op) != UBLK_CMD_GET_DEV_INFO2) + return 0; + } + + /* + * User has to provide the char device path for unprivileged ublk + * + * header->addr always points to the dev path buffer, and + * header->dev_path_len records length of dev path buffer. + */ + if (!header->dev_path_len || header->dev_path_len > PATH_MAX) + return -EINVAL; + + if (header->len < header->dev_path_len) + return -EINVAL; + + dev_path = memdup_user_nul(argp, header->dev_path_len); + if (IS_ERR(dev_path)) + return PTR_ERR(dev_path); + + ret = -EINVAL; + switch (_IOC_NR(cmd->cmd_op)) { + case UBLK_CMD_GET_DEV_INFO: + case UBLK_CMD_GET_DEV_INFO2: + case UBLK_CMD_GET_QUEUE_AFFINITY: + case UBLK_CMD_GET_PARAMS: + case (_IOC_NR(UBLK_U_CMD_GET_FEATURES)): + mask = MAY_READ; + break; + case UBLK_CMD_START_DEV: + case UBLK_CMD_STOP_DEV: + case UBLK_CMD_ADD_DEV: + case UBLK_CMD_DEL_DEV: + case UBLK_CMD_SET_PARAMS: + case UBLK_CMD_START_USER_RECOVERY: + case UBLK_CMD_END_USER_RECOVERY: + case UBLK_CMD_UPDATE_SIZE: + case UBLK_CMD_QUIESCE_DEV: + mask = MAY_READ | MAY_WRITE; + break; + default: + goto exit; + } + + ret = ublk_char_dev_permission(ub, dev_path, mask); + if (!ret) { + header->len -= header->dev_path_len; + header->addr += header->dev_path_len; + } + pr_devel("%s: dev id %d cmd_op %x uid %d gid %d path %s ret %d\n", + __func__, ub->ub_number, cmd->cmd_op, + ub->dev_info.owner_uid, ub->dev_info.owner_gid, + dev_path, ret); +exit: + kfree(dev_path); return ret; } static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { - struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); + struct ublk_device *ub = NULL; + u32 cmd_op = cmd->cmd_op; int ret = -EINVAL; + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + ublk_ctrl_cmd_dump(cmd); if (!(issue_flags & IO_URING_F_SQE128)) goto out; - ret = -EPERM; - if (!capable(CAP_SYS_ADMIN)) + ret = ublk_check_cmd_op(cmd_op); + if (ret) goto out; - ret = -ENODEV; - switch (cmd->cmd_op) { + if (cmd_op == UBLK_U_CMD_GET_FEATURES) { + ret = ublk_ctrl_get_features(header); + goto out; + } + + if (_IOC_NR(cmd_op) != UBLK_CMD_ADD_DEV) { + ret = -ENODEV; + ub = ublk_get_device_from_id(header->dev_id); + if (!ub) + goto out; + + ret = ublk_ctrl_uring_cmd_permission(ub, cmd); + if (ret) + goto put_dev; + } + + switch (_IOC_NR(cmd_op)) { case UBLK_CMD_START_DEV: - ret = ublk_ctrl_start_dev(cmd); + ret = ublk_ctrl_start_dev(ub, header); break; case UBLK_CMD_STOP_DEV: - ret = ublk_ctrl_stop_dev(cmd); + ret = ublk_ctrl_stop_dev(ub); break; case UBLK_CMD_GET_DEV_INFO: - ret = ublk_ctrl_get_dev_info(cmd); + case UBLK_CMD_GET_DEV_INFO2: + ret = ublk_ctrl_get_dev_info(ub, header); break; case UBLK_CMD_ADD_DEV: - ret = ublk_ctrl_add_dev(cmd); + ret = ublk_ctrl_add_dev(header); break; case UBLK_CMD_DEL_DEV: - ret = ublk_ctrl_del_dev(header->dev_id); + ret = ublk_ctrl_del_dev(&ub, true); + break; + case UBLK_CMD_DEL_DEV_ASYNC: + ret = ublk_ctrl_del_dev(&ub, false); break; case UBLK_CMD_GET_QUEUE_AFFINITY: - ret = ublk_ctrl_get_queue_affinity(cmd); + ret = ublk_ctrl_get_queue_affinity(ub, header); break; case UBLK_CMD_GET_PARAMS: - ret = ublk_ctrl_get_params(cmd); + ret = ublk_ctrl_get_params(ub, header); break; case UBLK_CMD_SET_PARAMS: - ret = ublk_ctrl_set_params(cmd); + ret = ublk_ctrl_set_params(ub, header); break; case UBLK_CMD_START_USER_RECOVERY: - ret = ublk_ctrl_start_recovery(cmd); + ret = ublk_ctrl_start_recovery(ub, header); break; case UBLK_CMD_END_USER_RECOVERY: - ret = ublk_ctrl_end_recovery(cmd); + ret = ublk_ctrl_end_recovery(ub, header); + break; + case UBLK_CMD_UPDATE_SIZE: + ublk_ctrl_set_size(ub, header); + ret = 0; + break; + case UBLK_CMD_QUIESCE_DEV: + ret = ublk_ctrl_quiesce_dev(ub, header); break; default: + ret = -EOPNOTSUPP; break; } + + put_dev: + if (ub) + ublk_put_device(ub); out: - io_uring_cmd_done(cmd, ret, 0); pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n", __func__, ret, cmd->cmd_op, header->dev_id, header->queue_id); - return -EIOCBQUEUED; + return ret; } static const struct file_operations ublk_ctl_fops = { @@ -2066,6 +3782,10 @@ static int __init ublk_init(void) { int ret; + BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET + + UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET); + BUILD_BUG_ON(sizeof(struct ublk_auto_buf_reg) != 8); + init_waitqueue_head(&ublk_idr_wq); ret = misc_register(&ublk_misc); @@ -2076,11 +3796,10 @@ static int __init ublk_init(void) if (ret) goto unregister_mis; - ublk_chr_class = class_create(THIS_MODULE, "ublk-char"); - if (IS_ERR(ublk_chr_class)) { - ret = PTR_ERR(ublk_chr_class); + ret = class_register(&ublk_chr_class); + if (ret) goto free_chrdev_region; - } + return 0; free_chrdev_region: @@ -2095,13 +3814,12 @@ static void __exit ublk_exit(void) struct ublk_device *ub; int id; - class_destroy(ublk_chr_class); - - misc_deregister(&ublk_misc); - idr_for_each_entry(&ublk_index_idr, ub, id) ublk_remove(ub); + class_unregister(&ublk_chr_class); + misc_deregister(&ublk_misc); + idr_destroy(&ublk_index_idr); unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS); } @@ -2109,5 +3827,27 @@ static void __exit ublk_exit(void) module_init(ublk_init); module_exit(ublk_exit); +static int ublk_set_max_unprivileged_ublks(const char *buf, + const struct kernel_param *kp) +{ + return param_set_uint_minmax(buf, kp, 0, UBLK_MAX_UBLKS); +} + +static int ublk_get_max_unprivileged_ublks(char *buf, + const struct kernel_param *kp) +{ + return sysfs_emit(buf, "%u\n", unprivileged_ublks_max); +} + +static const struct kernel_param_ops ublk_max_unprivileged_ublks_ops = { + .set = ublk_set_max_unprivileged_ublks, + .get = ublk_get_max_unprivileged_ublks, +}; + +module_param_cb(ublks_max, &ublk_max_unprivileged_ublks_ops, + &unprivileged_ublks_max, 0644); +MODULE_PARM_DESC(ublks_max, "max number of unprivileged ublk devices allowed to add(default: 64)"); + MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>"); +MODULE_DESCRIPTION("Userspace block device"); MODULE_LICENSE("GPL"); |
