Diffstat (limited to 'drivers/block/ublk_drv.c')
 drivers/block/ublk_drv.c | 457 ++++++++++++++++++++++++++++++-----------------
 1 file changed, 292 insertions(+), 165 deletions(-)
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 1dfb2e77898b..ca9a67b5b537 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -48,6 +48,9 @@
 
 #define UBLK_MINORS		(1U << MINORBITS)
 
+/* private ioctl command mirror */
+#define UBLK_CMD_DEL_DEV_ASYNC	_IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC)
+
 /* All UBLK_F_* have to be included into UBLK_F_ALL */
 #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \
 		| UBLK_F_URING_CMD_COMP_IN_TASK \
@@ -57,7 +60,12 @@
 		| UBLK_F_UNPRIVILEGED_DEV \
 		| UBLK_F_CMD_IOCTL_ENCODE \
 		| UBLK_F_USER_COPY \
-		| UBLK_F_ZONED)
+		| UBLK_F_ZONED \
+		| UBLK_F_USER_RECOVERY_FAIL_IO)
+
+#define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
+		| UBLK_F_USER_RECOVERY_REISSUE \
+		| UBLK_F_USER_RECOVERY_FAIL_IO)
 
 /* All UBLK_PARAM_TYPE_* should be included here */
 #define UBLK_PARAM_TYPE_ALL                                \
@@ -68,9 +76,6 @@ struct ublk_rq_data {
 	struct llist_node node;
 
 	struct kref ref;
-	__u64 sector;
-	__u32 operation;
-	__u32 nr_zones;
 };
 
 struct ublk_uring_cmd_pdu {
@@ -143,6 +148,7 @@ struct ublk_queue {
 	bool force_abort;
 	bool timeout;
 	bool canceling;
+	bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */
 	unsigned short nr_io_ready;	/* how many ios setup */
 	spinlock_t		cancel_lock;
 	struct ublk_device *dev;
@@ -179,8 +185,7 @@ struct ublk_device {
 	unsigned int		nr_queues_ready;
 	unsigned int		nr_privileged_daemon;
 
-	struct work_struct	quiesce_work;
-	struct work_struct	stop_work;
+	struct work_struct	nosrv_work;
 };
 
 /* header of ublk_params */
@@ -211,6 +216,33 @@ static inline bool ublk_queue_is_zoned(struct ublk_queue *ubq)
 
 #ifdef CONFIG_BLK_DEV_ZONED
 
+struct ublk_zoned_report_desc {
+	__u64 sector;
+	__u32 operation;
+	__u32 nr_zones;
+};
+
+static DEFINE_XARRAY(ublk_zoned_report_descs);
+
+static int ublk_zoned_insert_report_desc(const struct request *req,
+		struct ublk_zoned_report_desc *desc)
+{
+	return xa_insert(&ublk_zoned_report_descs, (unsigned long)req,
+			    desc, GFP_KERNEL);
+}
+
+static struct ublk_zoned_report_desc *ublk_zoned_erase_report_desc(
+		const struct request *req)
+{
+	return xa_erase(&ublk_zoned_report_descs, (unsigned long)req);
+}
+
+static struct ublk_zoned_report_desc *ublk_zoned_get_report_desc(
+		const struct request *req)
+{
+	return xa_load(&ublk_zoned_report_descs, (unsigned long)req);
+}
+
 static int ublk_get_nr_zones(const struct ublk_device *ub)
 {
 	const struct ublk_param_basic *p = &ub->params.basic;
@@ -221,7 +253,7 @@ static int ublk_get_nr_zones(const struct ublk_device *ub)
 
 static int ublk_revalidate_disk_zones(struct ublk_device *ub)
 {
-	return blk_revalidate_disk_zones(ub->ub_disk, NULL);
+	return blk_revalidate_disk_zones(ub->ub_disk);
 }
 
 static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
@@ -246,21 +278,9 @@
 	return 0;
 }
 
-static int ublk_dev_param_zoned_apply(struct ublk_device *ub)
+static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
 {
-	const struct ublk_param_zoned *p = &ub->params.zoned;
-
-	disk_set_zoned(ub->ub_disk);
-	blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ub->ub_disk->queue);
-	blk_queue_required_elevator_features(ub->ub_disk->queue,
-					     ELEVATOR_F_ZBD_SEQ_WRITE);
-	disk_set_max_active_zones(ub->ub_disk, p->max_active_zones);
-	disk_set_max_open_zones(ub->ub_disk, p->max_open_zones);
-	blk_queue_max_zone_append_sectors(ub->ub_disk->queue, p->max_zone_append_sectors);
-	ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
-
-	return 0;
 }
 
 /* Based on virtblk_alloc_report_buffer */
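The zoned hunks above move the zone-report bookkeeping out of the per-request PDU and into a global xarray keyed by the request pointer, so the descriptor only exists between submission and completion. A minimal, self-contained sketch of that pattern follows; all demo_* names are hypothetical, only the xa_*() calls are the real <linux/xarray.h> API.

	#include <linux/xarray.h>
	#include <linux/types.h>

	struct demo_desc {
		u64 sector;
		u32 operation;
	};

	static DEFINE_XARRAY(demo_descs);

	/* Attach a descriptor to an object, keyed by its address. */
	static int demo_attach(const void *key, struct demo_desc *desc)
	{
		/* xa_insert() returns -EBUSY if the key is already present. */
		return xa_insert(&demo_descs, (unsigned long)key, desc, GFP_KERNEL);
	}

	/* Look up without removing. */
	static struct demo_desc *demo_lookup(const void *key)
	{
		return xa_load(&demo_descs, (unsigned long)key);
	}

	/* Remove and return the old entry (NULL if none was stored). */
	static struct demo_desc *demo_detach(const void *key)
	{
		return xa_erase(&demo_descs, (unsigned long)key);
	}

Note the design choice visible in the patch: the stored entry points at a descriptor on the caller's stack, which is safe only because the erase runs before ublk_report_zones() returns, and the only reader runs while the request is still in flight.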
@@ -317,7 +337,7 @@ static int ublk_report_zones(struct gendisk *disk, sector_t sector,
 		unsigned int zones_in_request =
 			min_t(unsigned int, remaining_zones, max_zones_per_request);
 		struct request *req;
-		struct ublk_rq_data *pdu;
+		struct ublk_zoned_report_desc desc;
 		blk_status_t status;
 
 		memset(buffer, 0, buffer_length);
@@ -328,20 +348,23 @@ static int ublk_report_zones(struct gendisk *disk, sector_t sector,
 			goto out;
 		}
 
-		pdu = blk_mq_rq_to_pdu(req);
-		pdu->operation = UBLK_IO_OP_REPORT_ZONES;
-		pdu->sector = sector;
-		pdu->nr_zones = zones_in_request;
+		desc.operation = UBLK_IO_OP_REPORT_ZONES;
+		desc.sector = sector;
+		desc.nr_zones = zones_in_request;
+		ret = ublk_zoned_insert_report_desc(req, &desc);
+		if (ret)
+			goto free_req;
 
 		ret = blk_rq_map_kern(disk->queue, req, buffer, buffer_length,
 					GFP_KERNEL);
-		if (ret) {
-			blk_mq_free_request(req);
-			goto out;
-		}
+		if (ret)
+			goto erase_desc;
 
 		status = blk_execute_rq(req, 0);
 		ret = blk_status_to_errno(status);
+erase_desc:
+		ublk_zoned_erase_report_desc(req);
+free_req:
 		blk_mq_free_request(req);
 		if (ret)
 			goto out;
@@ -375,7 +398,7 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
 {
 	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
 	struct ublk_io *io = &ubq->ios[req->tag];
-	struct ublk_rq_data *pdu = blk_mq_rq_to_pdu(req);
+	struct ublk_zoned_report_desc *desc;
 	u32 ublk_op;
 
 	switch (req_op(req)) {
@@ -398,12 +421,15 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
 		ublk_op = UBLK_IO_OP_ZONE_RESET_ALL;
 		break;
 	case REQ_OP_DRV_IN:
-		ublk_op = pdu->operation;
+		desc = ublk_zoned_get_report_desc(req);
+		if (!desc)
+			return BLK_STS_IOERR;
+		ublk_op = desc->operation;
 		switch (ublk_op) {
 		case UBLK_IO_OP_REPORT_ZONES:
 			iod->op_flags = ublk_op | ublk_req_build_flags(req);
-			iod->nr_zones = pdu->nr_zones;
-			iod->start_sector = pdu->sector;
+			iod->nr_zones = desc->nr_zones;
+			iod->start_sector = desc->sector;
 			return BLK_STS_OK;
 		default:
 			return BLK_STS_IOERR;
@@ -432,9 +458,8 @@ static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
 	return -EOPNOTSUPP;
 }
 
-static int ublk_dev_param_zoned_apply(struct ublk_device *ub)
+static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
 {
-	return -EOPNOTSUPP;
 }
 
 static int ublk_revalidate_disk_zones(struct ublk_device *ub)
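The reworked ublk_report_zones() error path above uses the kernel's label-based unwind idiom: each acquisition gets a label, a failure jumps past its own cleanup, and the success path falls through the same cleanup in reverse order. A generic sketch of the idiom (hypothetical demo_* helpers, not ublk code):

	static int demo_acquire_a(void) { return 0; }
	static int demo_acquire_b(void) { return 0; }
	static int demo_work(void)      { return 0; }
	static void demo_release_b(void) { }
	static void demo_release_a(void) { }

	static int demo_do_op(void)
	{
		int ret;

		ret = demo_acquire_a();		/* like blk_mq_alloc_request() */
		if (ret)
			return ret;

		ret = demo_acquire_b();		/* like ublk_zoned_insert_report_desc() */
		if (ret)
			goto put_a;		/* skip the cleanup for b */

		ret = demo_work();		/* like blk_execute_rq() */

		/* Success and demo_work() failure share the unwind below. */
		demo_release_b();
	put_a:
		demo_release_a();
		return ret;
	}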
@@ -495,44 +520,14 @@ static inline unsigned ublk_pos_to_tag(loff_t pos)
 
 static void ublk_dev_param_basic_apply(struct ublk_device *ub)
 {
-	struct request_queue *q = ub->ub_disk->queue;
 	const struct ublk_param_basic *p = &ub->params.basic;
 
-	blk_queue_logical_block_size(q, 1 << p->logical_bs_shift);
-	blk_queue_physical_block_size(q, 1 << p->physical_bs_shift);
-	blk_queue_io_min(q, 1 << p->io_min_shift);
-	blk_queue_io_opt(q, 1 << p->io_opt_shift);
-
-	blk_queue_write_cache(q, p->attrs & UBLK_ATTR_VOLATILE_CACHE,
-			p->attrs & UBLK_ATTR_FUA);
-	if (p->attrs & UBLK_ATTR_ROTATIONAL)
-		blk_queue_flag_clear(QUEUE_FLAG_NONROT, q);
-	else
-		blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
-
-	blk_queue_max_hw_sectors(q, p->max_sectors);
-	blk_queue_chunk_sectors(q, p->chunk_sectors);
-	blk_queue_virt_boundary(q, p->virt_boundary_mask);
-
 	if (p->attrs & UBLK_ATTR_READ_ONLY)
 		set_disk_ro(ub->ub_disk, true);
 
 	set_capacity(ub->ub_disk, p->dev_sectors);
 }
 
-static void ublk_dev_param_discard_apply(struct ublk_device *ub)
-{
-	struct request_queue *q = ub->ub_disk->queue;
-	const struct ublk_param_discard *p = &ub->params.discard;
-
-	q->limits.discard_alignment = p->discard_alignment;
-	q->limits.discard_granularity = p->discard_granularity;
-	blk_queue_max_discard_sectors(q, p->max_discard_sectors);
-	blk_queue_max_write_zeroes_sectors(q,
-			p->max_write_zeroes_sectors);
-	blk_queue_max_discard_segments(q, p->max_discard_segments);
-}
-
 static int ublk_validate_params(const struct ublk_device *ub)
 {
 	/* basic param is the only one which must be set */
@@ -576,20 +571,12 @@
 	return 0;
 }
 
-static int ublk_apply_params(struct ublk_device *ub)
+static void ublk_apply_params(struct ublk_device *ub)
 {
-	if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC))
-		return -EINVAL;
-
 	ublk_dev_param_basic_apply(ub);
 
-	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD)
-		ublk_dev_param_discard_apply(ub);
-
-	if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
-		return ublk_dev_param_zoned_apply(ub);
-
-	return 0;
+	ublk_dev_param_zoned_apply(ub);
 }
 
 static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
@@ -645,14 +632,16 @@ static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
 	return ubq->flags & UBLK_F_NEED_GET_DATA;
 }
 
-static struct ublk_device *ublk_get_device(struct ublk_device *ub)
+/* Called in slow path only, keep it noinline for trace purpose */
+static noinline struct ublk_device *ublk_get_device(struct ublk_device *ub)
 {
 	if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
 		return ub;
 	return NULL;
 }
 
-static void ublk_put_device(struct ublk_device *ub)
+/* Called in slow path only, keep it noinline for trace purpose */
+static noinline void ublk_put_device(struct ublk_device *ub)
 {
 	put_device(&ub->cdev_dev);
 }
@@ -680,30 +669,69 @@ static inline char *ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
 	return ublk_get_queue(ub, q_id)->io_cmd_buf;
 }
 
+static inline int __ublk_queue_cmd_buf_size(int depth)
+{
+	return round_up(depth * sizeof(struct ublksrv_io_desc), PAGE_SIZE);
+}
+
 static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id)
 {
 	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
 
-	return round_up(ubq->q_depth * sizeof(struct ublksrv_io_desc),
-			PAGE_SIZE);
+	return __ublk_queue_cmd_buf_size(ubq->q_depth);
+}
+
+static int ublk_max_cmd_buf_size(void)
+{
+	return __ublk_queue_cmd_buf_size(UBLK_MAX_QUEUE_DEPTH);
+}
+
+/*
+ * Should I/O outstanding to the ublk server when it exits be reissued?
+ * If not, outstanding I/O will get errors.
+ */
+static inline bool ublk_nosrv_should_reissue_outstanding(struct ublk_device *ub)
+{
+	return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
+	       (ub->dev_info.flags & UBLK_F_USER_RECOVERY_REISSUE);
+}
+
+/*
+ * Should I/O issued while there is no ublk server be queued? If not,
+ * I/O issued while there is no ublk server will get errors.
+ */
+static inline bool ublk_nosrv_dev_should_queue_io(struct ublk_device *ub)
+{
+	return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
+	       !(ub->dev_info.flags & UBLK_F_USER_RECOVERY_FAIL_IO);
 }
 
-static inline bool ublk_queue_can_use_recovery_reissue(
-		struct ublk_queue *ubq)
+/*
+ * Same as ublk_nosrv_dev_should_queue_io, but uses a queue-local copy
+ * of the device flags for a smaller cache footprint - better for fast
+ * paths.
+ */
+static inline bool ublk_nosrv_should_queue_io(struct ublk_queue *ubq)
 {
 	return (ubq->flags & UBLK_F_USER_RECOVERY) &&
-			(ubq->flags & UBLK_F_USER_RECOVERY_REISSUE);
+	       !(ubq->flags & UBLK_F_USER_RECOVERY_FAIL_IO);
 }
 
-static inline bool ublk_queue_can_use_recovery(
-		struct ublk_queue *ubq)
+/*
+ * Should ublk devices be stopped (i.e. no recovery possible) when the
+ * ublk server exits? If not, devices can be used again by a future
+ * incarnation of a ublk server via the start_recovery/end_recovery
+ * commands.
+ */
+static inline bool ublk_nosrv_should_stop_dev(struct ublk_device *ub)
 {
-	return ubq->flags & UBLK_F_USER_RECOVERY;
+	return !(ub->dev_info.flags & UBLK_F_USER_RECOVERY);
 }
 
-static inline bool ublk_can_use_recovery(struct ublk_device *ub)
+static inline bool ublk_dev_in_recoverable_state(struct ublk_device *ub)
 {
-	return ub->dev_info.flags & UBLK_F_USER_RECOVERY;
+	return ub->dev_info.state == UBLK_S_DEV_QUIESCED ||
+	       ub->dev_info.state == UBLK_S_DEV_FAIL_IO;
 }
 
 static void ublk_free_disk(struct gendisk *disk)
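Taken together, the new predicates encode a single policy matrix. The table below is a summary derived from the helpers above and from the fail_io handling later in this patch; it is not a separate kernel interface.

	/*
	 * Recovery flags on the device            | when ublk server exits  | I/O issued meanwhile
	 * ----------------------------------------+-------------------------+------------------------
	 * (none)                                  | device is stopped       | n/a (device removed)
	 * USER_RECOVERY                           | queue quiesced          | queued for later
	 * USER_RECOVERY | USER_RECOVERY_REISSUE   | outstanding reissued    | queued for later
	 * USER_RECOVERY | USER_RECOVERY_FAIL_IO   | device stays up         | failed (BLK_STS_TARGET)
	 */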
@@ -711,7 +739,7 @@ static void ublk_free_disk(struct gendisk *disk)
 	struct ublk_device *ub = disk->private_data;
 
 	clear_bit(UB_STATE_USED, &ub->state);
-	put_device(&ub->cdev_dev);
+	ublk_put_device(ub);
 }
 
 static void ublk_store_owner_uid_gid(unsigned int *owner_uid,
@@ -1079,7 +1107,7 @@ static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
 {
 	WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
 
-	if (ublk_queue_can_use_recovery_reissue(ubq))
+	if (ublk_nosrv_should_reissue_outstanding(ubq->dev))
 		blk_mq_requeue_request(req, false);
 	else
 		ublk_put_req_ref(ubq, req);
@@ -1107,7 +1135,7 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq,
 		struct request *rq)
 {
 	/* We cannot process this rq so just requeue it. */
-	if (ublk_queue_can_use_recovery(ubq))
+	if (ublk_nosrv_dev_should_queue_io(ubq->dev))
 		blk_mq_requeue_request(rq, false);
 	else
 		blk_mq_end_request(rq, BLK_STS_IOERR);
@@ -1252,10 +1280,7 @@ static enum blk_eh_timer_return ublk_timeout(struct request *rq)
 		struct ublk_device *ub = ubq->dev;
 
 		if (ublk_abort_requests(ub, ubq)) {
-			if (ublk_can_use_recovery(ub))
-				schedule_work(&ub->quiesce_work);
-			else
-				schedule_work(&ub->stop_work);
+			schedule_work(&ub->nosrv_work);
 		}
 		return BLK_EH_DONE;
 	}
@@ -1270,6 +1295,10 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
 	struct request *rq = bd->rq;
 	blk_status_t res;
 
+	if (unlikely(ubq->fail_io)) {
+		return BLK_STS_TARGET;
+	}
+
 	/* fill iod to slot in io cmd buffer */
 	res = ublk_setup_iod(ubq, rq);
 	if (unlikely(res != BLK_STS_OK))
@@ -1284,7 +1313,7 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
 	 * Note: force_abort is guaranteed to be seen because it is set
 	 * before request queue is unquiesced.
 	 */
-	if (ublk_queue_can_use_recovery(ubq) && unlikely(ubq->force_abort))
+	if (ublk_nosrv_should_queue_io(ubq) && unlikely(ubq->force_abort))
 		return BLK_STS_IOERR;
 
 	if (unlikely(ubq->canceling)) {
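The abort paths above now phrase the old recovery checks as a single question: can this request be replayed by a future server? A hedged sketch of that decision, using real blk-mq calls behind a hypothetical wrapper:

	#include <linux/blk-mq.h>

	/* Handle a request that can no longer reach the ublk server. */
	static void demo_orphan_rq(struct request *rq, bool server_may_return)
	{
		if (server_may_return)
			blk_mq_requeue_request(rq, false);	/* replayed after recovery */
		else
			blk_mq_end_request(rq, BLK_STS_IOERR);	/* fail it immediately */
	}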
@@ -1338,7 +1367,7 @@ static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
 {
 	struct ublk_device *ub = filp->private_data;
 	size_t sz = vma->vm_end - vma->vm_start;
-	unsigned max_sz = UBLK_MAX_QUEUE_DEPTH * sizeof(struct ublksrv_io_desc);
+	unsigned max_sz = ublk_max_cmd_buf_size();
 	unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT;
 	int q_id, ret = 0;
@@ -1505,10 +1534,7 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
 	ublk_cancel_cmd(ubq, io, issue_flags);
 
 	if (need_schedule) {
-		if (ublk_can_use_recovery(ub))
-			schedule_work(&ub->quiesce_work);
-		else
-			schedule_work(&ub->stop_work);
+		schedule_work(&ub->nosrv_work);
 	}
 }
 
@@ -1571,20 +1597,6 @@ static void __ublk_quiesce_dev(struct ublk_device *ub)
 	ub->dev_info.state = UBLK_S_DEV_QUIESCED;
 }
 
-static void ublk_quiesce_work_fn(struct work_struct *work)
-{
-	struct ublk_device *ub =
-		container_of(work, struct ublk_device, quiesce_work);
-
-	mutex_lock(&ub->mutex);
-	if (ub->dev_info.state != UBLK_S_DEV_LIVE)
-		goto unlock;
-	__ublk_quiesce_dev(ub);
- unlock:
-	mutex_unlock(&ub->mutex);
-	ublk_cancel_dev(ub);
-}
-
 static void ublk_unquiesce_dev(struct ublk_device *ub)
 {
 	int i;
@@ -1606,6 +1618,21 @@ static void ublk_unquiesce_dev(struct ublk_device *ub)
 	blk_mq_kick_requeue_list(ub->ub_disk->queue);
 }
 
+static struct gendisk *ublk_detach_disk(struct ublk_device *ub)
+{
+	struct gendisk *disk;
+
+	/* Sync with ublk_abort_queue() by holding the lock */
+	spin_lock(&ub->lock);
+	disk = ub->ub_disk;
+	ub->dev_info.state = UBLK_S_DEV_DEAD;
+	ub->dev_info.ublksrv_pid = -1;
+	ub->ub_disk = NULL;
+	spin_unlock(&ub->lock);
+
+	return disk;
+}
+
 static void ublk_stop_dev(struct ublk_device *ub)
 {
 	struct gendisk *disk;
@@ -1613,26 +1640,50 @@ static void ublk_stop_dev(struct ublk_device *ub)
 	mutex_lock(&ub->mutex);
 	if (ub->dev_info.state == UBLK_S_DEV_DEAD)
 		goto unlock;
-	if (ublk_can_use_recovery(ub)) {
+	if (ublk_nosrv_dev_should_queue_io(ub)) {
 		if (ub->dev_info.state == UBLK_S_DEV_LIVE)
 			__ublk_quiesce_dev(ub);
 		ublk_unquiesce_dev(ub);
 	}
 	del_gendisk(ub->ub_disk);
-
-	/* Sync with ublk_abort_queue() by holding the lock */
-	spin_lock(&ub->lock);
-	disk = ub->ub_disk;
-	ub->dev_info.state = UBLK_S_DEV_DEAD;
-	ub->dev_info.ublksrv_pid = -1;
-	ub->ub_disk = NULL;
-	spin_unlock(&ub->lock);
+	disk = ublk_detach_disk(ub);
 	put_disk(disk);
  unlock:
 	mutex_unlock(&ub->mutex);
 	ublk_cancel_dev(ub);
 }
 
+static void ublk_nosrv_work(struct work_struct *work)
+{
+	struct ublk_device *ub =
+		container_of(work, struct ublk_device, nosrv_work);
+	int i;
+
+	if (ublk_nosrv_should_stop_dev(ub)) {
+		ublk_stop_dev(ub);
+		return;
+	}
+
+	mutex_lock(&ub->mutex);
+	if (ub->dev_info.state != UBLK_S_DEV_LIVE)
+		goto unlock;
+
+	if (ublk_nosrv_dev_should_queue_io(ub)) {
+		__ublk_quiesce_dev(ub);
+	} else {
+		blk_mq_quiesce_queue(ub->ub_disk->queue);
+		ub->dev_info.state = UBLK_S_DEV_FAIL_IO;
+		for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
+			ublk_get_queue(ub, i)->fail_io = true;
+		}
+		blk_mq_unquiesce_queue(ub->ub_disk->queue);
+	}
+
+ unlock:
+	mutex_unlock(&ub->mutex);
+	ublk_cancel_dev(ub);
+}
+
 /* device can only be started after all IOs are ready */
 static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
 {
@@ -1999,7 +2050,6 @@ static const struct file_operations ublk_ch_fops = {
 	.owner = THIS_MODULE,
 	.open = ublk_ch_open,
 	.release = ublk_ch_release,
-	.llseek = no_llseek,
 	.read_iter = ublk_ch_read_iter,
 	.write_iter = ublk_ch_write_iter,
 	.uring_cmd = ublk_ch_uring_cmd,
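quiesce_work and stop_work collapse above into a single nosrv_work that picks the policy at run time. A minimal sketch of that consolidation pattern (hypothetical demo_* names; the workqueue API is real):

	#include <linux/workqueue.h>

	struct demo_dev {
		struct work_struct nosrv_work;
		bool recoverable;
	};

	static void demo_stop(struct demo_dev *d) { /* tear the device down */ }
	static void demo_quiesce(struct demo_dev *d) { /* park it for recovery */ }

	static void demo_nosrv_work(struct work_struct *work)
	{
		struct demo_dev *d = container_of(work, struct demo_dev, nosrv_work);

		if (!d->recoverable) {
			demo_stop(d);		/* old stop_work path */
			return;
		}
		demo_quiesce(d);		/* old quiesce_work path */
	}

	/*
	 * Setup: INIT_WORK(&d->nosrv_work, demo_nosrv_work);
	 * every trigger site then just calls schedule_work(&d->nosrv_work),
	 * instead of choosing between two work items at each call site.
	 */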
@@ -2147,14 +2197,6 @@ static int ublk_add_chdev(struct ublk_device *ub)
 	return ret;
 }
 
-static void ublk_stop_work_fn(struct work_struct *work)
-{
-	struct ublk_device *ub =
-		container_of(work, struct ublk_device, stop_work);
-
-	ublk_stop_dev(ub);
-}
-
 /* align max io buffer size with PAGE_SIZE */
 static void ublk_align_max_io_size(struct ublk_device *ub)
 {
@@ -2171,7 +2213,6 @@ static int ublk_add_tag_set(struct ublk_device *ub)
 	ub->tag_set.queue_depth = ub->dev_info.queue_depth;
 	ub->tag_set.numa_node = NUMA_NO_NODE;
 	ub->tag_set.cmd_size = sizeof(struct ublk_rq_data);
-	ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
 	ub->tag_set.driver_data = ub;
 	return blk_mq_alloc_tag_set(&ub->tag_set);
 }
@@ -2179,10 +2220,9 @@ static int ublk_add_tag_set(struct ublk_device *ub)
 static void ublk_remove(struct ublk_device *ub)
 {
 	ublk_stop_dev(ub);
-	cancel_work_sync(&ub->stop_work);
-	cancel_work_sync(&ub->quiesce_work);
+	cancel_work_sync(&ub->nosrv_work);
 	cdev_device_del(&ub->cdev, &ub->cdev_dev);
-	put_device(&ub->cdev_dev);
+	ublk_put_device(ub);
 	ublks_added--;
 }
@@ -2205,12 +2245,58 @@ static struct ublk_device *ublk_get_device_from_id(int idx)
 static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
 {
 	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
+	const struct ublk_param_basic *p = &ub->params.basic;
 	int ublksrv_pid = (int)header->data[0];
+	struct queue_limits lim = {
+		.logical_block_size	= 1 << p->logical_bs_shift,
+		.physical_block_size	= 1 << p->physical_bs_shift,
+		.io_min			= 1 << p->io_min_shift,
+		.io_opt			= 1 << p->io_opt_shift,
+		.max_hw_sectors		= p->max_sectors,
+		.chunk_sectors		= p->chunk_sectors,
+		.virt_boundary_mask	= p->virt_boundary_mask,
+		.max_segments		= USHRT_MAX,
+		.max_segment_size	= UINT_MAX,
+		.dma_alignment		= 3,
+	};
 	struct gendisk *disk;
 	int ret = -EINVAL;
 
 	if (ublksrv_pid <= 0)
 		return -EINVAL;
+	if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC))
+		return -EINVAL;
+
+	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
+		const struct ublk_param_discard *pd = &ub->params.discard;
+
+		lim.discard_alignment = pd->discard_alignment;
+		lim.discard_granularity = pd->discard_granularity;
+		lim.max_hw_discard_sectors = pd->max_discard_sectors;
+		lim.max_write_zeroes_sectors = pd->max_write_zeroes_sectors;
+		lim.max_discard_segments = pd->max_discard_segments;
+	}
+
+	if (ub->params.types & UBLK_PARAM_TYPE_ZONED) {
+		const struct ublk_param_zoned *p = &ub->params.zoned;
+
+		if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
+			return -EOPNOTSUPP;
+
+		lim.features |= BLK_FEAT_ZONED;
+		lim.max_active_zones = p->max_active_zones;
+		lim.max_open_zones = p->max_open_zones;
+		lim.max_hw_zone_append_sectors = p->max_zone_append_sectors;
+	}
+
+	if (ub->params.basic.attrs & UBLK_ATTR_VOLATILE_CACHE) {
+		lim.features |= BLK_FEAT_WRITE_CACHE;
+		if (ub->params.basic.attrs & UBLK_ATTR_FUA)
+			lim.features |= BLK_FEAT_FUA;
+	}
+
+	if (ub->params.basic.attrs & UBLK_ATTR_ROTATIONAL)
+		lim.features |= BLK_FEAT_ROTATIONAL;
 
 	if (wait_for_completion_interruptible(&ub->completion) != 0)
 		return -EINTR;
@@ -2222,7 +2308,7 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
 		goto out_unlock;
 	}
 
-	disk = blk_mq_alloc_disk(&ub->tag_set, NULL);
+	disk = blk_mq_alloc_disk(&ub->tag_set, &lim, NULL);
 	if (IS_ERR(disk)) {
 		ret = PTR_ERR(disk);
 		goto out_unlock;
@@ -2234,15 +2320,13 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
 	ub->dev_info.ublksrv_pid = ublksrv_pid;
 	ub->ub_disk = disk;
 
-	ret = ublk_apply_params(ub);
-	if (ret)
-		goto out_put_disk;
+	ublk_apply_params(ub);
 
 	/* don't probe partitions if any one ubq daemon is un-trusted */
 	if (ub->nr_privileged_daemon != ub->nr_queues_ready)
 		set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
 
-	get_device(&ub->cdev_dev);
+	ublk_get_device(ub);
 	ub->dev_info.state = UBLK_S_DEV_LIVE;
 
 	if (ublk_dev_is_zoned(ub)) {
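The start_dev path above illustrates the queue-limits model: instead of applying blk_queue_*() setters after allocation, the driver fills a struct queue_limits up front and hands it to blk_mq_alloc_disk(), which now takes the limits as its second argument. A reduced sketch with made-up values:

	#include <linux/blk-mq.h>

	static int demo_add_disk(struct blk_mq_tag_set *set, void *data)
	{
		struct queue_limits lim = {
			.logical_block_size	= 512,		/* made-up values */
			.max_hw_sectors		= 2048,
			.features		= BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA,
		};
		struct gendisk *disk;

		disk = blk_mq_alloc_disk(set, &lim, data);
		if (IS_ERR(disk))
			return PTR_ERR(disk);

		/* ... set disk->fops and capacity, then add the disk ... */
		return 0;
	}

One consequence visible in this patch: ublk_apply_params() can no longer fail, since everything that used to fail is validated and folded into the limits before the disk exists.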
@@ -2259,10 +2343,9 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
 
 out_put_cdev:
 	if (ret) {
-		ub->dev_info.state = UBLK_S_DEV_DEAD;
+		ublk_detach_disk(ub);
 		ublk_put_device(ub);
 	}
-out_put_disk:
 	if (ret)
 		put_disk(disk);
 out_unlock:
@@ -2346,6 +2429,19 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
 	else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV))
 		return -EPERM;
 
+	/* forbid nonsense combinations of recovery flags */
+	switch (info.flags & UBLK_F_ALL_RECOVERY_FLAGS) {
+	case 0:
+	case UBLK_F_USER_RECOVERY:
+	case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE):
+	case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO):
+		break;
+	default:
+		pr_warn("%s: invalid recovery flags %llx\n", __func__,
+			info.flags & UBLK_F_ALL_RECOVERY_FLAGS);
+		return -EINVAL;
+	}
+
 	/*
 	 * unprivileged device can't be trusted, but RECOVERY and
 	 * RECOVERY_REISSUE still may hang error handling, so can't
@@ -2354,10 +2450,19 @@
 	 * TODO: provide forward progress for RECOVERY handler, so that
 	 * unprivileged device can benefit from it
 	 */
-	if (info.flags & UBLK_F_UNPRIVILEGED_DEV)
+	if (info.flags & UBLK_F_UNPRIVILEGED_DEV) {
 		info.flags &= ~(UBLK_F_USER_RECOVERY_REISSUE |
 				UBLK_F_USER_RECOVERY);
 
+		/*
+		 * For USER_COPY, we depend on userspace to fill the request
+		 * buffer by pwrite() to the ublk char device, which can't be
+		 * used for an unprivileged device
+		 */
+		if (info.flags & UBLK_F_USER_COPY)
+			return -EINVAL;
+	}
+
 	/* the created device is always owned by current user */
 	ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid);
 
@@ -2389,8 +2494,7 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
 		goto out_unlock;
 	mutex_init(&ub->mutex);
 	spin_lock_init(&ub->lock);
-	INIT_WORK(&ub->quiesce_work, ublk_quiesce_work_fn);
-	INIT_WORK(&ub->stop_work, ublk_stop_work_fn);
+	INIT_WORK(&ub->nosrv_work, ublk_nosrv_work);
 
 	ret = ublk_alloc_dev_number(ub, header->dev_id);
 	if (ret < 0)
@@ -2474,7 +2578,7 @@ static inline bool ublk_idr_freed(int id)
 	return ptr == NULL;
 }
 
-static int ublk_ctrl_del_dev(struct ublk_device **p_ub)
+static int ublk_ctrl_del_dev(struct ublk_device **p_ub, bool wait)
 {
 	struct ublk_device *ub = *p_ub;
 	int idx = ub->ub_number;
@@ -2508,7 +2612,7 @@ static int ublk_ctrl_del_dev(struct ublk_device **p_ub)
 	 * - the device number is freed already, we will not find this
 	 *   device via ublk_get_device_from_id()
 	 */
-	if (wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx)))
+	if (wait && wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx)))
 		return -EINTR;
 	return 0;
 }
@@ -2525,9 +2629,7 @@ static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd)
 static int ublk_ctrl_stop_dev(struct ublk_device *ub)
 {
 	ublk_stop_dev(ub);
-	cancel_work_sync(&ub->stop_work);
-	cancel_work_sync(&ub->quiesce_work);
-
+	cancel_work_sync(&ub->nosrv_work);
 	return 0;
 }
@@ -2613,9 +2715,12 @@ static int ublk_ctrl_set_params(struct ublk_device *ub,
 	if (ph.len > sizeof(struct ublk_params))
 		ph.len = sizeof(struct ublk_params);
 
-	/* parameters can only be changed when device isn't live */
 	mutex_lock(&ub->mutex);
-	if (ub->dev_info.state == UBLK_S_DEV_LIVE) {
+	if (test_bit(UB_STATE_USED, &ub->state)) {
+		/*
+		 * Parameters can only be changed when device hasn't
+		 * been started yet
+		 */
 		ret = -EACCES;
 	} else if (copy_from_user(&ub->params, argp, ph.len)) {
 		ret = -EFAULT;
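The ADD_DEV switch above rejects every recovery-flag combination except the four listed. A hypothetical userspace-side pre-check mirroring it (flag names from the <linux/ublk_cmd.h> UAPI header; UBLK_F_USER_RECOVERY_FAIL_IO assumes a kernel with this change applied):

	#include <stdbool.h>
	#include <stdint.h>
	#include <linux/ublk_cmd.h>

	static bool recovery_flags_valid(uint64_t flags)
	{
		uint64_t r = flags & (UBLK_F_USER_RECOVERY |
				      UBLK_F_USER_RECOVERY_REISSUE |
				      UBLK_F_USER_RECOVERY_FAIL_IO);

		return r == 0 ||
		       r == UBLK_F_USER_RECOVERY ||
		       r == (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE) ||
		       r == (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO);
	}

In particular, REISSUE and FAIL_IO without USER_RECOVERY, or combined with each other, are rejected with -EINVAL rather than silently picking one behavior.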
@@ -2664,7 +2769,9 @@ static int ublk_ctrl_start_recovery(struct ublk_device *ub,
 	int i;
 
 	mutex_lock(&ub->mutex);
-	if (!ublk_can_use_recovery(ub))
+	if (ublk_nosrv_should_stop_dev(ub))
+		goto out_unlock;
+	if (!ub->nr_queues_ready)
 		goto out_unlock;
 	/*
 	 * START_RECOVERY is only allowed after:
@@ -2673,14 +2780,18 @@ static int ublk_ctrl_start_recovery(struct ublk_device *ub,
 	 *     and related io_uring ctx is freed so file struct of /dev/ublkcX is
 	 *     released.
 	 *
+	 * and one of the following holds
+	 *
 	 * (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work:
 	 *     (a)has quiesced request queue
 	 *     (b)has requeued every inflight rqs whose io_flags is ACTIVE
 	 *     (c)has requeued/aborted every inflight rqs whose io_flags is NOT ACTIVE
 	 *     (d)has completed/canceled all ioucmds owned by the dying process
+	 *
+	 * (3) UBLK_S_DEV_FAIL_IO is set, which means the queue is not
+	 *     quiesced, but all I/O is being immediately errored
 	 */
-	if (test_bit(UB_STATE_OPEN, &ub->state) ||
-			ub->dev_info.state != UBLK_S_DEV_QUIESCED) {
+	if (test_bit(UB_STATE_OPEN, &ub->state) || !ublk_dev_in_recoverable_state(ub)) {
 		ret = -EBUSY;
 		goto out_unlock;
 	}
@@ -2704,6 +2815,7 @@ static int ublk_ctrl_end_recovery(struct ublk_device *ub,
 	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
 	int ublksrv_pid = (int)header->data[0];
 	int ret = -EINVAL;
+	int i;
 
 	pr_devel("%s: Waiting for new ubq_daemons(nr: %d) are ready, dev id %d...\n",
 			__func__, ub->dev_info.nr_hw_queues, header->dev_id);
@@ -2715,21 +2827,32 @@ static int ublk_ctrl_end_recovery(struct ublk_device *ub,
 			__func__, ub->dev_info.nr_hw_queues, header->dev_id);
 
 	mutex_lock(&ub->mutex);
-	if (!ublk_can_use_recovery(ub))
+	if (ublk_nosrv_should_stop_dev(ub))
 		goto out_unlock;
 
-	if (ub->dev_info.state != UBLK_S_DEV_QUIESCED) {
+	if (!ublk_dev_in_recoverable_state(ub)) {
 		ret = -EBUSY;
 		goto out_unlock;
 	}
 	ub->dev_info.ublksrv_pid = ublksrv_pid;
 	pr_devel("%s: new ublksrv_pid %d, dev id %d\n",
 			__func__, ublksrv_pid, header->dev_id);
-	blk_mq_unquiesce_queue(ub->ub_disk->queue);
-	pr_devel("%s: queue unquiesced, dev id %d.\n",
-			__func__, header->dev_id);
-	blk_mq_kick_requeue_list(ub->ub_disk->queue);
-	ub->dev_info.state = UBLK_S_DEV_LIVE;
+
+	if (ublk_nosrv_dev_should_queue_io(ub)) {
+		ub->dev_info.state = UBLK_S_DEV_LIVE;
+		blk_mq_unquiesce_queue(ub->ub_disk->queue);
+		pr_devel("%s: queue unquiesced, dev id %d.\n",
+				__func__, header->dev_id);
+		blk_mq_kick_requeue_list(ub->ub_disk->queue);
+	} else {
+		blk_mq_quiesce_queue(ub->ub_disk->queue);
+		ub->dev_info.state = UBLK_S_DEV_LIVE;
+		for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
+			ublk_get_queue(ub, i)->fail_io = false;
+		}
+		blk_mq_unquiesce_queue(ub->ub_disk->queue);
+	}
+
 	ret = 0;
  out_unlock:
 	mutex_unlock(&ub->mutex);
@@ -2907,7 +3030,10 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
 		ret = ublk_ctrl_add_dev(cmd);
 		break;
 	case UBLK_CMD_DEL_DEV:
-		ret = ublk_ctrl_del_dev(&ub);
+		ret = ublk_ctrl_del_dev(&ub, true);
+		break;
+	case UBLK_CMD_DEL_DEV_ASYNC:
+		ret = ublk_ctrl_del_dev(&ub, false);
 		break;
	case UBLK_CMD_GET_QUEUE_AFFINITY:
 		ret = ublk_ctrl_get_queue_affinity(ub, cmd);
@@ -2925,7 +3051,7 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
 		ret = ublk_ctrl_end_recovery(ub, cmd);
 		break;
 	default:
-		ret = -ENOTSUPP;
+		ret = -EOPNOTSUPP;
 		break;
 	}
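Both FAIL_IO branches (here and in ublk_nosrv_work) use the same bracket: quiesce the queue so ->queue_rq() cannot observe a half-updated state, flip the per-queue flags, then unquiesce. As a standalone sketch (hypothetical device type; the blk-mq calls are real):

	#include <linux/blk-mq.h>

	struct demo_dev {
		struct request_queue *q;
		bool *fail_io;		/* one flag per hw queue */
		int nr_queues;
	};

	static void demo_set_fail_io(struct demo_dev *d, bool fail)
	{
		int i;

		blk_mq_quiesce_queue(d->q);	/* dispatch drained and blocked */
		for (i = 0; i < d->nr_queues; i++)
			d->fail_io[i] = fail;
		blk_mq_unquiesce_queue(d->q);	/* dispatch resumes, sees new state */
	}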
@@ -3019,4 +3145,5 @@ module_param_cb(ublks_max, &ublk_max_ublks_ops, &ublks_max, 0644);
 MODULE_PARM_DESC(ublks_max, "max number of ublk devices allowed to add(default: 64)");
 
 MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>");
+MODULE_DESCRIPTION("Userspace block device");
 MODULE_LICENSE("GPL");
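One detail worth spelling out: the control path dispatches on _IOC_NR() of the ioctl-encoded opcode, which is why the patch mirrors UBLK_U_CMD_DEL_DEV_ASYNC into the legacy command-number space near the top of the file. A minimal illustration with a hypothetical command:

	#include <linux/ioctl.h>

	#define DEMO_U_CMD_PING	_IO('u', 0x2a)			/* ioctl-encoded opcode */
	#define DEMO_CMD_PING	_IOC_NR(DEMO_U_CMD_PING)	/* == 0x2a, usable as a switch label */

With this mirror in place, the same switch statement can match both the old raw command numbers and the new ioctl-encoded opcodes after the _IOC_NR() reduction.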