Diffstat (limited to 'drivers/block/ublk_drv.c')
-rw-r--r--  drivers/block/ublk_drv.c  578
1 file changed, 356 insertions, 222 deletions
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 9fd284fa76dc..6561d2a561fa 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -48,6 +48,8 @@
#define UBLK_MINORS (1U << MINORBITS)
+#define UBLK_INVALID_BUF_IDX ((u16)-1)
+
/* private ioctl command mirror */
#define UBLK_CMD_DEL_DEV_ASYNC _IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC)
#define UBLK_CMD_UPDATE_SIZE _IOC_NR(UBLK_U_CMD_UPDATE_SIZE)
@@ -70,7 +72,8 @@
| UBLK_F_UPDATE_SIZE \
| UBLK_F_AUTO_BUF_REG \
| UBLK_F_QUIESCE \
- | UBLK_F_PER_IO_DAEMON)
+ | UBLK_F_PER_IO_DAEMON \
+ | UBLK_F_BUF_REG_OFF_DAEMON)
#define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
| UBLK_F_USER_RECOVERY_REISSUE \
@@ -82,14 +85,6 @@
UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED | \
UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT)
-struct ublk_rq_data {
- refcount_t ref;
-
- /* for auto-unregister buffer in case of UBLK_F_AUTO_BUF_REG */
- u16 buf_index;
- void *buf_ctx_handle;
-};
-
struct ublk_uring_cmd_pdu {
/*
* Store requests in same batch temporarily for queuing them to
@@ -110,8 +105,6 @@ struct ublk_uring_cmd_pdu {
*/
struct ublk_queue *ubq;
- struct ublk_auto_buf_reg buf;
-
u16 tag;
};
@@ -155,9 +148,19 @@ struct ublk_uring_cmd_pdu {
/* atomic RW with ubq->cancel_lock */
#define UBLK_IO_FLAG_CANCELED 0x80000000
+/*
+ * Initialize refcount to a large number to include any registered buffers.
+ * UBLK_IO_COMMIT_AND_FETCH_REQ will release these references minus those for
+ * any buffers registered on the io daemon task.
+ */
+#define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2)
+
struct ublk_io {
/* userspace buffer address from io cmd */
- __u64 addr;
+ union {
+ __u64 addr;
+ struct ublk_auto_buf_reg buf;
+ };
unsigned int flags;
int res;
@@ -169,7 +172,24 @@ struct ublk_io {
};
struct task_struct *task;
-};
+
+ /*
+ * The number of uses of this I/O by the ublk server
+ * if user copy or zero copy are enabled:
+ * - UBLK_REFCOUNT_INIT from dispatch to the server
+ * until UBLK_IO_COMMIT_AND_FETCH_REQ
+ * - 1 for each inflight ublk_ch_{read,write}_iter() call
+ * - 1 for each io_uring registered buffer not registered on task
+ * The I/O can only be completed once all references are dropped.
+ * User copy and buffer registration operations are only permitted
+ * if the reference count is nonzero.
+ */
+ refcount_t ref;
+ /* Count of buffers registered on task and not yet unregistered */
+ unsigned task_registered_buffers;
+
+ void *buf_ctx_handle;
+} ____cacheline_aligned_in_smp;
struct ublk_queue {
int q_id;
@@ -216,6 +236,9 @@ struct ublk_device {
struct completion completion;
unsigned int nr_queues_ready;
unsigned int nr_privileged_daemon;
+ struct mutex cancel_mutex;
+ bool canceling;
+ pid_t ublksrv_tgid;
};
/* header of ublk_params */
@@ -228,7 +251,8 @@ static void ublk_io_release(void *priv);
static void ublk_stop_dev_unlocked(struct ublk_device *ub);
static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq);
static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
- const struct ublk_queue *ubq, int tag, size_t offset);
+ const struct ublk_queue *ubq, struct ublk_io *io,
+ size_t offset);
static inline unsigned int ublk_req_build_flags(struct request *req);
static inline struct ublksrv_io_desc *
@@ -673,38 +697,29 @@ static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
}
static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
- struct request *req)
+ struct ublk_io *io)
{
- if (ublk_need_req_ref(ubq)) {
- struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
-
- refcount_set(&data->ref, 1);
- }
+ if (ublk_need_req_ref(ubq))
+ refcount_set(&io->ref, UBLK_REFCOUNT_INIT);
}
-static inline bool ublk_get_req_ref(const struct ublk_queue *ubq,
- struct request *req)
+static inline bool ublk_get_req_ref(struct ublk_io *io)
{
- if (ublk_need_req_ref(ubq)) {
- struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
-
- return refcount_inc_not_zero(&data->ref);
- }
+ return refcount_inc_not_zero(&io->ref);
+}
- return true;
+static inline void ublk_put_req_ref(struct ublk_io *io, struct request *req)
+{
+ if (refcount_dec_and_test(&io->ref))
+ __ublk_complete_rq(req);
}
-static inline void ublk_put_req_ref(const struct ublk_queue *ubq,
- struct request *req)
+static inline bool ublk_sub_req_ref(struct ublk_io *io)
{
- if (ublk_need_req_ref(ubq)) {
- struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
+ unsigned sub_refs = UBLK_REFCOUNT_INIT - io->task_registered_buffers;
- if (refcount_dec_and_test(&data->ref))
- __ublk_complete_rq(req);
- } else {
- __ublk_complete_rq(req);
- }
+ io->task_registered_buffers = 0;
+ return refcount_sub_and_test(sub_refs, &io->ref);
}
static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
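The helpers above lean on arithmetic that is easy to misread: ublk_init_req_ref() starts the count at the huge UBLK_REFCOUNT_INIT, registrations on the daemon task are only tallied in task_registered_buffers, off-task registrations take real references, and ublk_sub_req_ref() drops UBLK_REFCOUNT_INIT minus the on-task count at commit time. Below is a minimal, hypothetical userspace model (plain unsigned arithmetic standing in for refcount_t; the macro name is invented and this is not driver code) that walks one request through the scheme.

#include <assert.h>
#include <stdio.h>

/* REFCOUNT_MAX is INT_MAX in the kernel; model UBLK_REFCOUNT_INIT the same way */
#define MODEL_REFCOUNT_INIT (0x7fffffffu / 2)

int main(void)
{
	/* ublk_init_req_ref(): request dispatched to the ublk server */
	unsigned int ref = MODEL_REFCOUNT_INIT;
	unsigned int task_registered_buffers = 0;

	/* two buffers registered on the daemon task: counted, no atomic needed */
	task_registered_buffers += 2;

	/* one buffer registered off task (ublk_register_io_buf): real reference */
	ref += 1;
	/* ...and later released off task (ublk_io_release -> ublk_put_req_ref) */
	ref -= 1;

	/* UBLK_IO_COMMIT_AND_FETCH_REQ: ublk_sub_req_ref() */
	ref -= MODEL_REFCOUNT_INIT - task_registered_buffers;
	task_registered_buffers = 0;
	assert(ref == 2);	/* one reference left per still-registered buffer */

	/* remaining buffer unregistrations now drop plain references */
	ref -= 2;
	assert(ref == 0);	/* this is where __ublk_complete_rq() would run */
	printf("request completes once ref reaches %u\n", ref);
	return 0;
}

After the commit step the surviving count equals the number of still-registered buffers, which is why each later ublk_io_release() can simply drop one reference and the last drop completes the request.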
@@ -981,7 +996,7 @@ static inline bool ublk_need_unmap_req(const struct request *req)
}
static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
- struct ublk_io *io)
+ const struct ublk_io *io)
{
const unsigned int rq_bytes = blk_rq_bytes(req);
@@ -1005,7 +1020,7 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
static int ublk_unmap_io(const struct ublk_queue *ubq,
const struct request *req,
- struct ublk_io *io)
+ const struct ublk_io *io)
{
const unsigned int rq_bytes = blk_rq_bytes(req);
@@ -1140,7 +1155,7 @@ static inline void __ublk_complete_rq(struct request *req)
if (blk_update_request(req, BLK_STS_OK, io->res))
blk_mq_requeue_request(req, true);
- else
+ else if (likely(!blk_should_fake_timeout(req->q)))
__blk_mq_end_request(req, BLK_STS_OK);
return;
@@ -1188,39 +1203,33 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq,
blk_mq_end_request(rq, BLK_STS_IOERR);
}
-static void ublk_auto_buf_reg_fallback(struct request *req)
+static void
+ublk_auto_buf_reg_fallback(const struct ublk_queue *ubq, struct ublk_io *io)
{
- const struct ublk_queue *ubq = req->mq_hctx->driver_data;
- struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
- struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
+ unsigned tag = io - ubq->ios;
+ struct ublksrv_io_desc *iod = ublk_get_iod(ubq, tag);
iod->op_flags |= UBLK_IO_F_NEED_REG_BUF;
- refcount_set(&data->ref, 1);
}
-static bool ublk_auto_buf_reg(struct request *req, struct ublk_io *io,
- unsigned int issue_flags)
+static bool ublk_auto_buf_reg(const struct ublk_queue *ubq, struct request *req,
+ struct ublk_io *io, unsigned int issue_flags)
{
- struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(io->cmd);
- struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
int ret;
ret = io_buffer_register_bvec(io->cmd, req, ublk_io_release,
- pdu->buf.index, issue_flags);
+ io->buf.index, issue_flags);
if (ret) {
- if (pdu->buf.flags & UBLK_AUTO_BUF_REG_FALLBACK) {
- ublk_auto_buf_reg_fallback(req);
+ if (io->buf.flags & UBLK_AUTO_BUF_REG_FALLBACK) {
+ ublk_auto_buf_reg_fallback(ubq, io);
return true;
}
blk_mq_end_request(req, BLK_STS_IOERR);
return false;
}
- /* one extra reference is dropped by ublk_io_release */
- refcount_set(&data->ref, 2);
- data->buf_ctx_handle = io_uring_cmd_ctx_handle(io->cmd);
- /* store buffer index in request payload */
- data->buf_index = pdu->buf.index;
+ io->task_registered_buffers = 1;
+ io->buf_ctx_handle = io_uring_cmd_ctx_handle(io->cmd);
io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG;
return true;
}
@@ -1229,10 +1238,10 @@ static bool ublk_prep_auto_buf_reg(struct ublk_queue *ubq,
struct request *req, struct ublk_io *io,
unsigned int issue_flags)
{
+ ublk_init_req_ref(ubq, io);
if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req))
- return ublk_auto_buf_reg(req, io, issue_flags);
+ return ublk_auto_buf_reg(ubq, req, io, issue_flags);
- ublk_init_req_ref(ubq, req);
return true;
}
@@ -1356,14 +1365,23 @@ static void ublk_queue_cmd_list(struct ublk_io *io, struct rq_list *l)
static enum blk_eh_timer_return ublk_timeout(struct request *rq)
{
struct ublk_queue *ubq = rq->mq_hctx->driver_data;
- struct ublk_io *io = &ubq->ios[rq->tag];
+ pid_t tgid = ubq->dev->ublksrv_tgid;
+ struct task_struct *p;
+ struct pid *pid;
- if (ubq->flags & UBLK_F_UNPRIVILEGED_DEV) {
- send_sig(SIGKILL, io->task, 0);
- return BLK_EH_DONE;
- }
+ if (!(ubq->flags & UBLK_F_UNPRIVILEGED_DEV))
+ return BLK_EH_RESET_TIMER;
- return BLK_EH_RESET_TIMER;
+ if (unlikely(!tgid))
+ return BLK_EH_RESET_TIMER;
+
+ rcu_read_lock();
+ pid = find_vpid(tgid);
+ p = pid_task(pid, PIDTYPE_PID);
+ if (p)
+ send_sig(SIGKILL, p, 0);
+ rcu_read_unlock();
+ return BLK_EH_DONE;
}
static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq,
@@ -1504,6 +1522,9 @@ static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
put_task_struct(io->task);
io->task = NULL;
}
+
+ WARN_ON_ONCE(refcount_read(&io->ref));
+ WARN_ON_ONCE(io->task_registered_buffers);
}
}
@@ -1515,6 +1536,7 @@ static int ublk_ch_open(struct inode *inode, struct file *filp)
if (test_and_set_bit(UB_STATE_OPEN, &ub->state))
return -EBUSY;
filp->private_data = ub;
+ ub->ublksrv_tgid = current->tgid;
return 0;
}
@@ -1529,6 +1551,7 @@ static void ublk_reset_ch_dev(struct ublk_device *ub)
ub->mm = NULL;
ub->nr_queues_ready = 0;
ub->nr_privileged_daemon = 0;
+ ub->ublksrv_tgid = -1;
}
static struct gendisk *ublk_get_disk(struct ublk_device *ub)
@@ -1550,6 +1573,27 @@ static void ublk_put_disk(struct gendisk *disk)
put_device(disk_to_dev(disk));
}
+/*
+ * Use this function to ensure that ->canceling is consistently set for
+ * the device and all queues. Do not set these flags directly.
+ *
+ * Caller must ensure that:
+ * - cancel_mutex is held. This ensures that there is no concurrent
+ * access to ub->canceling and no concurrent writes to ubq->canceling.
+ * - there are no concurrent reads of ubq->canceling from the queue_rq
+ * path. This can be done by quiescing the queue, or through other
+ * means.
+ */
+static void ublk_set_canceling(struct ublk_device *ub, bool canceling)
+ __must_hold(&ub->cancel_mutex)
+{
+ int i;
+
+ ub->canceling = canceling;
+ for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
+ ublk_get_queue(ub, i)->canceling = canceling;
+}
+
static int ublk_ch_release(struct inode *inode, struct file *filp)
{
struct ublk_device *ub = filp->private_data;
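As a usage note for ublk_set_canceling(): a caller that may race with ublk_queue_rq() takes cancel_mutex and quiesces the queue around the update. A minimal sketch (hypothetical helper name; it simply mirrors what ublk_start_cancel() and ublk_ctrl_quiesce_dev() do later in this patch):

/*
 * Sketch only (hypothetical helper): mirrors how ublk_start_cancel() and
 * ublk_ctrl_quiesce_dev() below drive ublk_set_canceling().
 */
static void example_mark_device_canceling(struct ublk_device *ub,
					  struct gendisk *disk)
{
	mutex_lock(&ub->cancel_mutex);
	/* quiescing stops concurrent ubq->canceling readers in ublk_queue_rq() */
	blk_mq_quiesce_queue(disk->queue);
	ublk_set_canceling(ub, true);
	blk_mq_unquiesce_queue(disk->queue);
	mutex_unlock(&ub->cancel_mutex);
}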
@@ -1578,12 +1622,11 @@ static int ublk_ch_release(struct inode *inode, struct file *filp)
* All requests may be inflight, so ->canceling may not be set, set
* it now.
*/
- for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
- struct ublk_queue *ubq = ublk_get_queue(ub, i);
-
- ubq->canceling = true;
- ublk_abort_queue(ub, ubq);
- }
+ mutex_lock(&ub->cancel_mutex);
+ ublk_set_canceling(ub, true);
+ for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
+ ublk_abort_queue(ub, ublk_get_queue(ub, i));
+ mutex_unlock(&ub->cancel_mutex);
blk_mq_kick_requeue_list(disk->queue);
/*
@@ -1706,23 +1749,17 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
}
}
-/* Must be called when queue is frozen */
-static void ublk_mark_queue_canceling(struct ublk_queue *ubq)
+static void ublk_start_cancel(struct ublk_device *ub)
{
- spin_lock(&ubq->cancel_lock);
- if (!ubq->canceling)
- ubq->canceling = true;
- spin_unlock(&ubq->cancel_lock);
-}
-
-static void ublk_start_cancel(struct ublk_queue *ubq)
-{
- struct ublk_device *ub = ubq->dev;
struct gendisk *disk = ublk_get_disk(ub);
/* Our disk has been dead */
if (!disk)
return;
+
+ mutex_lock(&ub->cancel_mutex);
+ if (ub->canceling)
+ goto out;
/*
* Now we are serialized with ublk_queue_rq()
*
@@ -1731,8 +1768,10 @@ static void ublk_start_cancel(struct ublk_queue *ubq)
* touch completed uring_cmd
*/
blk_mq_quiesce_queue(disk->queue);
- ublk_mark_queue_canceling(ubq);
+ ublk_set_canceling(ub, true);
blk_mq_unquiesce_queue(disk->queue);
+out:
+ mutex_unlock(&ub->cancel_mutex);
ublk_put_disk(disk);
}
@@ -1805,8 +1844,7 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
if (WARN_ON_ONCE(task && task != io->task))
return;
- if (!ubq->canceling)
- ublk_start_cancel(ubq);
+ ublk_start_cancel(ubq->dev);
WARN_ON_ONCE(io->cmd != cmd);
ublk_cancel_cmd(ubq, pdu->tag, issue_flags);
@@ -1930,9 +1968,11 @@ static void ublk_reset_io_flags(struct ublk_device *ub)
for (j = 0; j < ubq->q_depth; j++)
ubq->ios[j].flags &= ~UBLK_IO_FLAG_CANCELED;
spin_unlock(&ubq->cancel_lock);
- ubq->canceling = false;
ubq->fail_io = false;
}
+ mutex_lock(&ub->cancel_mutex);
+ ublk_set_canceling(ub, false);
+ mutex_unlock(&ub->cancel_mutex);
}
/* device can only be started after all IOs are ready */
@@ -1967,12 +2007,66 @@ static inline int ublk_check_cmd_op(u32 cmd_op)
return 0;
}
-static inline void ublk_fill_io_cmd(struct ublk_io *io,
- struct io_uring_cmd *cmd, unsigned long buf_addr)
+static inline int ublk_set_auto_buf_reg(struct ublk_io *io, struct io_uring_cmd *cmd)
+{
+ io->buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr));
+
+ if (io->buf.reserved0 || io->buf.reserved1)
+ return -EINVAL;
+
+ if (io->buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK)
+ return -EINVAL;
+ return 0;
+}
+
+static int ublk_handle_auto_buf_reg(struct ublk_io *io,
+ struct io_uring_cmd *cmd,
+ u16 *buf_idx)
{
+ if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG) {
+ io->flags &= ~UBLK_IO_FLAG_AUTO_BUF_REG;
+
+ /*
+ * `UBLK_F_AUTO_BUF_REG` only works iff `UBLK_IO_FETCH_REQ`
+ * and `UBLK_IO_COMMIT_AND_FETCH_REQ` are issued from the same
+ * `io_ring_ctx`.
+ *
+ * If this uring_cmd's io_ring_ctx isn't the same as the one
+ * used to register the buffer, it is the ublk server's
+ * responsibility to unregister the buffer; otherwise this
+ * ublk request gets stuck.
+ */
+ if (io->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd))
+ *buf_idx = io->buf.index;
+ }
+
+ return ublk_set_auto_buf_reg(io, cmd);
+}
+
+/* Once we return, `io->req` can't be used any more */
+static inline struct request *
+ublk_fill_io_cmd(struct ublk_io *io, struct io_uring_cmd *cmd)
+{
+ struct request *req = io->req;
+
io->cmd = cmd;
io->flags |= UBLK_IO_FLAG_ACTIVE;
+ /* now this cmd slot is owned by ublk driver */
+ io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
+
+ return req;
+}
+
+static inline int
+ublk_config_io_buf(const struct ublk_queue *ubq, struct ublk_io *io,
+ struct io_uring_cmd *cmd, unsigned long buf_addr,
+ u16 *buf_idx)
+{
+ if (ublk_support_auto_buf_reg(ubq))
+ return ublk_handle_auto_buf_reg(io, cmd, buf_idx);
+
io->addr = buf_addr;
+ return 0;
}
static inline void ublk_prep_cancel(struct io_uring_cmd *cmd,
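For context on ublk_set_auto_buf_reg() above: with UBLK_F_AUTO_BUF_REG the server passes its buffer-registration descriptor packed into the uring_cmd's sqe->addr, which the driver unpacks with ublk_sqe_addr_to_auto_buf_reg(). A hedged server-side sketch of the inverse packing follows; the bit layout (index in bits 0-15, flags in bits 16-23, reserved fields above) is assumed from the uapi struct ublk_auto_buf_reg and its decode helper, and the function name is invented for illustration.

#include <linux/ublk_cmd.h>

/*
 * Sketch: pack a struct ublk_auto_buf_reg into the 64-bit sqe->addr value,
 * i.e. the inverse of ublk_sqe_addr_to_auto_buf_reg() used by the driver.
 * Assumed bit layout, mirroring the uapi decode helper: index in bits 0-15,
 * flags in bits 16-23, reserved0 in bits 24-31, reserved1 in bits 32-63.
 */
static inline __u64 example_auto_buf_reg_to_sqe_addr(__u16 buf_index, __u8 flags)
{
	struct ublk_auto_buf_reg reg = {
		.index = buf_index,
		.flags = flags,		/* e.g. UBLK_AUTO_BUF_REG_FALLBACK */
	};

	return (__u64)reg.index |
	       ((__u64)reg.flags << 16) |
	       ((__u64)reg.reserved0 << 24) |
	       ((__u64)reg.reserved1 << 32);
}

The resulting value is what a FETCH_REQ or COMMIT_AND_FETCH_REQ SQE carries in ->addr instead of a userspace buffer address.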
@@ -1990,30 +2084,25 @@ static inline void ublk_prep_cancel(struct io_uring_cmd *cmd,
io_uring_cmd_mark_cancelable(cmd, issue_flags);
}
-static inline int ublk_set_auto_buf_reg(struct io_uring_cmd *cmd)
-{
- struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
-
- pdu->buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr));
-
- if (pdu->buf.reserved0 || pdu->buf.reserved1)
- return -EINVAL;
-
- if (pdu->buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK)
- return -EINVAL;
- return 0;
-}
-
static void ublk_io_release(void *priv)
{
struct request *rq = priv;
struct ublk_queue *ubq = rq->mq_hctx->driver_data;
+ struct ublk_io *io = &ubq->ios[rq->tag];
- ublk_put_req_ref(ubq, rq);
+ /*
+ * task_registered_buffers may be 0 if buffers were registered off task
+ * but unregistered on task, or after UBLK_IO_COMMIT_AND_FETCH_REQ.
+ */
+ if (current == io->task && io->task_registered_buffers)
+ io->task_registered_buffers--;
+ else
+ ublk_put_req_ref(io, rq);
}
static int ublk_register_io_buf(struct io_uring_cmd *cmd,
- const struct ublk_queue *ubq, unsigned int tag,
+ const struct ublk_queue *ubq,
+ struct ublk_io *io,
unsigned int index, unsigned int issue_flags)
{
struct ublk_device *ub = cmd->file->private_data;
@@ -2023,30 +2112,75 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd,
if (!ublk_support_zero_copy(ubq))
return -EINVAL;
- req = __ublk_check_and_get_req(ub, ubq, tag, 0);
+ req = __ublk_check_and_get_req(ub, ubq, io, 0);
if (!req)
return -EINVAL;
ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
issue_flags);
if (ret) {
- ublk_put_req_ref(ubq, req);
+ ublk_put_req_ref(io, req);
return ret;
}
return 0;
}
+static int
+ublk_daemon_register_io_buf(struct io_uring_cmd *cmd,
+ const struct ublk_queue *ubq, struct ublk_io *io,
+ unsigned index, unsigned issue_flags)
+{
+ unsigned new_registered_buffers;
+ struct request *req = io->req;
+ int ret;
+
+ /*
+ * Ensure there are still references for ublk_sub_req_ref() to release.
+ * If not, fall back on the thread-safe buffer registration.
+ */
+ new_registered_buffers = io->task_registered_buffers + 1;
+ if (unlikely(new_registered_buffers >= UBLK_REFCOUNT_INIT))
+ return ublk_register_io_buf(cmd, ubq, io, index, issue_flags);
+
+ if (!ublk_support_zero_copy(ubq) || !ublk_rq_has_data(req))
+ return -EINVAL;
+
+ ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
+ issue_flags);
+ if (ret)
+ return ret;
+
+ io->task_registered_buffers = new_registered_buffers;
+ return 0;
+}
+
static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
- const struct ublk_queue *ubq,
+ const struct ublk_device *ub,
unsigned int index, unsigned int issue_flags)
{
- if (!ublk_support_zero_copy(ubq))
+ if (!(ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY))
return -EINVAL;
return io_buffer_unregister_bvec(cmd, index, issue_flags);
}
+static int ublk_check_fetch_buf(const struct ublk_queue *ubq, __u64 buf_addr)
+{
+ if (ublk_need_map_io(ubq)) {
+ /*
+ * FETCH_RQ has to provide IO buffer if NEED GET
+ * DATA is not enabled
+ */
+ if (!buf_addr && !ublk_need_get_data(ubq))
+ return -EINVAL;
+ } else if (buf_addr) {
+ /* User copy requires addr to be unset */
+ return -EINVAL;
+ }
+ return 0;
+}
+
static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq,
struct ublk_io *io, __u64 buf_addr)
{
@@ -2073,26 +2207,11 @@ static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq,
WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV);
- if (ublk_need_map_io(ubq)) {
- /*
- * FETCH_RQ has to provide IO buffer if NEED GET
- * DATA is not enabled
- */
- if (!buf_addr && !ublk_need_get_data(ubq))
- goto out;
- } else if (buf_addr) {
- /* User copy requires addr to be unset */
- ret = -EINVAL;
+ ublk_fill_io_cmd(io, cmd);
+ ret = ublk_config_io_buf(ubq, io, cmd, buf_addr, NULL);
+ if (ret)
goto out;
- }
- if (ublk_support_auto_buf_reg(ubq)) {
- ret = ublk_set_auto_buf_reg(cmd);
- if (ret)
- goto out;
- }
-
- ublk_fill_io_cmd(io, cmd, buf_addr);
WRITE_ONCE(io->task, get_task_struct(current));
ublk_mark_io_ready(ub, ubq);
out:
@@ -2100,10 +2219,8 @@ out:
return ret;
}
-static int ublk_commit_and_fetch(const struct ublk_queue *ubq,
- struct ublk_io *io, struct io_uring_cmd *cmd,
- const struct ublksrv_io_cmd *ub_cmd,
- unsigned int issue_flags)
+static int ublk_check_commit_and_fetch(const struct ublk_queue *ubq,
+ struct ublk_io *io, __u64 buf_addr)
{
struct request *req = io->req;
@@ -2112,10 +2229,10 @@ static int ublk_commit_and_fetch(const struct ublk_queue *ubq,
* COMMIT_AND_FETCH_REQ has to provide IO buffer if
* NEED GET DATA is not enabled or it is Read IO.
*/
- if (!ub_cmd->addr && (!ublk_need_get_data(ubq) ||
+ if (!buf_addr && (!ublk_need_get_data(ubq) ||
req_op(req) == REQ_OP_READ))
return -EINVAL;
- } else if (req_op(req) != REQ_OP_ZONE_APPEND && ub_cmd->addr) {
+ } else if (req_op(req) != REQ_OP_ZONE_APPEND && buf_addr) {
/*
* User copy requires addr to be unset when command is
* not zone append
@@ -2123,48 +2240,17 @@ static int ublk_commit_and_fetch(const struct ublk_queue *ubq,
return -EINVAL;
}
- if (ublk_support_auto_buf_reg(ubq)) {
- int ret;
-
- /*
- * `UBLK_F_AUTO_BUF_REG` only works iff `UBLK_IO_FETCH_REQ`
- * and `UBLK_IO_COMMIT_AND_FETCH_REQ` are issued from same
- * `io_ring_ctx`.
- *
- * If this uring_cmd's io_ring_ctx isn't same with the
- * one for registering the buffer, it is ublk server's
- * responsibility for unregistering the buffer, otherwise
- * this ublk request gets stuck.
- */
- if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG) {
- struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
-
- if (data->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd))
- io_buffer_unregister_bvec(cmd, data->buf_index,
- issue_flags);
- io->flags &= ~UBLK_IO_FLAG_AUTO_BUF_REG;
- }
-
- ret = ublk_set_auto_buf_reg(cmd);
- if (ret)
- return ret;
- }
-
- ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
-
- /* now this cmd slot is owned by ublk driver */
- io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
- io->res = ub_cmd->result;
-
- if (req_op(req) == REQ_OP_ZONE_APPEND)
- req->__sector = ub_cmd->zone_append_lba;
-
- if (likely(!blk_should_fake_timeout(req->q)))
- ublk_put_req_ref(ubq, req);
-
return 0;
}
+static bool ublk_need_complete_req(const struct ublk_queue *ubq,
+ struct ublk_io *io)
+{
+ if (ublk_need_req_ref(ubq))
+ return ublk_sub_req_ref(io);
+ return true;
+}
+
static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io,
struct request *req)
{
@@ -2187,19 +2273,33 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
unsigned int issue_flags,
const struct ublksrv_io_cmd *ub_cmd)
{
+ u16 buf_idx = UBLK_INVALID_BUF_IDX;
struct ublk_device *ub = cmd->file->private_data;
- struct task_struct *task;
struct ublk_queue *ubq;
struct ublk_io *io;
u32 cmd_op = cmd->cmd_op;
unsigned tag = ub_cmd->tag;
- int ret = -EINVAL;
struct request *req;
+ int ret;
+ bool compl;
pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
__func__, cmd->cmd_op, ub_cmd->q_id, tag,
ub_cmd->result);
+ ret = ublk_check_cmd_op(cmd_op);
+ if (ret)
+ goto out;
+
+ /*
+ * io_buffer_unregister_bvec() doesn't access the ubq or io,
+ * so no need to validate the q_id, tag, or task
+ */
+ if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF)
+ return ublk_unregister_io_buf(cmd, ub, ub_cmd->addr,
+ issue_flags);
+
+ ret = -EINVAL;
if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues)
goto out;
@@ -2209,21 +2309,37 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
goto out;
io = &ubq->ios[tag];
- task = READ_ONCE(io->task);
- if (task && task != current)
+ /* UBLK_IO_FETCH_REQ can be handled on any task, which sets io->task */
+ if (unlikely(_IOC_NR(cmd_op) == UBLK_IO_FETCH_REQ)) {
+ ret = ublk_check_fetch_buf(ubq, ub_cmd->addr);
+ if (ret)
+ goto out;
+ ret = ublk_fetch(cmd, ubq, io, ub_cmd->addr);
+ if (ret)
+ goto out;
+
+ ublk_prep_cancel(cmd, issue_flags, ubq, tag);
+ return -EIOCBQUEUED;
+ }
+
+ if (READ_ONCE(io->task) != current) {
+ /*
+ * ublk_register_io_buf() accesses only the io's refcount,
+ * so can be handled on any task
+ */
+ if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF)
+ return ublk_register_io_buf(cmd, ubq, io, ub_cmd->addr,
+ issue_flags);
+
goto out;
+ }
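The branch above is the heart of UBLK_F_BUF_REG_OFF_DAEMON: UBLK_IO_REGISTER_IO_BUF (and, earlier, UBLK_IO_UNREGISTER_IO_BUF) may now be issued from a task other than the io's daemon, since they touch only the per-io refcount or no per-io state at all. A minimal, hypothetical liburing sketch of queuing such a registration from an arbitrary thread's ring; only the uapi <linux/ublk_cmd.h> and io_uring SQE fields are assumed, and the helper name and error handling are invented.

#include <errno.h>
#include <string.h>
#include <liburing.h>
#include <linux/ublk_cmd.h>

/*
 * Sketch: queue UBLK_U_IO_REGISTER_IO_BUF on an arbitrary thread's ring.
 * q_id/tag identify the ublk I/O; index is the fixed-buffer table slot in
 * this ring that the request's pages get registered into.
 */
static int example_queue_register_io_buf(struct io_uring *ring, int ublk_ch_fd,
					 __u16 q_id, __u16 tag, __u64 index)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct ublksrv_io_cmd *cmd;

	if (!sqe)
		return -EAGAIN;

	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->fd = ublk_ch_fd;			/* /dev/ublkcN character device */
	sqe->cmd_op = UBLK_U_IO_REGISTER_IO_BUF;

	cmd = (struct ublksrv_io_cmd *)sqe->cmd;
	cmd->q_id = q_id;
	cmd->tag = tag;
	cmd->addr = index;			/* buffer index, not a user address */

	return io_uring_submit(ring);
}

The buffer index travels in the addr field of struct ublksrv_io_cmd, and the reference taken by ublk_register_io_buf() is dropped through ublk_io_release() once the registered buffer is later unregistered or released by io_uring.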
/* there is pending io cmd, something must be wrong */
- if (io->flags & UBLK_IO_FLAG_ACTIVE) {
+ if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) {
ret = -EBUSY;
goto out;
}
- /* only UBLK_IO_FETCH_REQ is allowed if io is not OWNED_BY_SRV */
- if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV) &&
- _IOC_NR(cmd_op) != UBLK_IO_FETCH_REQ)
- goto out;
-
/*
* ensure that the user issues UBLK_IO_NEED_GET_DATA
* iff the driver have set the UBLK_IO_FLAG_NEED_GET_DATA.
@@ -2232,23 +2348,27 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA))
goto out;
- ret = ublk_check_cmd_op(cmd_op);
- if (ret)
- goto out;
-
- ret = -EINVAL;
switch (_IOC_NR(cmd_op)) {
case UBLK_IO_REGISTER_IO_BUF:
- return ublk_register_io_buf(cmd, ubq, tag, ub_cmd->addr, issue_flags);
- case UBLK_IO_UNREGISTER_IO_BUF:
- return ublk_unregister_io_buf(cmd, ubq, ub_cmd->addr, issue_flags);
- case UBLK_IO_FETCH_REQ:
- ret = ublk_fetch(cmd, ubq, io, ub_cmd->addr);
+ return ublk_daemon_register_io_buf(cmd, ubq, io, ub_cmd->addr,
+ issue_flags);
+ case UBLK_IO_COMMIT_AND_FETCH_REQ:
+ ret = ublk_check_commit_and_fetch(ubq, io, ub_cmd->addr);
if (ret)
goto out;
- break;
- case UBLK_IO_COMMIT_AND_FETCH_REQ:
- ret = ublk_commit_and_fetch(ubq, io, cmd, ub_cmd, issue_flags);
+ io->res = ub_cmd->result;
+ req = ublk_fill_io_cmd(io, cmd);
+ ret = ublk_config_io_buf(ubq, io, cmd, ub_cmd->addr, &buf_idx);
+ compl = ublk_need_complete_req(ubq, io);
+
+ /* can't touch 'ublk_io' any more */
+ if (buf_idx != UBLK_INVALID_BUF_IDX)
+ io_buffer_unregister_bvec(cmd, buf_idx, issue_flags);
+ if (req_op(req) == REQ_OP_ZONE_APPEND)
+ req->__sector = ub_cmd->zone_append_lba;
+ if (compl)
+ __ublk_complete_rq(req);
+
if (ret)
goto out;
break;
@@ -2258,9 +2378,9 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
* uring_cmd active first and prepare for handling new requeued
* request
*/
- req = io->req;
- ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
- io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
+ req = ublk_fill_io_cmd(io, cmd);
+ ret = ublk_config_io_buf(ubq, io, cmd, ub_cmd->addr, NULL);
+ WARN_ON_ONCE(ret);
if (likely(ublk_get_data(ubq, io, req))) {
__ublk_prep_compl_io_cmd(io, req);
return UBLK_IO_RES_OK;
@@ -2279,15 +2399,20 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
}
static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
- const struct ublk_queue *ubq, int tag, size_t offset)
+ const struct ublk_queue *ubq, struct ublk_io *io, size_t offset)
{
+ unsigned tag = io - ubq->ios;
struct request *req;
+ /*
+ * can't use io->req in case of concurrent UBLK_IO_COMMIT_AND_FETCH_REQ,
+ * which would overwrite it with io->cmd
+ */
req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
if (!req)
return NULL;
- if (!ublk_get_req_ref(ubq, req))
+ if (!ublk_get_req_ref(io))
return NULL;
if (unlikely(!blk_mq_request_started(req) || req->tag != tag))
@@ -2301,7 +2426,7 @@ static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
return req;
fail_put:
- ublk_put_req_ref(ubq, req);
+ ublk_put_req_ref(io, req);
return NULL;
}
@@ -2368,7 +2493,8 @@ static inline bool ublk_check_ubuf_dir(const struct request *req,
}
static struct request *ublk_check_and_get_req(struct kiocb *iocb,
- struct iov_iter *iter, size_t *off, int dir)
+ struct iov_iter *iter, size_t *off, int dir,
+ struct ublk_io **io)
{
struct ublk_device *ub = iocb->ki_filp->private_data;
struct ublk_queue *ubq;
@@ -2402,7 +2528,8 @@ static struct request *ublk_check_and_get_req(struct kiocb *iocb,
if (tag >= ubq->q_depth)
return ERR_PTR(-EINVAL);
- req = __ublk_check_and_get_req(ub, ubq, tag, buf_off);
+ *io = &ubq->ios[tag];
+ req = __ublk_check_and_get_req(ub, ubq, *io, buf_off);
if (!req)
return ERR_PTR(-EINVAL);
@@ -2415,42 +2542,40 @@ static struct request *ublk_check_and_get_req(struct kiocb *iocb,
*off = buf_off;
return req;
fail:
- ublk_put_req_ref(ubq, req);
+ ublk_put_req_ref(*io, req);
return ERR_PTR(-EACCES);
}
static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
- struct ublk_queue *ubq;
struct request *req;
+ struct ublk_io *io;
size_t buf_off;
size_t ret;
- req = ublk_check_and_get_req(iocb, to, &buf_off, ITER_DEST);
+ req = ublk_check_and_get_req(iocb, to, &buf_off, ITER_DEST, &io);
if (IS_ERR(req))
return PTR_ERR(req);
ret = ublk_copy_user_pages(req, buf_off, to, ITER_DEST);
- ubq = req->mq_hctx->driver_data;
- ublk_put_req_ref(ubq, req);
+ ublk_put_req_ref(io, req);
return ret;
}
static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
- struct ublk_queue *ubq;
struct request *req;
+ struct ublk_io *io;
size_t buf_off;
size_t ret;
- req = ublk_check_and_get_req(iocb, from, &buf_off, ITER_SOURCE);
+ req = ublk_check_and_get_req(iocb, from, &buf_off, ITER_SOURCE, &io);
if (IS_ERR(req))
return PTR_ERR(req);
ret = ublk_copy_user_pages(req, buf_off, from, ITER_SOURCE);
- ubq = req->mq_hctx->driver_data;
- ublk_put_req_ref(ubq, req);
+ ublk_put_req_ref(io, req);
return ret;
}
@@ -2475,6 +2600,8 @@ static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
struct ublk_io *io = &ubq->ios[i];
if (io->task)
put_task_struct(io->task);
+ WARN_ON_ONCE(refcount_read(&io->ref));
+ WARN_ON_ONCE(io->task_registered_buffers);
}
if (ubq->io_cmd_buf)
@@ -2513,7 +2640,7 @@ static void ublk_deinit_queues(struct ublk_device *ub)
for (i = 0; i < nr_queues; i++)
ublk_deinit_queue(ub, i);
- kfree(ub->__queues);
+ kvfree(ub->__queues);
}
static int ublk_init_queues(struct ublk_device *ub)
@@ -2524,7 +2651,7 @@ static int ublk_init_queues(struct ublk_device *ub)
int i, ret = -ENOMEM;
ub->queue_size = ubq_size;
- ub->__queues = kcalloc(nr_queues, ubq_size, GFP_KERNEL);
+ ub->__queues = kvcalloc(nr_queues, ubq_size, GFP_KERNEL);
if (!ub->__queues)
return ret;
@@ -2580,6 +2707,7 @@ static void ublk_cdev_rel(struct device *dev)
ublk_deinit_queues(ub);
ublk_free_dev_number(ub);
mutex_destroy(&ub->mutex);
+ mutex_destroy(&ub->cancel_mutex);
kfree(ub);
}
@@ -2627,7 +2755,6 @@ static int ublk_add_tag_set(struct ublk_device *ub)
ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues;
ub->tag_set.queue_depth = ub->dev_info.queue_depth;
ub->tag_set.numa_node = NUMA_NO_NODE;
- ub->tag_set.cmd_size = sizeof(struct ublk_rq_data);
ub->tag_set.driver_data = ub;
return blk_mq_alloc_tag_set(&ub->tag_set);
}
@@ -2729,6 +2856,9 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub,
if (wait_for_completion_interruptible(&ub->completion) != 0)
return -EINTR;
+ if (ub->ublksrv_tgid != ublksrv_pid)
+ return -EINVAL;
+
mutex_lock(&ub->mutex);
if (ub->dev_info.state == UBLK_S_DEV_LIVE ||
test_bit(UB_STATE_USED, &ub->state)) {
@@ -2933,6 +3063,7 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
goto out_unlock;
mutex_init(&ub->mutex);
spin_lock_init(&ub->lock);
+ mutex_init(&ub->cancel_mutex);
ret = ublk_alloc_dev_number(ub, header->dev_id);
if (ret < 0)
@@ -2953,7 +3084,8 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE |
UBLK_F_URING_CMD_COMP_IN_TASK |
- UBLK_F_PER_IO_DAEMON;
+ UBLK_F_PER_IO_DAEMON |
+ UBLK_F_BUF_REG_OFF_DAEMON;
/* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */
if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
@@ -3003,6 +3135,7 @@ out_free_dev_number:
ublk_free_dev_number(ub);
out_free_ub:
mutex_destroy(&ub->mutex);
+ mutex_destroy(&ub->cancel_mutex);
kfree(ub);
out_unlock:
mutex_unlock(&ublk_ctl_mutex);
@@ -3227,6 +3360,9 @@ static int ublk_ctrl_end_recovery(struct ublk_device *ub,
pr_devel("%s: All FETCH_REQs received, dev id %d\n", __func__,
header->dev_id);
+ if (ub->ublksrv_tgid != ublksrv_pid)
+ return -EINVAL;
+
mutex_lock(&ub->mutex);
if (ublk_nosrv_should_stop_dev(ub))
goto out_unlock;
@@ -3340,7 +3476,7 @@ static int ublk_ctrl_quiesce_dev(struct ublk_device *ub,
/* zero means wait forever */
u64 timeout_ms = header->data[0];
struct gendisk *disk;
- int i, ret = -ENODEV;
+ int ret = -ENODEV;
if (!(ub->dev_info.flags & UBLK_F_QUIESCE))
return -EOPNOTSUPP;
@@ -3357,14 +3493,12 @@ static int ublk_ctrl_quiesce_dev(struct ublk_device *ub,
if (ub->dev_info.state != UBLK_S_DEV_LIVE)
goto put_disk;
- /* Mark all queues as canceling */
+ /* Mark the device as canceling */
+ mutex_lock(&ub->cancel_mutex);
blk_mq_quiesce_queue(disk->queue);
- for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
- struct ublk_queue *ubq = ublk_get_queue(ub, i);
-
- ubq->canceling = true;
- }
+ ublk_set_canceling(ub, true);
blk_mq_unquiesce_queue(disk->queue);
+ mutex_unlock(&ub->cancel_mutex);
if (!timeout_ms)
timeout_ms = UINT_MAX;