Diffstat (limited to 'io_uring')
-rw-r--r--  io_uring/Makefile     |   1
-rw-r--r--  io_uring/cmd_net.c    |  82
-rw-r--r--  io_uring/fdinfo.c     |  12
-rw-r--r--  io_uring/futex.c      |  11
-rw-r--r--  io_uring/io-wq.c      |   4
-rw-r--r--  io_uring/io_uring.c   | 104
-rw-r--r--  io_uring/io_uring.h   |  31
-rw-r--r--  io_uring/kbuf.c       |  23
-rw-r--r--  io_uring/kbuf.h       |   6
-rw-r--r--  io_uring/mock_file.c  | 363
-rw-r--r--  io_uring/msg_ring.c   |   4
-rw-r--r--  io_uring/net.c        | 105
-rw-r--r--  io_uring/nop.c        |   8
-rw-r--r--  io_uring/opdef.c      |   2
-rw-r--r--  io_uring/opdef.h      |   1
-rw-r--r--  io_uring/openclose.c  |   2
-rw-r--r--  io_uring/poll.c       |  46
-rw-r--r--  io_uring/poll.h       |   1
-rw-r--r--  io_uring/register.c   |   7
-rw-r--r--  io_uring/rsrc.c       |  44
-rw-r--r--  io_uring/rsrc.h       |   3
-rw-r--r--  io_uring/rw.c         |   2
-rw-r--r--  io_uring/sqpoll.c     |  49
-rw-r--r--  io_uring/sqpoll.h     |   8
-rw-r--r--  io_uring/uring_cmd.c  |  93
-rw-r--r--  io_uring/uring_cmd.h  |   9
-rw-r--r--  io_uring/zcrx.c       | 278
-rw-r--r--  io_uring/zcrx.h       |   2
28 files changed, 1020 insertions, 281 deletions
diff --git a/io_uring/Makefile b/io_uring/Makefile
index d97c6b51d584..b3f1bd492804 100644
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -21,3 +21,4 @@ obj-$(CONFIG_EPOLL) += epoll.o
obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o
obj-$(CONFIG_NET) += net.o cmd_net.o
obj-$(CONFIG_PROC_FS) += fdinfo.o
+obj-$(CONFIG_IO_URING_MOCK_FILE) += mock_file.o
diff --git a/io_uring/cmd_net.c b/io_uring/cmd_net.c
index e99170c7d41a..3866fe6ff541 100644
--- a/io_uring/cmd_net.c
+++ b/io_uring/cmd_net.c
@@ -1,5 +1,6 @@
#include <asm/ioctls.h>
#include <linux/io_uring/net.h>
+#include <linux/errqueue.h>
#include <net/sock.h>
#include "uring_cmd.h"
@@ -51,6 +52,85 @@ static inline int io_uring_cmd_setsockopt(struct socket *sock,
optlen);
}
+static bool io_process_timestamp_skb(struct io_uring_cmd *cmd, struct sock *sk,
+ struct sk_buff *skb, unsigned issue_flags)
+{
+ struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
+ struct io_uring_cqe cqe[2];
+ struct io_timespec *iots;
+ struct timespec64 ts;
+ u32 tstype, tskey;
+ int ret;
+
+ BUILD_BUG_ON(sizeof(struct io_uring_cqe) != sizeof(struct io_timespec));
+
+ ret = skb_get_tx_timestamp(skb, sk, &ts);
+ if (ret < 0)
+ return false;
+
+ tskey = serr->ee.ee_data;
+ tstype = serr->ee.ee_info;
+
+ cqe->user_data = 0;
+ cqe->res = tskey;
+ cqe->flags = IORING_CQE_F_MORE;
+ cqe->flags |= tstype << IORING_TIMESTAMP_TYPE_SHIFT;
+ if (ret == SOF_TIMESTAMPING_TX_HARDWARE)
+ cqe->flags |= IORING_CQE_F_TSTAMP_HW;
+
+ iots = (struct io_timespec *)&cqe[1];
+ iots->tv_sec = ts.tv_sec;
+ iots->tv_nsec = ts.tv_nsec;
+ return io_uring_cmd_post_mshot_cqe32(cmd, issue_flags, cqe);
+}
+
+static int io_uring_cmd_timestamp(struct socket *sock,
+ struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
+{
+ struct sock *sk = sock->sk;
+ struct sk_buff_head *q = &sk->sk_error_queue;
+ struct sk_buff *skb, *tmp;
+ struct sk_buff_head list;
+ int ret;
+
+ if (!(issue_flags & IO_URING_F_CQE32))
+ return -EINVAL;
+ ret = io_cmd_poll_multishot(cmd, issue_flags, EPOLLERR);
+ if (unlikely(ret))
+ return ret;
+
+ if (skb_queue_empty_lockless(q))
+ return -EAGAIN;
+ __skb_queue_head_init(&list);
+
+ scoped_guard(spinlock_irq, &q->lock) {
+ skb_queue_walk_safe(q, skb, tmp) {
+ /* don't support skbs with payload */
+ if (!skb_has_tx_timestamp(skb, sk) || skb->len)
+ continue;
+ __skb_unlink(skb, q);
+ __skb_queue_tail(&list, skb);
+ }
+ }
+
+ while (1) {
+ skb = skb_peek(&list);
+ if (!skb)
+ break;
+ if (!io_process_timestamp_skb(cmd, sk, skb, issue_flags))
+ break;
+ __skb_dequeue(&list);
+ consume_skb(skb);
+ }
+
+ if (unlikely(!skb_queue_empty(&list))) {
+ scoped_guard(spinlock_irqsave, &q->lock)
+ skb_queue_splice(&list, q);
+ }
+ return -EAGAIN;
+}
+
int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
struct socket *sock = cmd->file->private_data;
@@ -76,6 +156,8 @@ int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags)
return io_uring_cmd_getsockopt(sock, cmd, issue_flags);
case SOCKET_URING_OP_SETSOCKOPT:
return io_uring_cmd_setsockopt(sock, cmd, issue_flags);
+ case SOCKET_URING_OP_TX_TIMESTAMP:
+ return io_uring_cmd_timestamp(sock, cmd, issue_flags);
default:
return -EOPNOTSUPP;
}
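
For context, a userspace consumer of the new command would issue SOCKET_URING_OP_TX_TIMESTAMP as a uring_cmd on a ring created with IORING_SETUP_CQE32, then decode each 32-byte completion: res carries the tskey, flags carries the type and hardware bits, and the second half holds the struct io_timespec. A minimal hedged sketch with liburing, using the UAPI names introduced by this series (liburing has no dedicated helper for this yet, so the SQE is filled by hand):

/* needs <liburing.h>, <stdio.h>, <string.h>; UAPI names from this series */
static void reap_tx_timestamps(struct io_uring *ring, int sockfd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;

	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->fd = sockfd;
	sqe->cmd_op = SOCKET_URING_OP_TX_TIMESTAMP;
	io_uring_submit(ring);

	while (!io_uring_wait_cqe(ring, &cqe)) {
		if (!(cqe->flags & IORING_CQE_F_MORE)) {
			/* terminal CQE, res holds the error, if any */
			io_uring_cqe_seen(ring, cqe);
			break;
		}
		/* second half of the 32b CQE is the io_timespec */
		struct io_timespec *ts = (struct io_timespec *)cqe->big_cqe;
		/* assuming the type field sits in the top flag bits */
		unsigned int type = cqe->flags >> IORING_TIMESTAMP_TYPE_SHIFT;

		printf("tskey=%u type=%u %s %llu.%09llu\n",
		       (unsigned int)cqe->res, type,
		       (cqe->flags & IORING_CQE_F_TSTAMP_HW) ? "hw" : "sw",
		       (unsigned long long)ts->tv_sec,
		       (unsigned long long)ts->tv_nsec);
		io_uring_cqe_seen(ring, cqe);
	}
}

The socket must have TX timestamp reporting enabled via SO_TIMESTAMPING beforehand; the command only drains timestamps already queued on the socket's error queue.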
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index e9355276ab5d..9798d6fb4ec7 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -141,18 +141,26 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
if (ctx->flags & IORING_SETUP_SQPOLL) {
struct io_sq_data *sq = ctx->sq_data;
+ struct task_struct *tsk;
+ rcu_read_lock();
+ tsk = rcu_dereference(sq->thread);
/*
* sq->thread might be NULL if we raced with the sqpoll
* thread termination.
*/
- if (sq->thread) {
+ if (tsk) {
+ get_task_struct(tsk);
+ rcu_read_unlock();
+ getrusage(tsk, RUSAGE_SELF, &sq_usage);
+ put_task_struct(tsk);
sq_pid = sq->task_pid;
sq_cpu = sq->sq_cpu;
- getrusage(sq->thread, RUSAGE_SELF, &sq_usage);
sq_total_time = (sq_usage.ru_stime.tv_sec * 1000000
+ sq_usage.ru_stime.tv_usec);
sq_work_time = sq->work_time;
+ } else {
+ rcu_read_unlock();
}
}
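
Distilled, the new lookup is the standard pattern for an __rcu task pointer (see the sqpoll.h hunk below for the annotation): dereference under rcu_read_lock(), pin the task with get_task_struct() before leaving the read-side section, and only then call the potentially-sleeping getrusage(). Restated from the hunk above:

	struct task_struct *tsk;

	rcu_read_lock();
	tsk = rcu_dereference(sq->thread);
	if (tsk)
		get_task_struct(tsk);	/* pin before leaving RCU section */
	rcu_read_unlock();

	if (tsk) {
		getrusage(tsk, RUSAGE_SELF, &sq_usage);	/* may sleep now */
		put_task_struct(tsk);
	}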
diff --git a/io_uring/futex.c b/io_uring/futex.c
index fa374afbaa51..692462d50c8c 100644
--- a/io_uring/futex.c
+++ b/io_uring/futex.c
@@ -14,10 +14,7 @@
struct io_futex {
struct file *file;
- union {
- u32 __user *uaddr;
- struct futex_waitv __user *uwaitv;
- };
+ void __user *uaddr;
unsigned long futex_val;
unsigned long futex_mask;
unsigned long futexv_owned;
@@ -148,6 +145,8 @@ int io_futex_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
!futex_validate_input(iof->futex_flags, iof->futex_mask))
return -EINVAL;
+ /* Mark as inflight, so file exit cancelation will find it */
+ io_req_track_inflight(req);
return 0;
}
@@ -186,13 +185,15 @@ int io_futexv_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (!futexv)
return -ENOMEM;
- ret = futex_parse_waitv(futexv, iof->uwaitv, iof->futex_nr,
+ ret = futex_parse_waitv(futexv, iof->uaddr, iof->futex_nr,
io_futex_wakev_fn, req);
if (ret) {
kfree(futexv);
return ret;
}
+ /* Mark as inflight, so file exit cancelation will find it */
+ io_req_track_inflight(req);
iof->futexv_owned = 0;
iof->futexv_unqueued = 0;
req->flags |= REQ_F_ASYNC_DATA;
diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index cd1fcb115739..be91edf34f01 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -1259,8 +1259,10 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
atomic_set(&wq->worker_refs, 1);
init_completion(&wq->worker_done);
ret = cpuhp_state_add_instance_nocalls(io_wq_online, &wq->cpuhp_node);
- if (ret)
+ if (ret) {
+ put_task_struct(wq->task);
goto err;
+ }
return wq;
err:
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index c7a9cecf528e..4ef69dd58734 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -114,11 +114,11 @@
#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK)
#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
- REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
- REQ_F_ASYNC_DATA)
+ REQ_F_INFLIGHT | REQ_F_CREDS | REQ_F_ASYNC_DATA)
#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | IO_REQ_LINK_FLAGS | \
- REQ_F_REISSUE | IO_REQ_CLEAN_FLAGS)
+ REQ_F_REISSUE | REQ_F_POLLED | \
+ IO_REQ_CLEAN_FLAGS)
#define IO_TCTX_REFS_CACHE_NR (1U << 10)
@@ -147,7 +147,7 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
bool cancel_all,
bool is_sqpoll_thread);
-static void io_queue_sqe(struct io_kiocb *req);
+static void io_queue_sqe(struct io_kiocb *req, unsigned int extra_flags);
static void __io_req_caches_free(struct io_ring_ctx *ctx);
static __read_mostly DEFINE_STATIC_KEY_FALSE(io_key_has_sqarray);
@@ -392,11 +392,6 @@ static void io_clean_op(struct io_kiocb *req)
if (def->cleanup)
def->cleanup(req);
}
- if ((req->flags & REQ_F_POLLED) && req->apoll) {
- kfree(req->apoll->double_poll);
- kfree(req->apoll);
- req->apoll = NULL;
- }
if (req->flags & REQ_F_INFLIGHT)
atomic_dec(&req->tctx->inflight_tracked);
if (req->flags & REQ_F_CREDS)
@@ -408,7 +403,12 @@ static void io_clean_op(struct io_kiocb *req)
req->flags &= ~IO_REQ_CLEAN_FLAGS;
}
-static inline void io_req_track_inflight(struct io_kiocb *req)
+/*
+ * Mark the request as inflight, so that file cancelation will find it.
+ * Can be used if the file is an io_uring instance, or if the request itself
+ * relies on ->mm being alive for the duration of the request.
+ */
+inline void io_req_track_inflight(struct io_kiocb *req)
{
if (!(req->flags & REQ_F_INFLIGHT)) {
req->flags |= REQ_F_INFLIGHT;
@@ -788,6 +788,21 @@ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow)
return true;
}
+static bool io_fill_cqe_aux32(struct io_ring_ctx *ctx,
+ struct io_uring_cqe src_cqe[2])
+{
+ struct io_uring_cqe *cqe;
+
+ if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_CQE32)))
+ return false;
+ if (unlikely(!io_get_cqe(ctx, &cqe)))
+ return false;
+
+ memcpy(cqe, src_cqe, 2 * sizeof(*cqe));
+ trace_io_uring_complete(ctx, NULL, cqe);
+ return true;
+}
+
static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
u32 cflags)
{
@@ -899,6 +914,31 @@ bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags)
return posted;
}
+/*
+ * A helper for multishot requests posting additional CQEs.
+ * Should only be used from a task_work including IO_URING_F_MULTISHOT.
+ */
+bool io_req_post_cqe32(struct io_kiocb *req, struct io_uring_cqe cqe[2])
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ bool posted;
+
+ lockdep_assert(!io_wq_current_is_worker());
+ lockdep_assert_held(&ctx->uring_lock);
+
+ cqe[0].user_data = req->cqe.user_data;
+ if (!ctx->lockless_cq) {
+ spin_lock(&ctx->completion_lock);
+ posted = io_fill_cqe_aux32(ctx, cqe);
+ spin_unlock(&ctx->completion_lock);
+ } else {
+ posted = io_fill_cqe_aux32(ctx, cqe);
+ }
+
+ ctx->submit_state.cq_flush = true;
+ return posted;
+}
+
static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -1372,7 +1412,7 @@ void io_req_task_submit(struct io_kiocb *req, io_tw_token_t tw)
else if (req->flags & REQ_F_FORCE_ASYNC)
io_queue_iowq(req);
else
- io_queue_sqe(req);
+ io_queue_sqe(req, 0);
}
void io_req_task_queue_fail(struct io_kiocb *req, int ret)
@@ -1518,6 +1558,9 @@ static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
}
}
mutex_unlock(&ctx->uring_lock);
+
+ if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
+ io_move_task_work_from_local(ctx);
}
static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned int min_events)
@@ -1930,14 +1973,34 @@ struct file *io_file_get_normal(struct io_kiocb *req, int fd)
return file;
}
-static void io_queue_async(struct io_kiocb *req, int ret)
+static int io_req_sqe_copy(struct io_kiocb *req, unsigned int issue_flags)
+{
+ const struct io_cold_def *def = &io_cold_defs[req->opcode];
+
+ if (req->flags & REQ_F_SQE_COPIED)
+ return 0;
+ req->flags |= REQ_F_SQE_COPIED;
+ if (!def->sqe_copy)
+ return 0;
+ if (WARN_ON_ONCE(!(issue_flags & IO_URING_F_INLINE)))
+ return -EFAULT;
+ def->sqe_copy(req);
+ return 0;
+}
+
+static void io_queue_async(struct io_kiocb *req, unsigned int issue_flags, int ret)
__must_hold(&req->ctx->uring_lock)
{
if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) {
+fail:
io_req_defer_failed(req, ret);
return;
}
+ ret = io_req_sqe_copy(req, issue_flags);
+ if (unlikely(ret))
+ goto fail;
+
switch (io_arm_poll_handler(req, 0)) {
case IO_APOLL_READY:
io_kbuf_recycle(req, 0);
@@ -1952,19 +2015,21 @@ static void io_queue_async(struct io_kiocb *req, int ret)
}
}
-static inline void io_queue_sqe(struct io_kiocb *req)
+static inline void io_queue_sqe(struct io_kiocb *req, unsigned int extra_flags)
__must_hold(&req->ctx->uring_lock)
{
+ unsigned int issue_flags = IO_URING_F_NONBLOCK |
+ IO_URING_F_COMPLETE_DEFER | extra_flags;
int ret;
- ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
+ ret = io_issue_sqe(req, issue_flags);
/*
* We async punt it if the file wasn't marked NOWAIT, or if the file
* doesn't support non-blocking read/write attempts
*/
if (unlikely(ret))
- io_queue_async(req, ret);
+ io_queue_async(req, issue_flags, ret);
}
static void io_queue_sqe_fallback(struct io_kiocb *req)
@@ -1979,6 +2044,8 @@ static void io_queue_sqe_fallback(struct io_kiocb *req)
req->flags |= REQ_F_LINK;
io_req_defer_failed(req, req->cqe.res);
} else {
+ /* can't fail with IO_URING_F_INLINE */
+ io_req_sqe_copy(req, IO_URING_F_INLINE);
if (unlikely(req->ctx->drain_active))
io_drain_req(req);
else
@@ -2190,6 +2257,7 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
*/
if (unlikely(link->head)) {
trace_io_uring_link(req, link->last);
+ io_req_sqe_copy(req, IO_URING_F_INLINE);
link->last->link = req;
link->last = req;
@@ -2213,7 +2281,7 @@ fallback:
return 0;
}
- io_queue_sqe(req);
+ io_queue_sqe(req, IO_URING_F_INLINE);
return 0;
}
@@ -2901,7 +2969,7 @@ static __cold void io_ring_exit_work(struct work_struct *work)
struct task_struct *tsk;
io_sq_thread_park(sqd);
- tsk = sqd->thread;
+ tsk = sqpoll_task_locked(sqd);
if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
io_wq_cancel_cb(tsk->io_uring->io_wq,
io_cancel_ctx_cb, ctx, true);
@@ -3137,7 +3205,7 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
s64 inflight;
DEFINE_WAIT(wait);
- WARN_ON_ONCE(sqd && sqd->thread != current);
+ WARN_ON_ONCE(sqd && sqpoll_task_locked(sqd) != current);
if (!current->io_uring)
return;
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 0ea7a435d1de..abc6de227f74 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -81,8 +81,10 @@ void io_req_defer_failed(struct io_kiocb *req, s32 res);
bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags);
void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags);
bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags);
+bool io_req_post_cqe32(struct io_kiocb *req, struct io_uring_cqe src_cqe[2]);
void __io_commit_cqring_flush(struct io_ring_ctx *ctx);
+void io_req_track_inflight(struct io_kiocb *req);
struct file *io_file_get_normal(struct io_kiocb *req, int fd);
struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
unsigned issue_flags);
@@ -97,8 +99,6 @@ struct llist_node *io_handle_tw_list(struct llist_node *node, unsigned int *coun
struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, unsigned int max_entries, unsigned int *count);
void tctx_task_work(struct callback_head *cb);
__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
-int io_uring_alloc_task_context(struct task_struct *task,
- struct io_ring_ctx *ctx);
int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file,
int start, int end);
@@ -294,11 +294,22 @@ static inline void io_commit_cqring(struct io_ring_ctx *ctx)
smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
}
+static inline void __io_wq_wake(struct wait_queue_head *wq)
+{
+ /*
+ * Pass in EPOLLIN|EPOLL_URING_WAKE as the poll wakeup key. The latter
+ * set in the mask so that if we recurse back into our own poll
+ * waitqueue handlers, we know we have a dependency between eventfd or
+ * epoll and should terminate multishot poll at that point.
+ */
+ if (wq_has_sleeper(wq))
+ __wake_up(wq, TASK_NORMAL, 0, poll_to_key(EPOLL_URING_WAKE | EPOLLIN));
+}
+
static inline void io_poll_wq_wake(struct io_ring_ctx *ctx)
{
- if (wq_has_sleeper(&ctx->poll_wq))
- __wake_up(&ctx->poll_wq, TASK_NORMAL, 0,
- poll_to_key(EPOLL_URING_WAKE | EPOLLIN));
+ __io_wq_wake(&ctx->poll_wq);
}
static inline void io_cqring_wake(struct io_ring_ctx *ctx)
@@ -307,15 +318,9 @@ static inline void io_cqring_wake(struct io_ring_ctx *ctx)
* Trigger waitqueue handler on all waiters on our waitqueue. This
* won't necessarily wake up all the tasks, io_should_wake() will make
* that decision.
- *
- * Pass in EPOLLIN|EPOLL_URING_WAKE as the poll wakeup key. The latter
- * set in the mask so that if we recurse back into our own poll
- * waitqueue handlers, we know we have a dependency between eventfd or
- * epoll and should terminate multishot poll at that point.
*/
- if (wq_has_sleeper(&ctx->cq_wait))
- __wake_up(&ctx->cq_wait, TASK_NORMAL, 0,
- poll_to_key(EPOLL_URING_WAKE | EPOLLIN));
+
+ __io_wq_wake(&ctx->cq_wait);
}
static inline bool io_sqring_full(struct io_ring_ctx *ctx)
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 8cce3ebd813f..f2d2cc319faa 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -108,6 +108,7 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
buf = req->kbuf;
bl = io_buffer_get_list(ctx, buf->bgid);
list_add(&buf->list, &bl->buf_list);
+ bl->nbufs++;
req->flags &= ~REQ_F_BUFFER_SELECTED;
io_ring_submit_unlock(ctx, issue_flags);
@@ -122,6 +123,7 @@ static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
list_del(&kbuf->list);
+ bl->nbufs--;
if (*len == 0 || *len > kbuf->len)
*len = kbuf->len;
if (list_empty(&bl->buf_list))
@@ -268,8 +270,12 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
/* truncate end piece, if needed, for non partial buffers */
if (len > arg->max_len) {
len = arg->max_len;
- if (!(bl->flags & IOBL_INC))
+ if (!(bl->flags & IOBL_INC)) {
+ arg->partial_map = 1;
+ if (iov != arg->iovs)
+ break;
buf->len = len;
+ }
}
iov->iov_base = u64_to_user_ptr(buf->addr);
@@ -390,6 +396,7 @@ static int io_remove_buffers_legacy(struct io_ring_ctx *ctx,
for (i = 0; i < nbufs && !list_empty(&bl->buf_list); i++) {
nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
list_del(&nxt->list);
+ bl->nbufs--;
kfree(nxt);
cond_resched();
}
@@ -491,14 +498,24 @@ static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
{
struct io_buffer *buf;
u64 addr = pbuf->addr;
- int i, bid = pbuf->bid;
+ int ret = -ENOMEM, i, bid = pbuf->bid;
for (i = 0; i < pbuf->nbufs; i++) {
+ /*
+ * Nonsensical to have more buffers in a list than a 16-bit bid
+ * can distinguish, as the application then has no way of knowing
+ * which duplicate bid refers to what buffer.
+ */
+ if (bl->nbufs == USHRT_MAX) {
+ ret = -EOVERFLOW;
+ break;
+ }
buf = kmalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
if (!buf)
break;
list_add_tail(&buf->list, &bl->buf_list);
+ bl->nbufs++;
buf->addr = addr;
buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
buf->bid = bid;
@@ -508,7 +525,7 @@ static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
cond_resched();
}
- return i ? 0 : -ENOMEM;
+ return i ? 0 : ret;
}
static int __io_manage_buffers_legacy(struct io_kiocb *req,
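
From userspace, the new accounting means a legacy IORING_OP_PROVIDE_BUFFERS request fails with -EOVERFLOW once a buffer group already holds USHRT_MAX buffers; matching the `return i ? 0 : ret` above, the error only surfaces when not a single buffer could be added, a partial add still returns 0. A hedged sketch with stock liburing:

static int provide_bufs(struct io_uring *ring, int bgid, void *mem,
			unsigned int nr, int len)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret;

	/* bids 0..nr-1; more than USHRT_MAX bids can't be represented */
	io_uring_prep_provide_buffers(sqe, mem, len, nr, bgid, 0);
	io_uring_submit(ring);
	io_uring_wait_cqe(ring, &cqe);
	ret = cqe->res;		/* -EOVERFLOW if the group is already full */
	io_uring_cqe_seen(ring, cqe);
	return ret;
}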
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index 4d2c209d1a41..723d0361898e 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -21,6 +21,9 @@ struct io_buffer_list {
struct list_head buf_list;
struct io_uring_buf_ring *buf_ring;
};
+ /* count of classic/legacy buffers in buffer list */
+ int nbufs;
+
__u16 bgid;
/* below is for ring provided buffers */
@@ -55,7 +58,8 @@ struct buf_sel_arg {
size_t max_len;
unsigned short nr_iovs;
unsigned short mode;
- unsigned buf_group;
+ unsigned short buf_group;
+ unsigned short partial_map;
};
void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
diff --git a/io_uring/mock_file.c b/io_uring/mock_file.c
new file mode 100644
index 000000000000..45d3735b2708
--- /dev/null
+++ b/io_uring/mock_file.c
@@ -0,0 +1,363 @@
+#include <linux/device.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/anon_inodes.h>
+#include <linux/ktime.h>
+#include <linux/hrtimer.h>
+#include <linux/poll.h>
+
+#include <linux/io_uring/cmd.h>
+#include <linux/io_uring_types.h>
+#include <uapi/linux/io_uring/mock_file.h>
+
+struct io_mock_iocb {
+ struct kiocb *iocb;
+ struct hrtimer timer;
+ int res;
+};
+
+struct io_mock_file {
+ size_t size;
+ u64 rw_delay_ns;
+ bool pollable;
+ struct wait_queue_head poll_wq;
+};
+
+#define IO_VALID_COPY_CMD_FLAGS IORING_MOCK_COPY_FROM
+
+static int io_copy_regbuf(struct iov_iter *reg_iter, void __user *ubuf)
+{
+ size_t ret, copied = 0;
+ size_t buflen = PAGE_SIZE;
+ void *tmp_buf;
+
+ tmp_buf = kzalloc(buflen, GFP_KERNEL);
+ if (!tmp_buf)
+ return -ENOMEM;
+
+ while (iov_iter_count(reg_iter)) {
+ size_t len = min(iov_iter_count(reg_iter), buflen);
+
+ if (iov_iter_rw(reg_iter) == ITER_SOURCE) {
+ ret = copy_from_iter(tmp_buf, len, reg_iter);
+ if (ret <= 0)
+ break;
+ if (copy_to_user(ubuf, tmp_buf, ret))
+ break;
+ } else {
+ if (copy_from_user(tmp_buf, ubuf, len))
+ break;
+ ret = copy_to_iter(tmp_buf, len, reg_iter);
+ if (ret <= 0)
+ break;
+ }
+ ubuf += ret;
+ copied += ret;
+ }
+
+ kfree(tmp_buf);
+ return copied;
+}
+
+static int io_cmd_copy_regbuf(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+ const struct io_uring_sqe *sqe = cmd->sqe;
+ const struct iovec __user *iovec;
+ unsigned flags, iovec_len;
+ struct iov_iter iter;
+ void __user *ubuf;
+ int dir, ret;
+
+ ubuf = u64_to_user_ptr(READ_ONCE(sqe->addr3));
+ iovec = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ iovec_len = READ_ONCE(sqe->len);
+ flags = READ_ONCE(sqe->file_index);
+
+ if (unlikely(sqe->ioprio || sqe->__pad1))
+ return -EINVAL;
+ if (flags & ~IO_VALID_COPY_CMD_FLAGS)
+ return -EINVAL;
+
+ dir = (flags & IORING_MOCK_COPY_FROM) ? ITER_SOURCE : ITER_DEST;
+ ret = io_uring_cmd_import_fixed_vec(cmd, iovec, iovec_len, dir, &iter,
+ issue_flags);
+ if (ret)
+ return ret;
+ ret = io_copy_regbuf(&iter, ubuf);
+ return ret ? ret : -EFAULT;
+}
+
+static int io_mock_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+ switch (cmd->cmd_op) {
+ case IORING_MOCK_CMD_COPY_REGBUF:
+ return io_cmd_copy_regbuf(cmd, issue_flags);
+ }
+ return -EOPNOTSUPP;
+}
+
+static enum hrtimer_restart io_mock_rw_timer_expired(struct hrtimer *timer)
+{
+ struct io_mock_iocb *mio = container_of(timer, struct io_mock_iocb, timer);
+ struct kiocb *iocb = mio->iocb;
+
+ WRITE_ONCE(iocb->private, NULL);
+ iocb->ki_complete(iocb, mio->res);
+ kfree(mio);
+ return HRTIMER_NORESTART;
+}
+
+static ssize_t io_mock_delay_rw(struct kiocb *iocb, size_t len)
+{
+ struct io_mock_file *mf = iocb->ki_filp->private_data;
+ struct io_mock_iocb *mio;
+
+ mio = kzalloc(sizeof(*mio), GFP_KERNEL);
+ if (!mio)
+ return -ENOMEM;
+
+ mio->iocb = iocb;
+ mio->res = len;
+ hrtimer_setup(&mio->timer, io_mock_rw_timer_expired,
+ CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_start(&mio->timer, ns_to_ktime(mf->rw_delay_ns),
+ HRTIMER_MODE_REL);
+ return -EIOCBQUEUED;
+}
+
+static ssize_t io_mock_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct io_mock_file *mf = iocb->ki_filp->private_data;
+ size_t len = iov_iter_count(to);
+ size_t nr_zeroed;
+
+ if (iocb->ki_pos + len > mf->size)
+ return -EINVAL;
+ nr_zeroed = iov_iter_zero(len, to);
+ if (!mf->rw_delay_ns || nr_zeroed != len)
+ return nr_zeroed;
+
+ return io_mock_delay_rw(iocb, len);
+}
+
+static ssize_t io_mock_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct io_mock_file *mf = iocb->ki_filp->private_data;
+ size_t len = iov_iter_count(from);
+
+ if (iocb->ki_pos + len > mf->size)
+ return -EINVAL;
+ if (!mf->rw_delay_ns) {
+ iov_iter_advance(from, len);
+ return len;
+ }
+
+ return io_mock_delay_rw(iocb, len);
+}
+
+static loff_t io_mock_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct io_mock_file *mf = file->private_data;
+
+ return fixed_size_llseek(file, offset, whence, mf->size);
+}
+
+static __poll_t io_mock_poll(struct file *file, struct poll_table_struct *pt)
+{
+ struct io_mock_file *mf = file->private_data;
+ __poll_t mask = 0;
+
+ poll_wait(file, &mf->poll_wq, pt);
+
+ mask |= EPOLLOUT | EPOLLWRNORM;
+ mask |= EPOLLIN | EPOLLRDNORM;
+ return mask;
+}
+
+static int io_mock_release(struct inode *inode, struct file *file)
+{
+ struct io_mock_file *mf = file->private_data;
+
+ kfree(mf);
+ return 0;
+}
+
+static const struct file_operations io_mock_fops = {
+ .owner = THIS_MODULE,
+ .release = io_mock_release,
+ .uring_cmd = io_mock_cmd,
+ .read_iter = io_mock_read_iter,
+ .write_iter = io_mock_write_iter,
+ .llseek = io_mock_llseek,
+};
+
+static const struct file_operations io_mock_poll_fops = {
+ .owner = THIS_MODULE,
+ .release = io_mock_release,
+ .uring_cmd = io_mock_cmd,
+ .read_iter = io_mock_read_iter,
+ .write_iter = io_mock_write_iter,
+ .llseek = io_mock_llseek,
+ .poll = io_mock_poll,
+};
+
+#define IO_VALID_CREATE_FLAGS (IORING_MOCK_CREATE_F_SUPPORT_NOWAIT | \
+ IORING_MOCK_CREATE_F_POLL)
+
+static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+ const struct file_operations *fops = &io_mock_fops;
+ const struct io_uring_sqe *sqe = cmd->sqe;
+ struct io_uring_mock_create mc, __user *uarg;
+ struct io_mock_file *mf = NULL;
+ struct file *file = NULL;
+ size_t uarg_size;
+ int fd = -1, ret;
+
+ /*
+ * It's a testing only driver that allows exercising edge cases
+ * that wouldn't be possible to hit otherwise.
+ */
+ add_taint(TAINT_TEST, LOCKDEP_STILL_OK);
+
+ uarg = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ uarg_size = READ_ONCE(sqe->len);
+
+ if (sqe->ioprio || sqe->__pad1 || sqe->addr3 || sqe->file_index)
+ return -EINVAL;
+ if (uarg_size != sizeof(mc))
+ return -EINVAL;
+
+ memset(&mc, 0, sizeof(mc));
+ if (copy_from_user(&mc, uarg, uarg_size))
+ return -EFAULT;
+ if (!mem_is_zero(mc.__resv, sizeof(mc.__resv)))
+ return -EINVAL;
+ if (mc.flags & ~IO_VALID_CREATE_FLAGS)
+ return -EINVAL;
+ if (mc.file_size > SZ_1G)
+ return -EINVAL;
+ if (mc.rw_delay_ns > NSEC_PER_SEC)
+ return -EINVAL;
+
+ mf = kzalloc(sizeof(*mf), GFP_KERNEL_ACCOUNT);
+ if (!mf)
+ return -ENOMEM;
+
+ ret = fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+ if (fd < 0)
+ goto fail;
+
+ init_waitqueue_head(&mf->poll_wq);
+ mf->size = mc.file_size;
+ mf->rw_delay_ns = mc.rw_delay_ns;
+ if (mc.flags & IORING_MOCK_CREATE_F_POLL) {
+ fops = &io_mock_poll_fops;
+ mf->pollable = true;
+ }
+
+ file = anon_inode_create_getfile("[io_uring_mock]", fops,
+ mf, O_RDWR | O_CLOEXEC, NULL);
+ if (IS_ERR(file)) {
+ ret = PTR_ERR(file);
+ goto fail;
+ }
+
+ file->f_mode |= FMODE_READ | FMODE_CAN_READ |
+ FMODE_WRITE | FMODE_CAN_WRITE |
+ FMODE_LSEEK;
+ if (mc.flags & IORING_MOCK_CREATE_F_SUPPORT_NOWAIT)
+ file->f_mode |= FMODE_NOWAIT;
+
+ mc.out_fd = fd;
+ if (copy_to_user(uarg, &mc, uarg_size)) {
+ fput(file);
+ ret = -EFAULT;
+ goto fail;
+ }
+
+ fd_install(fd, file);
+ return 0;
+fail:
+ if (fd >= 0)
+ put_unused_fd(fd);
+ kfree(mf);
+ return ret;
+}
+
+static int io_probe_mock(struct io_uring_cmd *cmd)
+{
+ const struct io_uring_sqe *sqe = cmd->sqe;
+ struct io_uring_mock_probe mp, __user *uarg;
+ size_t uarg_size;
+
+ uarg = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ uarg_size = READ_ONCE(sqe->len);
+
+ if (sqe->ioprio || sqe->__pad1 || sqe->addr3 || sqe->file_index ||
+ uarg_size != sizeof(mp))
+ return -EINVAL;
+
+ memset(&mp, 0, sizeof(mp));
+ if (copy_from_user(&mp, uarg, uarg_size))
+ return -EFAULT;
+ if (!mem_is_zero(&mp, sizeof(mp)))
+ return -EINVAL;
+
+ mp.features = IORING_MOCK_FEAT_END;
+
+ if (copy_to_user(uarg, &mp, uarg_size))
+ return -EFAULT;
+ return 0;
+}
+
+static int iou_mock_mgr_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ switch (cmd->cmd_op) {
+ case IORING_MOCK_MGR_CMD_PROBE:
+ return io_probe_mock(cmd);
+ case IORING_MOCK_MGR_CMD_CREATE:
+ return io_create_mock_file(cmd, issue_flags);
+ }
+ return -EOPNOTSUPP;
+}
+
+static const struct file_operations iou_mock_dev_fops = {
+ .owner = THIS_MODULE,
+ .uring_cmd = iou_mock_mgr_cmd,
+};
+
+static struct miscdevice iou_mock_miscdev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "io_uring_mock",
+ .fops = &iou_mock_dev_fops,
+};
+
+static int __init io_mock_init(void)
+{
+ int ret;
+
+ ret = misc_register(&iou_mock_miscdev);
+ if (ret < 0) {
+ pr_err("Could not initialize io_uring mock device\n");
+ return ret;
+ }
+ return 0;
+}
+
+static void __exit io_mock_exit(void)
+{
+ misc_deregister(&iou_mock_miscdev);
+}
+
+module_init(io_mock_init)
+module_exit(io_mock_exit)
+
+MODULE_AUTHOR("Pavel Begunkov <asml.silence@gmail.com>");
+MODULE_DESCRIPTION("io_uring mock file");
+MODULE_LICENSE("GPL");
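
A test would drive the manager roughly as follows: open the misc device (presumably /dev/io_uring_mock, from the miscdevice name above; CAP_SYS_ADMIN required), then send IORING_MOCK_MGR_CMD_CREATE as a uring_cmd with a struct io_uring_mock_create, and read the new file descriptor back from out_fd. A hedged sketch; field names are taken from the driver code above, the exact layout lives in the new uapi/linux/io_uring/mock_file.h:

/* needs <liburing.h>, <fcntl.h>, <unistd.h>, <errno.h>, <string.h> */
static int create_mock_file(struct io_uring *ring)
{
	struct io_uring_mock_create mc = {
		.file_size = 1 << 20,	/* capped at 1GiB by the driver */
		.rw_delay_ns = 1000000,	/* 1ms delayed async completions */
		.flags = IORING_MOCK_CREATE_F_SUPPORT_NOWAIT,
	};
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int mgr, ret;

	mgr = open("/dev/io_uring_mock", O_RDONLY);
	if (mgr < 0)
		return -errno;

	sqe = io_uring_get_sqe(ring);
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->fd = mgr;
	sqe->cmd_op = IORING_MOCK_MGR_CMD_CREATE;
	sqe->addr = (unsigned long)&mc;
	sqe->len = sizeof(mc);

	io_uring_submit(ring);
	io_uring_wait_cqe(ring, &cqe);
	ret = cqe->res;
	io_uring_cqe_seen(ring, cqe);
	close(mgr);
	return ret ? ret : mc.out_fd;	/* driver wrote the new fd back */
}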
diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c
index 71400d6cefc8..4c2578f2efcb 100644
--- a/io_uring/msg_ring.c
+++ b/io_uring/msg_ring.c
@@ -82,7 +82,7 @@ static void io_msg_tw_complete(struct io_kiocb *req, io_tw_token_t tw)
spin_unlock(&ctx->msg_lock);
}
if (req)
- kmem_cache_free(req_cachep, req);
+ kfree_rcu(req, rcu_head);
percpu_ref_put(&ctx->refs);
}
@@ -90,7 +90,7 @@ static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req,
int res, u32 cflags, u64 user_data)
{
if (!READ_ONCE(ctx->submitter_task)) {
- kmem_cache_free(req_cachep, req);
+ kfree_rcu(req, rcu_head);
return -EOWNERDEAD;
}
req->opcode = IORING_OP_NOP;
diff --git a/io_uring/net.c b/io_uring/net.c
index d13f3e8f6c72..35585bdc59f3 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -75,13 +75,32 @@ struct io_sr_msg {
u16 flags;
/* initialised and used only by !msg send variants */
u16 buf_group;
- bool retry;
+ /* per-invocation mshot limit */
+ unsigned mshot_len;
+ /* overall mshot byte limit */
+ unsigned mshot_total_len;
void __user *msg_control;
/* used only for send zerocopy */
struct io_kiocb *notif;
};
/*
+ * The UAPI flags are the lower 8 bits, as that's all sqe->ioprio will hold
+ * anyway. Use the upper 8 bits for internal uses.
+ */
+enum sr_retry_flags {
+ IORING_RECV_RETRY = (1U << 15),
+ IORING_RECV_PARTIAL_MAP = (1U << 14),
+ IORING_RECV_MSHOT_CAP = (1U << 13),
+ IORING_RECV_MSHOT_LIM = (1U << 12),
+ IORING_RECV_MSHOT_DONE = (1U << 11),
+
+ IORING_RECV_RETRY_CLEAR = IORING_RECV_RETRY | IORING_RECV_PARTIAL_MAP,
+ IORING_RECV_NO_RETRY = IORING_RECV_RETRY | IORING_RECV_PARTIAL_MAP |
+ IORING_RECV_MSHOT_CAP | IORING_RECV_MSHOT_DONE,
+};
+
+/*
* Number of times we'll try and do receives if there's more data. If we
* exceed this limit, then add us to the back of the queue and retry from
* there. This helps fairness between flooding clients.
@@ -187,8 +206,8 @@ static inline void io_mshot_prep_retry(struct io_kiocb *req,
req->flags &= ~REQ_F_BL_EMPTY;
sr->done_io = 0;
- sr->retry = false;
- sr->len = 0; /* get from the provided buffer */
+ sr->flags &= ~IORING_RECV_RETRY_CLEAR;
+ sr->len = sr->mshot_len;
}
static int io_net_import_vec(struct io_kiocb *req, struct io_async_msghdr *iomsg,
@@ -397,7 +416,6 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
sr->done_io = 0;
- sr->retry = false;
sr->len = READ_ONCE(sqe->len);
sr->flags = READ_ONCE(sqe->ioprio);
if (sr->flags & ~SENDMSG_FLAGS)
@@ -751,9 +769,8 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
sr->done_io = 0;
- sr->retry = false;
- if (unlikely(sqe->file_index || sqe->addr2))
+ if (unlikely(sqe->addr2))
return -EINVAL;
sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
@@ -778,15 +795,25 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
sr->buf_group = req->buf_index;
req->buf_list = NULL;
}
+ sr->mshot_total_len = sr->mshot_len = 0;
if (sr->flags & IORING_RECV_MULTISHOT) {
if (!(req->flags & REQ_F_BUFFER_SELECT))
return -EINVAL;
if (sr->msg_flags & MSG_WAITALL)
return -EINVAL;
- if (req->opcode == IORING_OP_RECV && sr->len)
+ if (req->opcode == IORING_OP_RECV) {
+ sr->mshot_len = sr->len;
+ sr->mshot_total_len = READ_ONCE(sqe->optlen);
+ if (sr->mshot_total_len)
+ sr->flags |= IORING_RECV_MSHOT_LIM;
+ } else if (sqe->optlen) {
return -EINVAL;
+ }
req->flags |= REQ_F_APOLL_MULTISHOT;
+ } else if (sqe->optlen) {
+ return -EINVAL;
}
+
if (sr->flags & IORING_RECVSEND_BUNDLE) {
if (req->opcode == IORING_OP_RECVMSG)
return -EINVAL;
@@ -818,13 +845,28 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
if (kmsg->msg.msg_inq > 0)
cflags |= IORING_CQE_F_SOCK_NONEMPTY;
+ if (*ret > 0 && sr->flags & IORING_RECV_MSHOT_LIM) {
+ /*
+ * If mshot_total_len hits zero, the limit has been reached. Mark
+ * mshot as finished, and flag MSHOT_DONE as well to prevent
+ * a potential bundle from being retried.
+ */
+ sr->mshot_total_len -= min_t(int, *ret, sr->mshot_total_len);
+ if (!sr->mshot_total_len) {
+ sr->flags |= IORING_RECV_MSHOT_DONE;
+ mshot_finished = true;
+ }
+ }
+
if (sr->flags & IORING_RECVSEND_BUNDLE) {
size_t this_ret = *ret - sr->done_io;
- cflags |= io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, this_ret),
+ cflags |= io_put_kbufs(req, this_ret, io_bundle_nbufs(kmsg, this_ret),
issue_flags);
- if (sr->retry)
+ if (sr->flags & IORING_RECV_RETRY)
cflags = req->cqe.flags | (cflags & CQE_F_MASK);
+ if (sr->mshot_len && *ret >= sr->mshot_len)
+ sr->flags |= IORING_RECV_MSHOT_CAP;
/* bundle with no more immediate buffers, we're done */
if (req->flags & REQ_F_BL_EMPTY)
goto finish;
@@ -832,12 +874,13 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
* If more is available AND it was a full transfer, retry and
* append to this one
*/
- if (!sr->retry && kmsg->msg.msg_inq > 0 && this_ret > 0 &&
+ if (!(sr->flags & IORING_RECV_NO_RETRY) &&
+ kmsg->msg.msg_inq > 1 && this_ret > 0 &&
!iov_iter_count(&kmsg->msg.msg_iter)) {
req->cqe.flags = cflags & ~CQE_F_MASK;
sr->len = kmsg->msg.msg_inq;
sr->done_io += this_ret;
- sr->retry = true;
+ sr->flags |= IORING_RECV_RETRY;
return false;
}
} else {
@@ -854,10 +897,13 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
io_mshot_prep_retry(req, kmsg);
/* Known not-empty or unknown state, retry */
if (cflags & IORING_CQE_F_SOCK_NONEMPTY || kmsg->msg.msg_inq < 0) {
- if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY)
+ if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY &&
+ !(sr->flags & IORING_RECV_MSHOT_CAP)) {
return false;
+ }
/* mshot retries exceeded, force a requeue */
sr->nr_multishot_loops = 0;
+ sr->flags &= ~IORING_RECV_MSHOT_CAP;
if (issue_flags & IO_URING_F_MULTISHOT)
*ret = IOU_REQUEUE;
}
@@ -1070,13 +1116,26 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg
arg.mode |= KBUF_MODE_FREE;
}
- if (kmsg->msg.msg_inq > 0)
- arg.max_len = min_not_zero(sr->len, kmsg->msg.msg_inq);
+ if (*len)
+ arg.max_len = *len;
+ else if (kmsg->msg.msg_inq > 1)
+ arg.max_len = min_not_zero(*len, (size_t) kmsg->msg.msg_inq);
+ /* if mshot limited, ensure we don't go over */
+ if (sr->flags & IORING_RECV_MSHOT_LIM)
+ arg.max_len = min_not_zero(arg.max_len, sr->mshot_total_len);
ret = io_buffers_peek(req, &arg);
if (unlikely(ret < 0))
return ret;
+ if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) {
+ kmsg->vec.nr = ret;
+ kmsg->vec.iovec = arg.iovs;
+ req->flags |= REQ_F_NEED_CLEANUP;
+ }
+ if (arg.partial_map)
+ sr->flags |= IORING_RECV_PARTIAL_MAP;
+
/* special case 1 vec, can be a fast path */
if (ret == 1) {
sr->buf = arg.iovs[0].iov_base;
@@ -1085,11 +1144,6 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg
}
iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret,
arg.out_len);
- if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) {
- kmsg->vec.nr = ret;
- kmsg->vec.iovec = arg.iovs;
- req->flags |= REQ_F_NEED_CLEANUP;
- }
} else {
void __user *buf;
@@ -1275,7 +1329,6 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
int ret;
zc->done_io = 0;
- zc->retry = false;
if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3)))
return -EINVAL;
@@ -1730,9 +1783,11 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags)
int ret;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
- if (unlikely(req->flags & REQ_F_FAIL)) {
- ret = -ECONNRESET;
- goto out;
+ if (connect->in_progress) {
+ struct poll_table_struct pt = { ._key = EPOLLERR };
+
+ if (vfs_poll(req->file, &pt) & EPOLLERR)
+ goto get_sock_err;
}
file_flags = force_nonblock ? O_NONBLOCK : 0;
@@ -1757,8 +1812,10 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags)
* which means the previous result is good. For both of these,
* grab the sock_error() and use that for the completion.
*/
- if (ret == -EBADFD || ret == -EISCONN)
+ if (ret == -EBADFD || ret == -EISCONN) {
+get_sock_err:
ret = sock_error(sock_from_file(req->file)->sk);
+ }
}
if (ret == -ERESTARTSYS)
ret = -EINTR;
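
On the UAPI side, the new total limit rides in sqe->optlen for multishot IORING_OP_RECV, which is also why the prep above stops rejecting sqe->file_index: optlen shares that union slot in the SQE. Once the accumulated bytes reach the limit, the request completes without IORING_CQE_F_MORE set. A hedged liburing sketch:

/* Multishot recv from buffer group `bgid`, stopping after ~total bytes. */
static void prep_recv_mshot_capped(struct io_uring_sqe *sqe, int sockfd,
				   unsigned short bgid, unsigned int total)
{
	io_uring_prep_recv_multishot(sqe, sockfd, NULL, 0, 0);
	sqe->flags |= IOSQE_BUFFER_SELECT;
	sqe->buf_group = bgid;
	sqe->optlen = total;	/* new in this series; 0 means no limit */
}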
diff --git a/io_uring/nop.c b/io_uring/nop.c
index 6ac2de761fd3..20ed0f85b1c2 100644
--- a/io_uring/nop.c
+++ b/io_uring/nop.c
@@ -20,7 +20,8 @@ struct io_nop {
};
#define NOP_FLAGS (IORING_NOP_INJECT_RESULT | IORING_NOP_FIXED_FILE | \
- IORING_NOP_FIXED_BUFFER | IORING_NOP_FILE)
+ IORING_NOP_FIXED_BUFFER | IORING_NOP_FILE | \
+ IORING_NOP_TW)
int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
@@ -68,5 +69,10 @@ done:
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, nop->result, 0);
+ if (nop->flags & IORING_NOP_TW) {
+ req->io_task_work.func = io_req_task_complete;
+ io_req_task_work_add(req);
+ return IOU_ISSUE_SKIP_COMPLETE;
+ }
return IOU_COMPLETE;
}
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 6e0882b051f9..9568785810d9 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -216,6 +216,7 @@ const struct io_issue_def io_issue_defs[] = {
},
[IORING_OP_FALLOCATE] = {
.needs_file = 1,
+ .hash_reg_file = 1,
.prep = io_fallocate_prep,
.issue = io_fallocate,
},
@@ -759,6 +760,7 @@ const struct io_cold_def io_cold_defs[] = {
},
[IORING_OP_URING_CMD] = {
.name = "URING_CMD",
+ .sqe_copy = io_uring_cmd_sqe_copy,
.cleanup = io_uring_cmd_cleanup,
},
[IORING_OP_SEND_ZC] = {
diff --git a/io_uring/opdef.h b/io_uring/opdef.h
index 719a52104abe..c2f0907ed78c 100644
--- a/io_uring/opdef.h
+++ b/io_uring/opdef.h
@@ -38,6 +38,7 @@ struct io_issue_def {
struct io_cold_def {
const char *name;
+ void (*sqe_copy)(struct io_kiocb *);
void (*cleanup)(struct io_kiocb *);
void (*fail)(struct io_kiocb *);
};
diff --git a/io_uring/openclose.c b/io_uring/openclose.c
index 83e36ad4e31b..d70700e5cef8 100644
--- a/io_uring/openclose.c
+++ b/io_uring/openclose.c
@@ -416,8 +416,6 @@ int io_pipe(struct io_kiocb *req, unsigned int issue_flags)
ret = create_pipe_files(files, p->flags);
if (ret)
return ret;
- files[0]->f_mode |= FMODE_NOWAIT;
- files[1]->f_mode |= FMODE_NOWAIT;
if (!!p->file_slot)
ret = io_pipe_fixed(req, files, issue_flags);
diff --git a/io_uring/poll.c b/io_uring/poll.c
index 0526062e2f81..c786e587563b 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -273,8 +273,6 @@ static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw)
return IOU_POLL_REISSUE;
}
}
- if (unlikely(req->cqe.res & EPOLLERR))
- req_set_fail(req);
if (req->apoll_events & EPOLLONESHOT)
return IOU_POLL_DONE;
@@ -669,33 +667,18 @@ static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req,
return apoll;
}
-int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
+int io_arm_apoll(struct io_kiocb *req, unsigned issue_flags, __poll_t mask)
{
- const struct io_issue_def *def = &io_issue_defs[req->opcode];
struct async_poll *apoll;
struct io_poll_table ipt;
- __poll_t mask = POLLPRI | POLLERR | EPOLLET;
int ret;
- if (!def->pollin && !def->pollout)
- return IO_APOLL_ABORTED;
+ mask |= EPOLLET;
if (!io_file_can_poll(req))
return IO_APOLL_ABORTED;
if (!(req->flags & REQ_F_APOLL_MULTISHOT))
mask |= EPOLLONESHOT;
- if (def->pollin) {
- mask |= EPOLLIN | EPOLLRDNORM;
-
- /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
- if (req->flags & REQ_F_CLEAR_POLLIN)
- mask &= ~EPOLLIN;
- } else {
- mask |= EPOLLOUT | EPOLLWRNORM;
- }
- if (def->poll_exclusive)
- mask |= EPOLLEXCLUSIVE;
-
apoll = io_req_alloc_apoll(req, issue_flags);
if (!apoll)
return IO_APOLL_ABORTED;
@@ -712,6 +695,31 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
return IO_APOLL_OK;
}
+int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
+{
+ const struct io_issue_def *def = &io_issue_defs[req->opcode];
+ __poll_t mask = POLLPRI | POLLERR;
+
+ if (!def->pollin && !def->pollout)
+ return IO_APOLL_ABORTED;
+ if (!io_file_can_poll(req))
+ return IO_APOLL_ABORTED;
+
+ if (def->pollin) {
+ mask |= EPOLLIN | EPOLLRDNORM;
+
+ /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
+ if (req->flags & REQ_F_CLEAR_POLLIN)
+ mask &= ~EPOLLIN;
+ } else {
+ mask |= EPOLLOUT | EPOLLWRNORM;
+ }
+ if (def->poll_exclusive)
+ mask |= EPOLLEXCLUSIVE;
+
+ return io_arm_apoll(req, issue_flags, mask);
+}
+
/*
* Returns true if we found and killed one or more poll requests
*/
diff --git a/io_uring/poll.h b/io_uring/poll.h
index 27e2db2ed4ae..c8438286dfa0 100644
--- a/io_uring/poll.h
+++ b/io_uring/poll.h
@@ -41,6 +41,7 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags);
struct io_cancel_data;
int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
unsigned issue_flags);
+int io_arm_apoll(struct io_kiocb *req, unsigned issue_flags, __poll_t mask);
int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags);
bool io_poll_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx,
bool cancel_all);
diff --git a/io_uring/register.c b/io_uring/register.c
index cc23a4c205cd..a59589249fce 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -273,6 +273,8 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
if (ctx->flags & IORING_SETUP_SQPOLL) {
sqd = ctx->sq_data;
if (sqd) {
+ struct task_struct *tsk;
+
/*
* Observe the correct sqd->lock -> ctx->uring_lock
* ordering. Fine to drop uring_lock here, we hold
@@ -282,8 +284,9 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
mutex_unlock(&ctx->uring_lock);
mutex_lock(&sqd->lock);
mutex_lock(&ctx->uring_lock);
- if (sqd->thread)
- tctx = sqd->thread->io_uring;
+ tsk = sqpoll_task_locked(sqd);
+ if (tsk)
+ tctx = tsk->io_uring;
}
} else {
tctx = current->io_uring;
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index c592ceace97d..f75f5e43fa4a 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -55,7 +55,7 @@ int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
return 0;
}
-static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
+void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
if (ctx->user)
__io_unaccount_mem(ctx->user, nr_pages);
@@ -64,7 +64,7 @@ static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}
-static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
+int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
int ret;
@@ -112,8 +112,11 @@ static void io_release_ubuf(void *priv)
struct io_mapped_ubuf *imu = priv;
unsigned int i;
- for (i = 0; i < imu->nr_bvecs; i++)
- unpin_user_page(imu->bvec[i].bv_page);
+ for (i = 0; i < imu->nr_bvecs; i++) {
+ struct folio *folio = page_folio(imu->bvec[i].bv_page);
+
+ unpin_user_folio(folio, 1);
+ }
}
static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
@@ -135,8 +138,10 @@ static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
{
- if (!refcount_dec_and_test(&imu->refs))
- return;
+ if (unlikely(refcount_read(&imu->refs) > 1)) {
+ if (!refcount_dec_and_test(&imu->refs))
+ return;
+ }
if (imu->acct_pages)
io_unaccount_mem(ctx, imu->acct_pages);
@@ -731,6 +736,7 @@ bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
data->nr_pages_mid = folio_nr_pages(folio);
data->folio_shift = folio_shift(folio);
+ data->first_folio_page_idx = folio_page_idx(folio, page_array[0]);
/*
* Check if pages are contiguous inside a folio, and all folios have
@@ -809,10 +815,8 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
imu->nr_bvecs = nr_pages;
ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
- if (ret) {
- unpin_user_pages(pages, nr_pages);
+ if (ret)
goto done;
- }
size = iov->iov_len;
/* store original address for later verification */
@@ -826,7 +830,11 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
if (coalesced)
imu->folio_shift = data.folio_shift;
refcount_set(&imu->refs, 1);
- off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1);
+
+ off = (unsigned long)iov->iov_base & ~PAGE_MASK;
+ if (coalesced)
+ off += data.first_folio_page_idx << PAGE_SHIFT;
+
node->buf = imu;
ret = 0;
@@ -842,6 +850,10 @@ done:
if (ret) {
if (imu)
io_free_imu(ctx, imu);
+ if (pages) {
+ for (i = 0; i < nr_pages; i++)
+ unpin_user_folio(page_folio(pages[i]), 1);
+ }
io_cache_free(&ctx->node_cache, node);
node = ERR_PTR(ret);
}
@@ -1177,6 +1189,8 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
return -EINVAL;
if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
return -EOVERFLOW;
+ if (nbufs > IORING_MAX_REG_BUFFERS)
+ return -EINVAL;
ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr));
if (ret)
@@ -1327,7 +1341,6 @@ static int io_vec_fill_bvec(int ddir, struct iov_iter *iter,
{
unsigned long folio_size = 1 << imu->folio_shift;
unsigned long folio_mask = folio_size - 1;
- u64 folio_addr = imu->ubuf & ~folio_mask;
struct bio_vec *res_bvec = vec->bvec;
size_t total_len = 0;
unsigned bvec_idx = 0;
@@ -1349,8 +1362,13 @@ static int io_vec_fill_bvec(int ddir, struct iov_iter *iter,
if (unlikely(check_add_overflow(total_len, iov_len, &total_len)))
return -EOVERFLOW;
- /* by using folio address it also accounts for bvec offset */
- offset = buf_addr - folio_addr;
+ offset = buf_addr - imu->ubuf;
+ /*
+ * Only the first bvec can have non zero bv_offset, account it
+ * here and work with full folios below.
+ */
+ offset += imu->bvec[0].bv_offset;
+
src_bvec = imu->bvec + (offset >> imu->folio_shift);
offset &= folio_mask;
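
The offset fix is easiest to see with numbers. Assume 4KiB pages and a registered buffer whose pages were coalesced into one large folio, with the user address starting 100 bytes into the folio's third page (first_folio_page_idx == 2). The stored bvec covers the folio from its head, so the registration math above has to fold the skipped pages into the page offset; a worked sketch with those assumed values:

#define PAGE_SHIFT	12
#define PAGE_MASK	(~((1UL << PAGE_SHIFT) - 1))

unsigned long first_folio_page_idx = 2;			/* assumed */
unsigned long iov_base = 0x200000UL + (2UL << PAGE_SHIFT) + 100;

unsigned long off = iov_base & ~PAGE_MASK;		/* 100 */
off += first_folio_page_idx << PAGE_SHIFT;		/* 100 + 8192 = 8292 */
/*
 * bvec[0].bv_offset = 8292 now points at the true start of the user
 * buffer inside the folio; io_vec_fill_bvec() adds the same bv_offset
 * back when translating a buf_addr into a folio-sized src_bvec slot.
 */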
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 0d2138f16322..a3ca6ba66596 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -49,6 +49,7 @@ struct io_imu_folio_data {
unsigned int nr_pages_mid;
unsigned int folio_shift;
unsigned int nr_folios;
+ unsigned long first_folio_page_idx;
};
bool io_rsrc_cache_init(struct io_ring_ctx *ctx);
@@ -119,6 +120,8 @@ int io_files_update(struct io_kiocb *req, unsigned int issue_flags);
int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int __io_account_mem(struct user_struct *user, unsigned long nr_pages);
+int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages);
+void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages);
static inline void __io_unaccount_mem(struct user_struct *user,
unsigned long nr_pages)
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 710d8cd53ebb..52a5b950b2e5 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -288,7 +288,7 @@ static int __io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
rw->addr = READ_ONCE(sqe->addr);
rw->len = READ_ONCE(sqe->len);
- rw->flags = READ_ONCE(sqe->rw_flags);
+ rw->flags = (__force rwf_t) READ_ONCE(sqe->rw_flags);
attr_type_mask = READ_ONCE(sqe->attr_type_mask);
if (attr_type_mask) {
diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index 03c699493b5a..a3f11349ce06 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -16,6 +16,7 @@
#include <uapi/linux/io_uring.h>
#include "io_uring.h"
+#include "tctx.h"
#include "napi.h"
#include "sqpoll.h"
@@ -30,7 +31,7 @@ enum {
void io_sq_thread_unpark(struct io_sq_data *sqd)
__releases(&sqd->lock)
{
- WARN_ON_ONCE(sqd->thread == current);
+ WARN_ON_ONCE(sqpoll_task_locked(sqd) == current);
/*
* Do the dance but not conditional clear_bit() because it'd race with
@@ -46,24 +47,32 @@ void io_sq_thread_unpark(struct io_sq_data *sqd)
void io_sq_thread_park(struct io_sq_data *sqd)
__acquires(&sqd->lock)
{
- WARN_ON_ONCE(data_race(sqd->thread) == current);
+ struct task_struct *tsk;
atomic_inc(&sqd->park_pending);
set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
mutex_lock(&sqd->lock);
- if (sqd->thread)
- wake_up_process(sqd->thread);
+
+ tsk = sqpoll_task_locked(sqd);
+ if (tsk) {
+ WARN_ON_ONCE(tsk == current);
+ wake_up_process(tsk);
+ }
}
void io_sq_thread_stop(struct io_sq_data *sqd)
{
- WARN_ON_ONCE(sqd->thread == current);
+ struct task_struct *tsk;
+
WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state));
set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
mutex_lock(&sqd->lock);
- if (sqd->thread)
- wake_up_process(sqd->thread);
+ tsk = sqpoll_task_locked(sqd);
+ if (tsk) {
+ WARN_ON_ONCE(tsk == current);
+ wake_up_process(tsk);
+ }
mutex_unlock(&sqd->lock);
wait_for_completion(&sqd->exited);
}
@@ -270,7 +279,8 @@ static int io_sq_thread(void *data)
/* offload context creation failed, just exit */
if (!current->io_uring) {
mutex_lock(&sqd->lock);
- sqd->thread = NULL;
+ rcu_assign_pointer(sqd->thread, NULL);
+ put_task_struct(current);
mutex_unlock(&sqd->lock);
goto err_out;
}
@@ -379,7 +389,8 @@ static int io_sq_thread(void *data)
io_sq_tw(&retry_list, UINT_MAX);
io_uring_cancel_generic(true, sqd);
- sqd->thread = NULL;
+ rcu_assign_pointer(sqd->thread, NULL);
+ put_task_struct(current);
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags);
io_run_task_work();
@@ -409,7 +420,6 @@ void io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
__cold int io_sq_offload_create(struct io_ring_ctx *ctx,
struct io_uring_params *p)
{
- struct task_struct *task_to_put = NULL;
int ret;
/* Retain compatibility with failing for an invalid attach attempt */
@@ -484,8 +494,11 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
goto err_sqpoll;
}
- sqd->thread = tsk;
- task_to_put = get_task_struct(tsk);
+ mutex_lock(&sqd->lock);
+ rcu_assign_pointer(sqd->thread, tsk);
+ mutex_unlock(&sqd->lock);
+
+ get_task_struct(tsk);
ret = io_uring_alloc_task_context(tsk, ctx);
wake_up_new_task(tsk);
if (ret)
@@ -495,16 +508,11 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
ret = -EINVAL;
goto err;
}
-
- if (task_to_put)
- put_task_struct(task_to_put);
return 0;
err_sqpoll:
complete(&ctx->sq_data->exited);
err:
io_sq_thread_finish(ctx);
- if (task_to_put)
- put_task_struct(task_to_put);
return ret;
}
@@ -515,10 +523,13 @@ __cold int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx,
int ret = -EINVAL;
if (sqd) {
+ struct task_struct *tsk;
+
io_sq_thread_park(sqd);
/* Don't set affinity for a dying thread */
- if (sqd->thread)
- ret = io_wq_cpu_affinity(sqd->thread->io_uring, mask);
+ tsk = sqpoll_task_locked(sqd);
+ if (tsk)
+ ret = io_wq_cpu_affinity(tsk->io_uring, mask);
io_sq_thread_unpark(sqd);
}
diff --git a/io_uring/sqpoll.h b/io_uring/sqpoll.h
index 4171666b1cf4..b83dcdec9765 100644
--- a/io_uring/sqpoll.h
+++ b/io_uring/sqpoll.h
@@ -8,7 +8,7 @@ struct io_sq_data {
/* ctx's that are using this sqd */
struct list_head ctx_list;
- struct task_struct *thread;
+ struct task_struct __rcu *thread;
struct wait_queue_head wait;
unsigned sq_thread_idle;
@@ -29,3 +29,9 @@ void io_sq_thread_unpark(struct io_sq_data *sqd);
void io_put_sq_data(struct io_sq_data *sqd);
void io_sqpoll_wait_sq(struct io_ring_ctx *ctx);
int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx, cpumask_var_t mask);
+
+static inline struct task_struct *sqpoll_task_locked(struct io_sq_data *sqd)
+{
+ return rcu_dereference_protected(sqd->thread,
+ lockdep_is_held(&sqd->lock));
+}
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 929cad6ee326..053bac89b6c0 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -12,6 +12,7 @@
#include "alloc_cache.h"
#include "rsrc.h"
#include "uring_cmd.h"
+#include "poll.h"
void io_cmd_cache_free(const void *entry)
{
@@ -25,12 +26,6 @@ static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
struct io_async_cmd *ac = req->async_data;
- struct io_uring_cmd_data *cache = &ac->data;
-
- if (cache->op_data) {
- kfree(cache->op_data);
- cache->op_data = NULL;
- }
if (issue_flags & IO_URING_F_UNLOCKED)
return;
@@ -39,7 +34,7 @@ static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags)
if (ac->vec.nr > IO_VEC_CACHE_SOFT_CAP)
io_vec_free(&ac->vec);
- if (io_alloc_cache_put(&req->ctx->cmd_cache, cache)) {
+ if (io_alloc_cache_put(&req->ctx->cmd_cache, ac)) {
ioucmd->sqe = NULL;
req->async_data = NULL;
req->flags &= ~(REQ_F_ASYNC_DATA|REQ_F_NEED_CLEANUP);
@@ -136,6 +131,9 @@ void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
{
struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
+ if (WARN_ON_ONCE(req->flags & REQ_F_APOLL_MULTISHOT))
+ return;
+
ioucmd->task_work_cb = task_work_cb;
req->io_task_work.func = io_uring_cmd_work;
__io_req_task_work_add(req, flags);
@@ -158,6 +156,9 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, u64 res2,
{
struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
+ if (WARN_ON_ONCE(req->flags & REQ_F_APOLL_MULTISHOT))
+ return;
+
io_uring_cmd_del_cancelable(ioucmd, issue_flags);
if (ret < 0)
@@ -181,35 +182,10 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, u64 res2,
}
EXPORT_SYMBOL_GPL(io_uring_cmd_done);
-static int io_uring_cmd_prep_setup(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
- struct io_async_cmd *ac;
-
- /* see io_uring_cmd_get_async_data() */
- BUILD_BUG_ON(offsetof(struct io_async_cmd, data) != 0);
-
- ac = io_uring_alloc_async_data(&req->ctx->cmd_cache, req);
- if (!ac)
- return -ENOMEM;
- ac->data.op_data = NULL;
-
- /*
- * Unconditionally cache the SQE for now - this is only needed for
- * requests that go async, but prep handlers must ensure that any
- * sqe data is stable beyond prep. Since uring_cmd is special in
- * that it doesn't read in per-op data, play it safe and ensure that
- * any SQE data is stable beyond prep. This can later get relaxed.
- */
- memcpy(ac->sqes, sqe, uring_sqe_size(req->ctx));
- ioucmd->sqe = ac->sqes;
- return 0;
-}
-
int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
+ struct io_async_cmd *ac;
if (sqe->__pad1)
return -EINVAL;
@@ -223,7 +199,23 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
- return io_uring_cmd_prep_setup(req, sqe);
+ ac = io_uring_alloc_async_data(&req->ctx->cmd_cache, req);
+ if (!ac)
+ return -ENOMEM;
+ ioucmd->sqe = sqe;
+ return 0;
+}
+
+void io_uring_cmd_sqe_copy(struct io_kiocb *req)
+{
+ struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
+ struct io_async_cmd *ac = req->async_data;
+
+ /* Should not happen, as REQ_F_SQE_COPIED covers this */
+ if (WARN_ON_ONCE(ioucmd->sqe == ac->sqes))
+ return;
+ memcpy(ac->sqes, ioucmd->sqe, uring_sqe_size(req->ctx));
+ ioucmd->sqe = ac->sqes;
}
int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
@@ -259,7 +251,11 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
}
ret = file->f_op->uring_cmd(ioucmd, issue_flags);
- if (ret == -EAGAIN || ret == -EIOCBQUEUED)
+ if (ret == -EAGAIN) {
+ ioucmd->flags |= IORING_URING_CMD_REISSUE;
+ return ret;
+ }
+ if (ret == -EIOCBQUEUED)
return ret;
if (ret < 0)
req_set_fail(req);
@@ -310,3 +306,30 @@ void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd)
io_req_queue_iowq(req);
}
+
+int io_cmd_poll_multishot(struct io_uring_cmd *cmd,
+ unsigned int issue_flags, __poll_t mask)
+{
+ struct io_kiocb *req = cmd_to_io_kiocb(cmd);
+ int ret;
+
+ if (likely(req->flags & REQ_F_APOLL_MULTISHOT))
+ return 0;
+
+ req->flags |= REQ_F_APOLL_MULTISHOT;
+ mask &= ~EPOLLONESHOT;
+
+ ret = io_arm_apoll(req, issue_flags, mask);
+ return ret == IO_APOLL_OK ? -EIOCBQUEUED : -ECANCELED;
+}
+
+bool io_uring_cmd_post_mshot_cqe32(struct io_uring_cmd *cmd,
+ unsigned int issue_flags,
+ struct io_uring_cqe cqe[2])
+{
+ struct io_kiocb *req = cmd_to_io_kiocb(cmd);
+
+ if (WARN_ON_ONCE(!(issue_flags & IO_URING_F_MULTISHOT)))
+ return false;
+ return io_req_post_cqe32(req, cqe);
+}
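
Taken together with the cmd_net.c changes above, these helpers give a file's ->uring_cmd() handler a template for multishot commands: arm async poll on the first invocation, post 32-byte CQEs flagged with IORING_CQE_F_MORE while events are available, and return -EAGAIN to stay armed for the next wakeup. A distilled, hypothetical handler (my_drv_next_event() stands in for the driver's event source):

static int my_drv_mshot_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
	struct io_uring_cqe cqe[2] = {};
	int ret;

	if (!(issue_flags & IO_URING_F_CQE32))
		return -EINVAL;

	/* first invocation: arms apoll and returns -EIOCBQUEUED */
	ret = io_cmd_poll_multishot(cmd, issue_flags, EPOLLIN);
	if (unlikely(ret))
		return ret;

	while (my_drv_next_event(cmd, cqe)) {
		cqe[0].flags |= IORING_CQE_F_MORE;
		/* CQ full: stop, events stay queued for the next run */
		if (!io_uring_cmd_post_mshot_cqe32(cmd, issue_flags, cqe))
			break;
	}
	/* keep the request armed until more events or cancelation */
	return -EAGAIN;
}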
diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h
index e6a5142c890e..041aef8a8aa3 100644
--- a/io_uring/uring_cmd.h
+++ b/io_uring/uring_cmd.h
@@ -4,16 +4,23 @@
#include <linux/io_uring_types.h>
struct io_async_cmd {
- struct io_uring_cmd_data data;
struct iou_vec vec;
struct io_uring_sqe sqes[2];
};
int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags);
int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+void io_uring_cmd_sqe_copy(struct io_kiocb *req);
void io_uring_cmd_cleanup(struct io_kiocb *req);
bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx,
struct io_uring_task *tctx, bool cancel_all);
+bool io_uring_cmd_post_mshot_cqe32(struct io_uring_cmd *cmd,
+ unsigned int issue_flags,
+ struct io_uring_cqe cqe[2]);
+
void io_cmd_cache_free(const void *entry);
+
+int io_cmd_poll_multishot(struct io_uring_cmd *cmd,
+ unsigned int issue_flags, __poll_t mask);
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 1513431587a7..e5ff49f3425e 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -44,9 +44,40 @@ static inline struct page *io_zcrx_iov_page(const struct net_iov *niov)
{
struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
+ lockdep_assert(!area->mem.is_dmabuf);
+
return area->mem.pages[net_iov_idx(niov)];
}
+static int io_populate_area_dma(struct io_zcrx_ifq *ifq,
+ struct io_zcrx_area *area,
+ struct sg_table *sgt, unsigned long off)
+{
+ struct scatterlist *sg;
+ unsigned i, niov_idx = 0;
+
+ for_each_sgtable_dma_sg(sgt, sg, i) {
+ dma_addr_t dma = sg_dma_address(sg);
+ unsigned long sg_len = sg_dma_len(sg);
+ unsigned long sg_off = min(sg_len, off);
+
+ off -= sg_off;
+ sg_len -= sg_off;
+ dma += sg_off;
+
+ while (sg_len && niov_idx < area->nia.num_niovs) {
+ struct net_iov *niov = &area->nia.niovs[niov_idx];
+
+ if (net_mp_niov_set_dma_addr(niov, dma))
+ return -EFAULT;
+ sg_len -= PAGE_SIZE;
+ dma += PAGE_SIZE;
+ niov_idx++;
+ }
+ }
+ return 0;
+}
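
Note: io_populate_area_dma() is now the single translation step from a DMA-mapped scatterlist to per-niov addresses. It first consumes `off` bytes of leading offset (min(sg_len, off) lets the offset span whole entries), then hands out PAGE_SIZE slices of each remaining segment to consecutive niovs. A worked example, assuming 4 KiB pages:

    /* off = 4096, two DMA segments:
     *   sg[0]: dma 0x10000, len 16384
     *     -> skip 4096; niov[0]=0x11000, niov[1]=0x12000, niov[2]=0x13000
     *   sg[1]: dma 0x40000, len 8192
     *     -> niov[3]=0x40000, niov[4]=0x41000
     * Any net_mp_niov_set_dma_addr() failure aborts with -EFAULT. */
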
+
static void io_release_dmabuf(struct io_zcrx_mem *mem)
{
if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
@@ -76,6 +107,8 @@ static int io_import_dmabuf(struct io_zcrx_ifq *ifq,
int dmabuf_fd = area_reg->dmabuf_fd;
int i, ret;
+ if (off)
+ return -EINVAL;
if (WARN_ON_ONCE(!ifq->dev))
return -EFAULT;
if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
@@ -106,8 +139,10 @@ static int io_import_dmabuf(struct io_zcrx_ifq *ifq,
for_each_sgtable_dma_sg(mem->sgt, sg, i)
total_size += sg_dma_len(sg);
- if (total_size < off + len)
- return -EINVAL;
+ if (total_size != len) {
+ ret = -EINVAL;
+ goto err;
+ }
mem->dmabuf_offset = off;
mem->size = len;
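
Note: where the old check only required the dmabuf to be at least off + len bytes, import is now strict: the offset must be zero (rejected earlier) and the mapped dmabuf must match the area length exactly, with failures routed through the common err path so the attachment is unwound. From the registering side this means, roughly (field names per the zcrx uapi; treat this as a sketch):

    struct io_uring_zcrx_area_reg area_reg = {
    	.addr      = 0,			/* offset into the dmabuf: must be 0 */
    	.len       = dmabuf_size,	/* must equal the dmabuf's total size */
    	.flags     = IORING_ZCRX_AREA_DMABUF,
    	.dmabuf_fd = fd,
    };
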
@@ -119,33 +154,27 @@ err:
static int io_zcrx_map_area_dmabuf(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
- unsigned long off = area->mem.dmabuf_offset;
- struct scatterlist *sg;
- unsigned i, niov_idx = 0;
-
if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
return -EINVAL;
+ return io_populate_area_dma(ifq, area, area->mem.sgt,
+ area->mem.dmabuf_offset);
+}
- for_each_sgtable_dma_sg(area->mem.sgt, sg, i) {
- dma_addr_t dma = sg_dma_address(sg);
- unsigned long sg_len = sg_dma_len(sg);
- unsigned long sg_off = min(sg_len, off);
-
- off -= sg_off;
- sg_len -= sg_off;
- dma += sg_off;
+static unsigned long io_count_account_pages(struct page **pages, unsigned nr_pages)
+{
+ struct folio *last_folio = NULL;
+ unsigned long res = 0;
+ int i;
- while (sg_len && niov_idx < area->nia.num_niovs) {
- struct net_iov *niov = &area->nia.niovs[niov_idx];
+ for (i = 0; i < nr_pages; i++) {
+ struct folio *folio = page_folio(pages[i]);
- if (net_mp_niov_set_dma_addr(niov, dma))
- return 0;
- sg_len -= PAGE_SIZE;
- dma += PAGE_SIZE;
- niov_idx++;
- }
+ if (folio == last_folio)
+ continue;
+ last_folio = folio;
+ res += 1UL << folio_order(folio);
}
- return niov_idx;
+ return res;
}
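
Note: because pinned pages can now sit in large folios, charging per pin entry would double-count. io_count_account_pages() charges each distinct folio once, at its full 1UL << folio_order() size. A worked example:

    /* pages[] = { A0, A1, A2, B0 }, where A0-A2 belong to one 2 MiB
     * (order-9) folio A and B0 to an order-0 folio B:
     *   A0 -> res += 512;  A1, A2 -> same folio, skipped;  B0 -> res += 1
     * Total: 513 pages, i.e. whole folios regardless of how much of
     * each is pinned. Deduplication only compares against the previous
     * entry, so it relies on a folio's pages being adjacent in pages[]. */
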
static int io_import_umem(struct io_zcrx_ifq *ifq,
@@ -153,7 +182,7 @@ static int io_import_umem(struct io_zcrx_ifq *ifq,
struct io_uring_zcrx_area_reg *area_reg)
{
struct page **pages;
- int nr_pages;
+ int nr_pages, ret;
if (area_reg->dmabuf_fd)
return -EINVAL;
@@ -164,10 +193,23 @@ static int io_import_umem(struct io_zcrx_ifq *ifq,
if (IS_ERR(pages))
return PTR_ERR(pages);
+ ret = sg_alloc_table_from_pages(&mem->page_sg_table, pages, nr_pages,
+ 0, nr_pages << PAGE_SHIFT,
+ GFP_KERNEL_ACCOUNT);
+ if (ret) {
+ unpin_user_pages(pages, nr_pages);
+ return ret;
+ }
+
+ mem->account_pages = io_count_account_pages(pages, nr_pages);
+ ret = io_account_mem(ifq->ctx, mem->account_pages);
+ if (ret < 0)
+ mem->account_pages = 0;
+
mem->pages = pages;
mem->nr_folios = nr_pages;
mem->size = area_reg->len;
- return 0;
+ return ret;
}
static void io_release_area_mem(struct io_zcrx_mem *mem)
@@ -178,6 +220,7 @@ static void io_release_area_mem(struct io_zcrx_mem *mem)
}
if (mem->pages) {
unpin_user_pages(mem->pages, mem->nr_folios);
+ sg_free_table(&mem->page_sg_table);
kvfree(mem->pages);
}
}
@@ -199,84 +242,54 @@ static int io_import_area(struct io_zcrx_ifq *ifq,
return io_import_umem(ifq, mem, area_reg);
}
-static void io_zcrx_unmap_umem(struct io_zcrx_ifq *ifq,
- struct io_zcrx_area *area, int nr_mapped)
-{
- int i;
-
- for (i = 0; i < nr_mapped; i++) {
- netmem_ref netmem = net_iov_to_netmem(&area->nia.niovs[i]);
- dma_addr_t dma = page_pool_get_dma_addr_netmem(netmem);
-
- dma_unmap_page_attrs(ifq->dev, dma, PAGE_SIZE,
- DMA_FROM_DEVICE, IO_DMA_ATTR);
- }
-}
-
-static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
- struct io_zcrx_area *area, int nr_mapped)
+static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
+ struct io_zcrx_area *area)
{
int i;
- if (area->mem.is_dmabuf)
- io_release_dmabuf(&area->mem);
- else
- io_zcrx_unmap_umem(ifq, area, nr_mapped);
+ guard(mutex)(&ifq->dma_lock);
+ if (!area->is_mapped)
+ return;
+ area->is_mapped = false;
for (i = 0; i < area->nia.num_niovs; i++)
net_mp_niov_set_dma_addr(&area->nia.niovs[i], 0);
-}
-
-static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
-{
- guard(mutex)(&ifq->dma_lock);
- if (area->is_mapped)
- __io_zcrx_unmap_area(ifq, area, area->nia.num_niovs);
- area->is_mapped = false;
+ if (area->mem.is_dmabuf) {
+ io_release_dmabuf(&area->mem);
+ } else {
+ dma_unmap_sgtable(ifq->dev, &area->mem.page_sg_table,
+ DMA_FROM_DEVICE, IO_DMA_ATTR);
+ }
}
-static int io_zcrx_map_area_umem(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
+static unsigned io_zcrx_map_area_umem(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
- int i;
-
- for (i = 0; i < area->nia.num_niovs; i++) {
- struct net_iov *niov = &area->nia.niovs[i];
- dma_addr_t dma;
+ int ret;
- dma = dma_map_page_attrs(ifq->dev, area->mem.pages[i], 0,
- PAGE_SIZE, DMA_FROM_DEVICE, IO_DMA_ATTR);
- if (dma_mapping_error(ifq->dev, dma))
- break;
- if (net_mp_niov_set_dma_addr(niov, dma)) {
- dma_unmap_page_attrs(ifq->dev, dma, PAGE_SIZE,
- DMA_FROM_DEVICE, IO_DMA_ATTR);
- break;
- }
- }
- return i;
+ ret = dma_map_sgtable(ifq->dev, &area->mem.page_sg_table,
+ DMA_FROM_DEVICE, IO_DMA_ATTR);
+ if (ret < 0)
+ return ret;
+ return io_populate_area_dma(ifq, area, &area->mem.page_sg_table, 0);
}
static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
- unsigned nr;
+ int ret;
guard(mutex)(&ifq->dma_lock);
if (area->is_mapped)
return 0;
if (area->mem.is_dmabuf)
- nr = io_zcrx_map_area_dmabuf(ifq, area);
+ ret = io_zcrx_map_area_dmabuf(ifq, area);
else
- nr = io_zcrx_map_area_umem(ifq, area);
-
- if (nr != area->nia.num_niovs) {
- __io_zcrx_unmap_area(ifq, area, nr);
- return -EINVAL;
- }
+ ret = io_zcrx_map_area_umem(ifq, area);
- area->is_mapped = true;
- return 0;
+ if (ret == 0)
+ area->is_mapped = true;
+ return ret;
}
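
Note: mapping is now all-or-nothing: one dma_map_sgtable() over the sg table built at import time, then the shared populate pass, with is_mapped set only on full success. The old per-page loop that could partially succeed, and the nr_mapped bookkeeping __io_zcrx_unmap_area() needed to undo it, are gone; the unmap side correspondingly tears down every niov address and the whole sgtable in one place. The pairing, as a sketch:

    ret = io_zcrx_map_area(ifq, area);	/* 0 or -errno; idempotent via is_mapped */
    ...
    io_zcrx_unmap_area(ifq, area);	/* no-op unless the area was mapped */
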
static void io_zcrx_sync_for_device(const struct page_pool *pool,
@@ -369,6 +382,9 @@ static void io_zcrx_free_area(struct io_zcrx_area *area)
io_zcrx_unmap_area(area->ifq, area);
io_release_area_mem(&area->mem);
+ if (area->mem.account_pages)
+ io_unaccount_mem(area->ifq->ctx, area->mem.account_pages);
+
kvfree(area->freelist);
kvfree(area->nia.niovs);
kvfree(area->user_refs);
@@ -396,6 +412,7 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
area = kzalloc(sizeof(*area), GFP_KERNEL);
if (!area)
goto err;
+ area->ifq = ifq;
ret = io_import_area(ifq, &area->mem, area_reg);
if (ret)
@@ -430,7 +447,6 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
}
area->free_count = nr_iovs;
- area->ifq = ifq;
/* we're only supporting one area per ifq for now */
area->area_id = 0;
area_reg->rq_area_token = (u64)area->area_id << IORING_ZCRX_AREA_SHIFT;
@@ -631,12 +647,13 @@ ifq_free:
void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
{
struct io_zcrx_ifq *ifq;
- unsigned long id;
lockdep_assert_held(&ctx->uring_lock);
while (1) {
scoped_guard(mutex, &ctx->mmap_lock) {
+ unsigned long id = 0;
+
ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT);
if (ifq)
xa_erase(&ctx->zcrx_ctxs, id);
@@ -859,10 +876,7 @@ static int io_pp_zc_init(struct page_pool *pp)
static void io_pp_zc_destroy(struct page_pool *pp)
{
struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
- struct io_zcrx_area *area = ifq->area;
- if (WARN_ON_ONCE(area->free_count != area->nia.num_niovs))
- return;
percpu_ref_put(&ifq->ctx->refs);
}
@@ -940,9 +954,54 @@ static struct net_iov *io_zcrx_alloc_fallback(struct io_zcrx_area *area)
return niov;
}
+struct io_copy_cache {
+ struct page *page;
+ unsigned long offset;
+ size_t size;
+};
+
+static ssize_t io_copy_page(struct io_copy_cache *cc, struct page *src_page,
+ unsigned int src_offset, size_t len)
+{
+ size_t copied = 0;
+
+ len = min(len, cc->size);
+
+ while (len) {
+ void *src_addr, *dst_addr;
+ struct page *dst_page = cc->page;
+ unsigned dst_offset = cc->offset;
+ size_t n = len;
+
+ if (folio_test_partial_kmap(page_folio(dst_page)) ||
+ folio_test_partial_kmap(page_folio(src_page))) {
+ dst_page = nth_page(dst_page, dst_offset / PAGE_SIZE);
+ dst_offset = offset_in_page(dst_offset);
+ src_page = nth_page(src_page, src_offset / PAGE_SIZE);
+ src_offset = offset_in_page(src_offset);
+ n = min(PAGE_SIZE - src_offset, PAGE_SIZE - dst_offset);
+ n = min(n, len);
+ }
+
+ dst_addr = kmap_local_page(dst_page) + dst_offset;
+ src_addr = kmap_local_page(src_page) + src_offset;
+
+ memcpy(dst_addr, src_addr, n);
+
+ kunmap_local(src_addr);
+ kunmap_local(dst_addr);
+
+ cc->size -= n;
+ cc->offset += n;
+ src_offset += n;
+ len -= n;
+ copied += n;
+ }
+ return copied;
+}
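
Note: io_copy_page() hides the kmap constraints from its caller. For ordinary folios a single memcpy may cross page boundaries, but when either side's folio is only partially kmappable (highmem) each chunk is clamped to the overlap of the two current pages, with nth_page()/offset_in_page() re-deriving the exact page per iteration while cc->offset and src_offset advance. Callers just see bytes consumed from the cache; a usage sketch:

    struct io_copy_cache cc = {
    	.page   = dst_page,	/* e.g. a niov's backing page */
    	.offset = 0,
    	.size   = PAGE_SIZE,
    };
    ssize_t n = io_copy_page(&cc, src_page, src_off, bytes);
    /* n <= min(bytes, PAGE_SIZE); cc.offset/cc.size track the space
     * left, so a follow-up call appends into the same destination. */
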
+
static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
- void *src_base, struct page *src_page,
- unsigned int src_offset, size_t len)
+ struct page *src_page, unsigned int src_offset,
+ size_t len)
{
struct io_zcrx_area *area = ifq->area;
size_t copied = 0;
@@ -952,11 +1011,9 @@ static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
return -EFAULT;
while (len) {
- size_t copy_size = min_t(size_t, PAGE_SIZE, len);
- const int dst_off = 0;
+ struct io_copy_cache cc;
struct net_iov *niov;
- struct page *dst_page;
- void *dst_addr;
+ size_t n;
niov = io_zcrx_alloc_fallback(area);
if (!niov) {
@@ -964,27 +1021,22 @@ static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
break;
}
- dst_page = io_zcrx_iov_page(niov);
- dst_addr = kmap_local_page(dst_page);
- if (src_page)
- src_base = kmap_local_page(src_page);
-
- memcpy(dst_addr, src_base + src_offset, copy_size);
+ cc.page = io_zcrx_iov_page(niov);
+ cc.offset = 0;
+ cc.size = PAGE_SIZE;
- if (src_page)
- kunmap_local(src_base);
- kunmap_local(dst_addr);
+ n = io_copy_page(&cc, src_page, src_offset, len);
- if (!io_zcrx_queue_cqe(req, niov, ifq, dst_off, copy_size)) {
+ if (!io_zcrx_queue_cqe(req, niov, ifq, 0, n)) {
io_zcrx_return_niov(niov);
ret = -ENOSPC;
break;
}
io_zcrx_get_niov_uref(niov);
- src_offset += copy_size;
- len -= copy_size;
- copied += copy_size;
+ src_offset += n;
+ len -= n;
+ copied += n;
}
return copied ? copied : ret;
@@ -994,19 +1046,8 @@ static int io_zcrx_copy_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
const skb_frag_t *frag, int off, int len)
{
struct page *page = skb_frag_page(frag);
- u32 p_off, p_len, t, copied = 0;
- int ret = 0;
- off += skb_frag_off(frag);
-
- skb_frag_foreach_page(frag, off, len,
- page, p_off, p_len, t) {
- ret = io_zcrx_copy_chunk(req, ifq, NULL, page, p_off, p_len);
- if (ret < 0)
- return copied ? copied : ret;
- copied += ret;
- }
- return copied;
+ return io_zcrx_copy_chunk(req, ifq, page, off + skb_frag_off(frag), len);
}
static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
@@ -1063,8 +1104,9 @@ io_zcrx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
size_t to_copy;
to_copy = min_t(size_t, skb_headlen(skb) - offset, len);
- copied = io_zcrx_copy_chunk(req, ifq, skb->data, NULL,
- offset, to_copy);
+ copied = io_zcrx_copy_chunk(req, ifq, virt_to_page(skb->data),
+ offset_in_page(skb->data) + offset,
+ to_copy);
if (copied < 0) {
ret = copied;
goto out;
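
Note: with io_copy_page() taking (page, offset) pairs directly, the linear skb head no longer needs the separate void-pointer path: skb->data is kmalloc'ed memory in the kernel's direct map, so virt_to_page()/offset_in_page() recover its page and in-page offset, and the same chunk copier serves both the head and the frags. The conversion in isolation:

    /* valid because the skb head is kmalloc'ed, never vmalloc/highmem */
    struct page *page = virt_to_page(skb->data);
    unsigned int off  = offset_in_page(skb->data) + offset;
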
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index 2f5e26389f22..109c4ca36434 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -14,6 +14,8 @@ struct io_zcrx_mem {
struct page **pages;
unsigned long nr_folios;
+ struct sg_table page_sg_table;
+ unsigned long account_pages;
struct dma_buf_attachment *attach;
struct dma_buf *dmabuf;