summaryrefslogtreecommitdiff
path: root/io_uring
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-03-11 11:35:31 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2024-03-11 11:35:31 -0700
commitd2c84bdce25a678c1e1f116d65b58790bd241af0 (patch)
tree45499e5ded0bec5bc0ac7e305ee19198374a02c4 /io_uring
parent0f1a876682f0979d6a1e5f86861dd562d1758936 (diff)
parent606559dc4fa36a954a51fbf1c6c0cc320f551fe0 (diff)
Merge tag 'for-6.9/io_uring-20240310' of git://git.kernel.dk/linux
Pull io_uring updates from Jens Axboe: - Make running of task_work internal loops more fair, and unify how the different methods deal with them (me) - Support for per-ring NAPI. The two minor networking patches are in a shared branch with netdev (Stefan) - Add support for truncate (Tony) - Export SQPOLL utilization stats (Xiaobing) - Multishot fixes (Pavel) - Fix for a race in manipulating the request flags via poll (Pavel) - Cleanup the multishot checking by making it generic, moving it out of opcode handlers (Pavel) - Various tweaks and cleanups (me, Kunwu, Alexander) * tag 'for-6.9/io_uring-20240310' of git://git.kernel.dk/linux: (53 commits) io_uring: Fix sqpoll utilization check racing with dying sqpoll io_uring/net: dedup io_recv_finish req completion io_uring: refactor DEFER_TASKRUN multishot checks io_uring: fix mshot io-wq checks io_uring/net: add io_req_msg_cleanup() helper io_uring/net: simplify msghd->msg_inq checking io_uring/kbuf: rename REQ_F_PARTIAL_IO to REQ_F_BL_NO_RECYCLE io_uring/net: remove dependency on REQ_F_PARTIAL_IO for sr->done_io io_uring/net: correctly handle multishot recvmsg retry setup io_uring/net: clear REQ_F_BL_EMPTY in the multishot retry handler io_uring: fix io_queue_proc modifying req->flags io_uring: fix mshot read defer taskrun cqe posting io_uring/net: fix overflow check in io_recvmsg_mshot_prep() io_uring/net: correct the type of variable io_uring/sqpoll: statistics of the true utilization of sq threads io_uring/net: move recv/recvmsg flags out of retry loop io_uring/kbuf: flag request if buffer pool is empty after buffer pick io_uring/net: improve the usercopy for sendmsg/recvmsg io_uring/net: move receive multishot out of the generic msghdr path io_uring/net: unify how recvmsg and sendmsg copy in the msghdr ...
Diffstat (limited to 'io_uring')
-rw-r--r--io_uring/Makefile3
-rw-r--r--io_uring/cancel.c3
-rw-r--r--io_uring/cancel.h10
-rw-r--r--io_uring/fdinfo.c18
-rw-r--r--io_uring/filetable.h2
-rw-r--r--io_uring/io_uring.c249
-rw-r--r--io_uring/io_uring.h77
-rw-r--r--io_uring/kbuf.c35
-rw-r--r--io_uring/kbuf.h61
-rw-r--r--io_uring/napi.c332
-rw-r--r--io_uring/napi.h104
-rw-r--r--io_uring/net.c382
-rw-r--r--io_uring/opdef.c10
-rw-r--r--io_uring/poll.c33
-rw-r--r--io_uring/register.c13
-rw-r--r--io_uring/rsrc.h2
-rw-r--r--io_uring/rw.c13
-rw-r--r--io_uring/sqpoll.c59
-rw-r--r--io_uring/sqpoll.h1
-rw-r--r--io_uring/truncate.c48
-rw-r--r--io_uring/truncate.h4
-rw-r--r--io_uring/uring_cmd.c1
-rw-r--r--io_uring/xattr.c2
23 files changed, 1067 insertions, 395 deletions
diff --git a/io_uring/Makefile b/io_uring/Makefile
index 2cdc51825405..2e1d4e03799c 100644
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -8,6 +8,7 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \
statx.o net.o msg_ring.o timeout.o \
sqpoll.o fdinfo.o tctx.o poll.o \
cancel.o kbuf.o rsrc.o rw.o opdef.o \
- notif.o waitid.o register.o
+ notif.o waitid.o register.o truncate.o
obj-$(CONFIG_IO_WQ) += io-wq.o
obj-$(CONFIG_FUTEX) += futex.o
+obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o
diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index 8a8b07dfc444..acfcdd7f059a 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -58,9 +58,8 @@ bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd)
return false;
if (cd->flags & IORING_ASYNC_CANCEL_ALL) {
check_seq:
- if (cd->seq == req->work.cancel_seq)
+ if (io_cancel_match_sequence(req, cd->seq))
return false;
- req->work.cancel_seq = cd->seq;
}
return true;
diff --git a/io_uring/cancel.h b/io_uring/cancel.h
index c0a8e7c520b6..76b32e65c03c 100644
--- a/io_uring/cancel.h
+++ b/io_uring/cancel.h
@@ -25,4 +25,14 @@ void init_hash_table(struct io_hash_table *table, unsigned size);
int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg);
bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd);
+static inline bool io_cancel_match_sequence(struct io_kiocb *req, int sequence)
+{
+ if ((req->flags & REQ_F_CANCEL_SEQ) && sequence == req->work.cancel_seq)
+ return true;
+
+ req->flags |= REQ_F_CANCEL_SEQ;
+ req->work.cancel_seq = sequence;
+ return false;
+}
+
#endif
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index 976e9500f651..8d444dd1b0a7 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -55,6 +55,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
struct io_ring_ctx *ctx = f->private_data;
struct io_overflow_cqe *ocqe;
struct io_rings *r = ctx->rings;
+ struct rusage sq_usage;
unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
unsigned int sq_head = READ_ONCE(r->sq.head);
unsigned int sq_tail = READ_ONCE(r->sq.tail);
@@ -64,6 +65,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
unsigned int sq_shift = 0;
unsigned int sq_entries, cq_entries;
int sq_pid = -1, sq_cpu = -1;
+ u64 sq_total_time = 0, sq_work_time = 0;
bool has_lock;
unsigned int i;
@@ -145,12 +147,24 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
struct io_sq_data *sq = ctx->sq_data;
- sq_pid = sq->task_pid;
- sq_cpu = sq->sq_cpu;
+ /*
+ * sq->thread might be NULL if we raced with the sqpoll
+ * thread termination.
+ */
+ if (sq->thread) {
+ sq_pid = sq->task_pid;
+ sq_cpu = sq->sq_cpu;
+ getrusage(sq->thread, RUSAGE_SELF, &sq_usage);
+ sq_total_time = (sq_usage.ru_stime.tv_sec * 1000000
+ + sq_usage.ru_stime.tv_usec);
+ sq_work_time = sq->work_time;
+ }
}
seq_printf(m, "SqThread:\t%d\n", sq_pid);
seq_printf(m, "SqThreadCpu:\t%d\n", sq_cpu);
+ seq_printf(m, "SqTotalTime:\t%llu\n", sq_total_time);
+ seq_printf(m, "SqWorkTime:\t%llu\n", sq_work_time);
seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
struct file *f = io_file_from_index(&ctx->file_table, i);
diff --git a/io_uring/filetable.h b/io_uring/filetable.h
index b47adf170c31..b2435c4dca1f 100644
--- a/io_uring/filetable.h
+++ b/io_uring/filetable.h
@@ -17,7 +17,7 @@ int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset);
int io_register_file_alloc_range(struct io_ring_ctx *ctx,
struct io_uring_file_index_range __user *arg);
-unsigned int io_file_get_flags(struct file *file);
+io_req_flags_t io_file_get_flags(struct file *file);
static inline void io_file_bitmap_clear(struct io_file_table *table, int bit)
{
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index cd9a137ad6ce..cf348c33f485 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -59,7 +59,6 @@
#include <linux/bvec.h>
#include <linux/net.h>
#include <net/sock.h>
-#include <net/af_unix.h>
#include <linux/anon_inodes.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
@@ -95,6 +94,7 @@
#include "notif.h"
#include "waitid.h"
#include "futex.h"
+#include "napi.h"
#include "timeout.h"
#include "poll.h"
@@ -122,11 +122,6 @@
#define IO_COMPL_BATCH 32
#define IO_REQ_ALLOC_BATCH 8
-enum {
- IO_CHECK_CQ_OVERFLOW_BIT,
- IO_CHECK_CQ_DROPPED_BIT,
-};
-
struct io_defer_entry {
struct list_head list;
struct io_kiocb *req;
@@ -349,6 +344,8 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd);
+ io_napi_init(ctx);
+
return ctx;
err:
kfree(ctx->cancel_table.hbs);
@@ -463,7 +460,6 @@ static void io_prep_async_work(struct io_kiocb *req)
req->work.list.next = NULL;
req->work.flags = 0;
- req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
if (req->flags & REQ_F_FORCE_ASYNC)
req->work.flags |= IO_WQ_WORK_CONCURRENT;
@@ -670,7 +666,6 @@ static void io_cq_unlock_post(struct io_ring_ctx *ctx)
io_commit_cqring_flush(ctx);
}
-/* Returns true if there are no backlogged entries after the flush */
static void io_cqring_overflow_kill(struct io_ring_ctx *ctx)
{
struct io_overflow_cqe *ocqe;
@@ -949,6 +944,8 @@ bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags)
u64 user_data = req->cqe.user_data;
struct io_uring_cqe *cqe;
+ lockdep_assert(!io_wq_current_is_worker());
+
if (!defer)
return __io_post_aux_cqe(ctx, user_data, res, cflags, false);
@@ -1025,15 +1022,15 @@ static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
{
- if (req->ctx->task_complete && req->ctx->submitter_task != current) {
+ struct io_ring_ctx *ctx = req->ctx;
+
+ if (ctx->task_complete && ctx->submitter_task != current) {
req->io_task_work.func = io_req_task_complete;
io_req_task_work_add(req);
} else if (!(issue_flags & IO_URING_F_UNLOCKED) ||
- !(req->ctx->flags & IORING_SETUP_IOPOLL)) {
+ !(ctx->flags & IORING_SETUP_IOPOLL)) {
__io_req_complete_post(req, issue_flags);
} else {
- struct io_ring_ctx *ctx = req->ctx;
-
mutex_lock(&ctx->uring_lock);
__io_req_complete_post(req, issue_flags & ~IO_URING_F_UNLOCKED);
mutex_unlock(&ctx->uring_lock);
@@ -1174,40 +1171,44 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, struct io_tw_state *ts)
percpu_ref_put(&ctx->refs);
}
-static unsigned int handle_tw_list(struct llist_node *node,
- struct io_ring_ctx **ctx,
- struct io_tw_state *ts,
- struct llist_node *last)
+/*
+ * Run queued task_work, returning the number of entries processed in *count.
+ * If more entries than max_entries are available, stop processing once this
+ * is reached and return the rest of the list.
+ */
+struct llist_node *io_handle_tw_list(struct llist_node *node,
+ unsigned int *count,
+ unsigned int max_entries)
{
- unsigned int count = 0;
+ struct io_ring_ctx *ctx = NULL;
+ struct io_tw_state ts = { };
- while (node && node != last) {
+ do {
struct llist_node *next = node->next;
struct io_kiocb *req = container_of(node, struct io_kiocb,
io_task_work.node);
- prefetch(container_of(next, struct io_kiocb, io_task_work.node));
-
- if (req->ctx != *ctx) {
- ctx_flush_and_put(*ctx, ts);
- *ctx = req->ctx;
+ if (req->ctx != ctx) {
+ ctx_flush_and_put(ctx, &ts);
+ ctx = req->ctx;
/* if not contended, grab and improve batching */
- ts->locked = mutex_trylock(&(*ctx)->uring_lock);
- percpu_ref_get(&(*ctx)->refs);
+ ts.locked = mutex_trylock(&ctx->uring_lock);
+ percpu_ref_get(&ctx->refs);
}
INDIRECT_CALL_2(req->io_task_work.func,
io_poll_task_func, io_req_rw_complete,
- req, ts);
+ req, &ts);
node = next;
- count++;
+ (*count)++;
if (unlikely(need_resched())) {
- ctx_flush_and_put(*ctx, ts);
- *ctx = NULL;
+ ctx_flush_and_put(ctx, &ts);
+ ctx = NULL;
cond_resched();
}
- }
+ } while (node && *count < max_entries);
- return count;
+ ctx_flush_and_put(ctx, &ts);
+ return node;
}
/**
@@ -1224,22 +1225,6 @@ static inline struct llist_node *io_llist_xchg(struct llist_head *head,
return xchg(&head->first, new);
}
-/**
- * io_llist_cmpxchg - possibly swap all entries in a lock-less list
- * @head: the head of lock-less list to delete all entries
- * @old: expected old value of the first entry of the list
- * @new: new entry as the head of the list
- *
- * perform a cmpxchg on the first entry of the list.
- */
-
-static inline struct llist_node *io_llist_cmpxchg(struct llist_head *head,
- struct llist_node *old,
- struct llist_node *new)
-{
- return cmpxchg(&head->first, old, new);
-}
-
static __cold void io_fallback_tw(struct io_uring_task *tctx, bool sync)
{
struct llist_node *node = llist_del_all(&tctx->task_list);
@@ -1268,45 +1253,41 @@ static __cold void io_fallback_tw(struct io_uring_task *tctx, bool sync)
}
}
-void tctx_task_work(struct callback_head *cb)
+struct llist_node *tctx_task_work_run(struct io_uring_task *tctx,
+ unsigned int max_entries,
+ unsigned int *count)
{
- struct io_tw_state ts = {};
- struct io_ring_ctx *ctx = NULL;
- struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
- task_work);
- struct llist_node fake = {};
struct llist_node *node;
- unsigned int loops = 0;
- unsigned int count = 0;
if (unlikely(current->flags & PF_EXITING)) {
io_fallback_tw(tctx, true);
- return;
+ return NULL;
}
- do {
- loops++;
- node = io_llist_xchg(&tctx->task_list, &fake);
- count += handle_tw_list(node, &ctx, &ts, &fake);
-
- /* skip expensive cmpxchg if there are items in the list */
- if (READ_ONCE(tctx->task_list.first) != &fake)
- continue;
- if (ts.locked && !wq_list_empty(&ctx->submit_state.compl_reqs)) {
- io_submit_flush_completions(ctx);
- if (READ_ONCE(tctx->task_list.first) != &fake)
- continue;
- }
- node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
- } while (node != &fake);
-
- ctx_flush_and_put(ctx, &ts);
+ node = llist_del_all(&tctx->task_list);
+ if (node) {
+ node = llist_reverse_order(node);
+ node = io_handle_tw_list(node, count, max_entries);
+ }
/* relaxed read is enough as only the task itself sets ->in_cancel */
if (unlikely(atomic_read(&tctx->in_cancel)))
io_uring_drop_tctx_refs(current);
- trace_io_uring_task_work_run(tctx, count, loops);
+ trace_io_uring_task_work_run(tctx, *count);
+ return node;
+}
+
+void tctx_task_work(struct callback_head *cb)
+{
+ struct io_uring_task *tctx;
+ struct llist_node *ret;
+ unsigned int count = 0;
+
+ tctx = container_of(cb, struct io_uring_task, task_work);
+ ret = tctx_task_work_run(tctx, UINT_MAX, &count);
+ /* can't happen */
+ WARN_ON_ONCE(ret);
}
static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
@@ -1389,6 +1370,15 @@ static void io_req_normal_work_add(struct io_kiocb *req)
if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
+ /* SQPOLL doesn't need the task_work added, it'll run it itself */
+ if (ctx->flags & IORING_SETUP_SQPOLL) {
+ struct io_sq_data *sqd = ctx->sq_data;
+
+ if (wq_has_sleeper(&sqd->wait))
+ wake_up(&sqd->wait);
+ return;
+ }
+
if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method)))
return;
@@ -1420,7 +1410,20 @@ static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)
}
}
-static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts)
+static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events,
+ int min_events)
+{
+ if (llist_empty(&ctx->work_llist))
+ return false;
+ if (events < min_events)
+ return true;
+ if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
+ atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
+ return false;
+}
+
+static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts,
+ int min_events)
{
struct llist_node *node;
unsigned int loops = 0;
@@ -1440,7 +1443,6 @@ again:
struct llist_node *next = node->next;
struct io_kiocb *req = container_of(node, struct io_kiocb,
io_task_work.node);
- prefetch(container_of(next, struct io_kiocb, io_task_work.node));
INDIRECT_CALL_2(req->io_task_work.func,
io_poll_task_func, io_req_rw_complete,
req, ts);
@@ -1449,18 +1451,20 @@ again:
}
loops++;
- if (!llist_empty(&ctx->work_llist))
+ if (io_run_local_work_continue(ctx, ret, min_events))
goto again;
if (ts->locked) {
io_submit_flush_completions(ctx);
- if (!llist_empty(&ctx->work_llist))
+ if (io_run_local_work_continue(ctx, ret, min_events))
goto again;
}
+
trace_io_uring_local_work_run(ctx, ret, loops);
return ret;
}
-static inline int io_run_local_work_locked(struct io_ring_ctx *ctx)
+static inline int io_run_local_work_locked(struct io_ring_ctx *ctx,
+ int min_events)
{
struct io_tw_state ts = { .locked = true, };
int ret;
@@ -1468,20 +1472,20 @@ static inline int io_run_local_work_locked(struct io_ring_ctx *ctx)
if (llist_empty(&ctx->work_llist))
return 0;
- ret = __io_run_local_work(ctx, &ts);
+ ret = __io_run_local_work(ctx, &ts, min_events);
/* shouldn't happen! */
if (WARN_ON_ONCE(!ts.locked))
mutex_lock(&ctx->uring_lock);
return ret;
}
-static int io_run_local_work(struct io_ring_ctx *ctx)
+static int io_run_local_work(struct io_ring_ctx *ctx, int min_events)
{
struct io_tw_state ts = {};
int ret;
ts.locked = mutex_trylock(&ctx->uring_lock);
- ret = __io_run_local_work(ctx, &ts);
+ ret = __io_run_local_work(ctx, &ts, min_events);
if (ts.locked)
mutex_unlock(&ctx->uring_lock);
@@ -1677,7 +1681,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
io_task_work_pending(ctx)) {
u32 tail = ctx->cached_cq_tail;
- (void) io_run_local_work_locked(ctx);
+ (void) io_run_local_work_locked(ctx, min);
if (task_work_pending(current) ||
wq_list_empty(&ctx->iopoll_list)) {
@@ -1768,9 +1772,9 @@ static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
}
}
-unsigned int io_file_get_flags(struct file *file)
+io_req_flags_t io_file_get_flags(struct file *file)
{
- unsigned int res = 0;
+ io_req_flags_t res = 0;
if (S_ISREG(file_inode(file)->i_mode))
res |= REQ_F_ISREG;
@@ -1966,10 +1970,28 @@ fail:
goto fail;
}
+ /*
+ * If DEFER_TASKRUN is set, it's only allowed to post CQEs from the
+ * submitter task context. Final request completions are handed to the
+ * right context, however this is not the case of auxiliary CQEs,
+ * which is the main mean of operation for multishot requests.
+ * Don't allow any multishot execution from io-wq. It's more restrictive
+ * than necessary and also cleaner.
+ */
+ if (req->flags & REQ_F_APOLL_MULTISHOT) {
+ err = -EBADFD;
+ if (!io_file_can_poll(req))
+ goto fail;
+ err = -ECANCELED;
+ if (io_arm_poll_handler(req, issue_flags) != IO_APOLL_OK)
+ goto fail;
+ return;
+ }
+
if (req->flags & REQ_F_FORCE_ASYNC) {
bool opcode_poll = def->pollin || def->pollout;
- if (opcode_poll && file_can_poll(req->file)) {
+ if (opcode_poll && io_file_can_poll(req)) {
needs_poll = true;
issue_flags |= IO_URING_F_NONBLOCK;
}
@@ -2171,7 +2193,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
/* req is partially pre-initialised, see io_preinit_req() */
req->opcode = opcode = READ_ONCE(sqe->opcode);
/* same numerical values with corresponding REQ_F_*, safe to copy */
- req->flags = sqe_flags = READ_ONCE(sqe->flags);
+ sqe_flags = READ_ONCE(sqe->flags);
+ req->flags = (io_req_flags_t) sqe_flags;
req->cqe.user_data = READ_ONCE(sqe->user_data);
req->file = NULL;
req->rsrc_node = NULL;
@@ -2475,33 +2498,6 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
return ret;
}
-struct io_wait_queue {
- struct wait_queue_entry wq;
- struct io_ring_ctx *ctx;
- unsigned cq_tail;
- unsigned nr_timeouts;
- ktime_t timeout;
-};
-
-static inline bool io_has_work(struct io_ring_ctx *ctx)
-{
- return test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq) ||
- !llist_empty(&ctx->work_llist);
-}
-
-static inline bool io_should_wake(struct io_wait_queue *iowq)
-{
- struct io_ring_ctx *ctx = iowq->ctx;
- int dist = READ_ONCE(ctx->rings->cq.tail) - (int) iowq->cq_tail;
-
- /*
- * Wake up if we have enough events, or if a timeout occurred since we
- * started waiting. For timeouts, we always want to return to userspace,
- * regardless of event count.
- */
- return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
-}
-
static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
int wake_flags, void *key)
{
@@ -2520,7 +2516,7 @@ int io_run_task_work_sig(struct io_ring_ctx *ctx)
{
if (!llist_empty(&ctx->work_llist)) {
__set_current_state(TASK_RUNNING);
- if (io_run_local_work(ctx) > 0)
+ if (io_run_local_work(ctx, INT_MAX) > 0)
return 0;
}
if (io_run_task_work() > 0)
@@ -2588,7 +2584,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
if (!io_allowed_run_tw(ctx))
return -EEXIST;
if (!llist_empty(&ctx->work_llist))
- io_run_local_work(ctx);
+ io_run_local_work(ctx, min_events);
io_run_task_work();
io_cqring_overflow_flush(ctx);
/* if user messes with these they will just get an early return */
@@ -2621,16 +2617,19 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
if (get_timespec64(&ts, uts))
return -EFAULT;
+
iowq.timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
+ io_napi_adjust_timeout(ctx, &iowq, &ts);
}
+ io_napi_busy_loop(ctx, &iowq);
+
trace_io_uring_cqring_wait(ctx, min_events);
do {
+ int nr_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail);
unsigned long check_cq;
if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
- int nr_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail);
-
atomic_set(&ctx->cq_wait_nr, nr_wait);
set_current_state(TASK_INTERRUPTIBLE);
} else {
@@ -2649,7 +2648,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
*/
io_run_task_work();
if (!llist_empty(&ctx->work_llist))
- io_run_local_work(ctx);
+ io_run_local_work(ctx, nr_wait);
/*
* Non-local task_work will be run on exit to userspace, but
@@ -2917,6 +2916,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_req_caches_free(ctx);
if (ctx->hash_map)
io_wq_put_hash(ctx->hash_map);
+ io_napi_free(ctx);
kfree(ctx->cancel_table.hbs);
kfree(ctx->cancel_table_locked.hbs);
kfree(ctx->io_bl);
@@ -3304,7 +3304,7 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
io_allowed_defer_tw_run(ctx))
- ret |= io_run_local_work(ctx) > 0;
+ ret |= io_run_local_work(ctx, INT_MAX) > 0;
ret |= io_cancel_defer_files(ctx, task, cancel_all);
mutex_lock(&ctx->uring_lock);
ret |= io_poll_remove_all(ctx, task, cancel_all);
@@ -3666,7 +3666,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
* it should handle ownership problems if any.
*/
if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
- (void)io_run_local_work_locked(ctx);
+ (void)io_run_local_work_locked(ctx, min_complete);
}
mutex_unlock(&ctx->uring_lock);
}
@@ -4153,7 +4153,7 @@ static int __init io_uring_init(void)
BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8));
BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS);
- BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));
+ BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof_field(struct io_kiocb, flags));
BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32));
@@ -4175,9 +4175,8 @@ static int __init io_uring_init(void)
SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU,
offsetof(struct io_kiocb, cmd.data),
sizeof_field(struct io_kiocb, cmd.data), NULL);
- io_buf_cachep = kmem_cache_create("io_buffer", sizeof(struct io_buffer), 0,
- SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT,
- NULL);
+ io_buf_cachep = KMEM_CACHE(io_buffer,
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
#ifdef CONFIG_SYSCTL
register_sysctl_init("kernel", kernel_io_uring_disabled_table);
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index d5495710c178..6426ee382276 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -5,6 +5,7 @@
#include <linux/lockdep.h>
#include <linux/resume_user_mode.h>
#include <linux/kasan.h>
+#include <linux/poll.h>
#include <linux/io_uring_types.h>
#include <uapi/linux/eventpoll.h>
#include "io-wq.h"
@@ -34,6 +35,32 @@ enum {
IOU_STOP_MULTISHOT = -ECANCELED,
};
+struct io_wait_queue {
+ struct wait_queue_entry wq;
+ struct io_ring_ctx *ctx;
+ unsigned cq_tail;
+ unsigned nr_timeouts;
+ ktime_t timeout;
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ unsigned int napi_busy_poll_to;
+ bool napi_prefer_busy_poll;
+#endif
+};
+
+static inline bool io_should_wake(struct io_wait_queue *iowq)
+{
+ struct io_ring_ctx *ctx = iowq->ctx;
+ int dist = READ_ONCE(ctx->rings->cq.tail) - (int) iowq->cq_tail;
+
+ /*
+ * Wake up if we have enough events, or if a timeout occurred since we
+ * started waiting. For timeouts, we always want to return to userspace,
+ * regardless of event count.
+ */
+ return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
+}
+
bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow);
void io_req_cqe_overflow(struct io_kiocb *req);
int io_run_task_work_sig(struct io_ring_ctx *ctx);
@@ -56,6 +83,8 @@ void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use);
void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts);
void io_req_task_queue_fail(struct io_kiocb *req, int ret);
void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts);
+struct llist_node *io_handle_tw_list(struct llist_node *node, unsigned int *count, unsigned int max_entries);
+struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, unsigned int max_entries, unsigned int *count);
void tctx_task_work(struct callback_head *cb);
__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
int io_uring_alloc_task_context(struct task_struct *task,
@@ -207,7 +236,7 @@ static inline void io_ring_submit_unlock(struct io_ring_ctx *ctx,
unsigned issue_flags)
{
lockdep_assert_held(&ctx->uring_lock);
- if (issue_flags & IO_URING_F_UNLOCKED)
+ if (unlikely(issue_flags & IO_URING_F_UNLOCKED))
mutex_unlock(&ctx->uring_lock);
}
@@ -220,7 +249,7 @@ static inline void io_ring_submit_lock(struct io_ring_ctx *ctx,
* The only exception is when we've detached the request and issue it
* from an async worker thread, grab the lock for that case.
*/
- if (issue_flags & IO_URING_F_UNLOCKED)
+ if (unlikely(issue_flags & IO_URING_F_UNLOCKED))
mutex_lock(&ctx->uring_lock);
lockdep_assert_held(&ctx->uring_lock);
}
@@ -274,6 +303,8 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
static inline int io_run_task_work(void)
{
+ bool ret = false;
+
/*
* Always check-and-clear the task_work notification signal. With how
* signaling works for task_work, we can find it set with nothing to
@@ -285,18 +316,26 @@ static inline int io_run_task_work(void)
* PF_IO_WORKER never returns to userspace, so check here if we have
* notify work that needs processing.
*/
- if (current->flags & PF_IO_WORKER &&
- test_thread_flag(TIF_NOTIFY_RESUME)) {
- __set_current_state(TASK_RUNNING);
- resume_user_mode_work(NULL);
+ if (current->flags & PF_IO_WORKER) {
+ if (test_thread_flag(TIF_NOTIFY_RESUME)) {
+ __set_current_state(TASK_RUNNING);
+ resume_user_mode_work(NULL);
+ }
+ if (current->io_uring) {
+ unsigned int count = 0;
+
+ tctx_task_work_run(current->io_uring, UINT_MAX, &count);
+ if (count)
+ ret = true;
+ }
}
if (task_work_pending(current)) {
__set_current_state(TASK_RUNNING);
task_work_run();
- return 1;
+ ret = true;
}
- return 0;
+ return ret;
}
static inline bool io_task_work_pending(struct io_ring_ctx *ctx)
@@ -398,4 +437,26 @@ static inline size_t uring_sqe_size(struct io_ring_ctx *ctx)
return 2 * sizeof(struct io_uring_sqe);
return sizeof(struct io_uring_sqe);
}
+
+static inline bool io_file_can_poll(struct io_kiocb *req)
+{
+ if (req->flags & REQ_F_CAN_POLL)
+ return true;
+ if (file_can_poll(req->file)) {
+ req->flags |= REQ_F_CAN_POLL;
+ return true;
+ }
+ return false;
+}
+
+enum {
+ IO_CHECK_CQ_OVERFLOW_BIT,
+ IO_CHECK_CQ_DROPPED_BIT,
+};
+
+static inline bool io_has_work(struct io_ring_ctx *ctx)
+{
+ return test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq) ||
+ !llist_empty(&ctx->work_llist);
+}
#endif
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 18df5a9d2f5e..9be42bff936b 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -81,15 +81,6 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
struct io_buffer_list *bl;
struct io_buffer *buf;
- /*
- * For legacy provided buffer mode, don't recycle if we already did
- * IO to this buffer. For ring-mapped provided buffer mode, we should
- * increment ring->head to explicitly monopolize the buffer to avoid
- * multiple use.
- */
- if (req->flags & REQ_F_PARTIAL_IO)
- return false;
-
io_ring_submit_lock(ctx, issue_flags);
buf = req->kbuf;
@@ -102,10 +93,8 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
return true;
}
-unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags)
+void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags)
{
- unsigned int cflags;
-
/*
* We can add this buffer back to two lists:
*
@@ -118,21 +107,17 @@ unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags)
* We migrate buffers from the comp_list to the issue cache list
* when we need one.
*/
- if (req->flags & REQ_F_BUFFER_RING) {
- /* no buffers to recycle for this case */
- cflags = __io_put_kbuf_list(req, NULL);
- } else if (issue_flags & IO_URING_F_UNLOCKED) {
+ if (issue_flags & IO_URING_F_UNLOCKED) {
struct io_ring_ctx *ctx = req->ctx;
spin_lock(&ctx->completion_lock);
- cflags = __io_put_kbuf_list(req, &ctx->io_buffers_comp);
+ __io_put_kbuf_list(req, &ctx->io_buffers_comp);
spin_unlock(&ctx->completion_lock);
} else {
lockdep_assert_held(&req->ctx->uring_lock);
- cflags = __io_put_kbuf_list(req, &req->ctx->io_buffers_cache);
+ __io_put_kbuf_list(req, &req->ctx->io_buffers_cache);
}
- return cflags;
}
static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
@@ -145,6 +130,8 @@ static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
list_del(&kbuf->list);
if (*len == 0 || *len > kbuf->len)
*len = kbuf->len;
+ if (list_empty(&bl->buf_list))
+ req->flags |= REQ_F_BL_EMPTY;
req->flags |= REQ_F_BUFFER_SELECTED;
req->kbuf = kbuf;
req->buf_index = kbuf->bid;
@@ -158,12 +145,16 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
unsigned int issue_flags)
{
struct io_uring_buf_ring *br = bl->buf_ring;
+ __u16 tail, head = bl->head;
struct io_uring_buf *buf;
- __u16 head = bl->head;
- if (unlikely(smp_load_acquire(&br->tail) == head))
+ tail = smp_load_acquire(&br->tail);
+ if (unlikely(tail == head))
return NULL;
+ if (head + 1 == tail)
+ req->flags |= REQ_F_BL_EMPTY;
+
head &= bl->mask;
/* mmaped buffers are always contig */
if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) {
@@ -180,7 +171,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
req->buf_list = bl;
req->buf_index = buf->bid;
- if (issue_flags & IO_URING_F_UNLOCKED || !file_can_poll(req->file)) {
+ if (issue_flags & IO_URING_F_UNLOCKED || !io_file_can_poll(req)) {
/*
* If we came in unlocked, we have no choice but to consume the
* buffer here, otherwise nothing ensures that the buffer won't
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index 53dfaa71a397..5218bfd79e87 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -57,7 +57,7 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg);
void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx);
-unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags);
+void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags);
bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
@@ -73,21 +73,9 @@ static inline bool io_kbuf_recycle_ring(struct io_kiocb *req)
* to monopolize the buffer.
*/
if (req->buf_list) {
- if (req->flags & REQ_F_PARTIAL_IO) {
- /*
- * If we end up here, then the io_uring_lock has
- * been kept held since we retrieved the buffer.
- * For the io-wq case, we already cleared
- * req->buf_list when the buffer was retrieved,
- * hence it cannot be set here for that case.
- */
- req->buf_list->head++;
- req->buf_list = NULL;
- } else {
- req->buf_index = req->buf_list->bgid;
- req->flags &= ~REQ_F_BUFFER_RING;
- return true;
- }
+ req->buf_index = req->buf_list->bgid;
+ req->flags &= ~REQ_F_BUFFER_RING;
+ return true;
}
return false;
}
@@ -101,6 +89,8 @@ static inline bool io_do_buffer_select(struct io_kiocb *req)
static inline bool io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
{
+ if (req->flags & REQ_F_BL_NO_RECYCLE)
+ return false;
if (req->flags & REQ_F_BUFFER_SELECTED)
return io_kbuf_recycle_legacy(req, issue_flags);
if (req->flags & REQ_F_BUFFER_RING)
@@ -108,41 +98,54 @@ static inline bool io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
return false;
}
-static inline unsigned int __io_put_kbuf_list(struct io_kiocb *req,
- struct list_head *list)
+static inline void __io_put_kbuf_ring(struct io_kiocb *req)
{
- unsigned int ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT);
+ if (req->buf_list) {
+ req->buf_index = req->buf_list->bgid;
+ req->buf_list->head++;
+ }
+ req->flags &= ~REQ_F_BUFFER_RING;
+}
+static inline void __io_put_kbuf_list(struct io_kiocb *req,
+ struct list_head *list)
+{
if (req->flags & REQ_F_BUFFER_RING) {
- if (req->buf_list) {
- req->buf_index = req->buf_list->bgid;
- req->buf_list->head++;
- }
- req->flags &= ~REQ_F_BUFFER_RING;
+ __io_put_kbuf_ring(req);
} else {
req->buf_index = req->kbuf->bgid;
list_add(&req->kbuf->list, list);
req->flags &= ~REQ_F_BUFFER_SELECTED;
}
-
- return ret;
}
static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req)
{
+ unsigned int ret;
+
lockdep_assert_held(&req->ctx->completion_lock);
if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
return 0;
- return __io_put_kbuf_list(req, &req->ctx->io_buffers_comp);
+
+ ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT);
+ __io_put_kbuf_list(req, &req->ctx->io_buffers_comp);
+ return ret;
}
static inline unsigned int io_put_kbuf(struct io_kiocb *req,
unsigned issue_flags)
{
+ unsigned int ret;
- if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
+ if (!(req->flags & (REQ_F_BUFFER_RING | REQ_F_BUFFER_SELECTED)))
return 0;
- return __io_put_kbuf(req, issue_flags);
+
+ ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT);
+ if (req->flags & REQ_F_BUFFER_RING)
+ __io_put_kbuf_ring(req);
+ else
+ __io_put_kbuf(req, issue_flags);
+ return ret;
}
#endif
diff --git a/io_uring/napi.c b/io_uring/napi.c
new file mode 100644
index 000000000000..883a1a665907
--- /dev/null
+++ b/io_uring/napi.c
@@ -0,0 +1,332 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "io_uring.h"
+#include "napi.h"
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+
+/* Timeout for cleanout of stale entries. */
+#define NAPI_TIMEOUT (60 * SEC_CONVERSION)
+
+struct io_napi_entry {
+ unsigned int napi_id;
+ struct list_head list;
+
+ unsigned long timeout;
+ struct hlist_node node;
+
+ struct rcu_head rcu;
+};
+
+static struct io_napi_entry *io_napi_hash_find(struct hlist_head *hash_list,
+ unsigned int napi_id)
+{
+ struct io_napi_entry *e;
+
+ hlist_for_each_entry_rcu(e, hash_list, node) {
+ if (e->napi_id != napi_id)
+ continue;
+ e->timeout = jiffies + NAPI_TIMEOUT;
+ return e;
+ }
+
+ return NULL;
+}
+
+void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock)
+{
+ struct hlist_head *hash_list;
+ unsigned int napi_id;
+ struct sock *sk;
+ struct io_napi_entry *e;
+
+ sk = sock->sk;
+ if (!sk)
+ return;
+
+ napi_id = READ_ONCE(sk->sk_napi_id);
+
+ /* Non-NAPI IDs can be rejected. */
+ if (napi_id < MIN_NAPI_ID)
+ return;
+
+ hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))];
+
+ rcu_read_lock();
+ e = io_napi_hash_find(hash_list, napi_id);
+ if (e) {
+ e->timeout = jiffies + NAPI_TIMEOUT;
+ rcu_read_unlock();
+ return;
+ }
+ rcu_read_unlock();
+
+ e = kmalloc(sizeof(*e), GFP_NOWAIT);
+ if (!e)
+ return;
+
+ e->napi_id = napi_id;
+ e->timeout = jiffies + NAPI_TIMEOUT;
+
+ spin_lock(&ctx->napi_lock);
+ if (unlikely(io_napi_hash_find(hash_list, napi_id))) {
+ spin_unlock(&ctx->napi_lock);
+ kfree(e);
+ return;
+ }
+
+ hlist_add_tail_rcu(&e->node, hash_list);
+ list_add_tail(&e->list, &ctx->napi_list);
+ spin_unlock(&ctx->napi_lock);
+}
+
+static void __io_napi_remove_stale(struct io_ring_ctx *ctx)
+{
+ struct io_napi_entry *e;
+ unsigned int i;
+
+ spin_lock(&ctx->napi_lock);
+ hash_for_each(ctx->napi_ht, i, e, node) {
+ if (time_after(jiffies, e->timeout)) {
+ list_del(&e->list);
+ hash_del_rcu(&e->node);
+ kfree_rcu(e, rcu);
+ }
+ }
+ spin_unlock(&ctx->napi_lock);
+}
+
+static inline void io_napi_remove_stale(struct io_ring_ctx *ctx, bool is_stale)
+{
+ if (is_stale)
+ __io_napi_remove_stale(ctx);
+}
+
+static inline bool io_napi_busy_loop_timeout(unsigned long start_time,
+ unsigned long bp_usec)
+{
+ if (bp_usec) {
+ unsigned long end_time = start_time + bp_usec;
+ unsigned long now = busy_loop_current_time();
+
+ return time_after(now, end_time);
+ }
+
+ return true;
+}
+
+static bool io_napi_busy_loop_should_end(void *data,
+ unsigned long start_time)
+{
+ struct io_wait_queue *iowq = data;
+
+ if (signal_pending(current))
+ return true;
+ if (io_should_wake(iowq) || io_has_work(iowq->ctx))
+ return true;
+ if (io_napi_busy_loop_timeout(start_time, iowq->napi_busy_poll_to))
+ return true;
+
+ return false;
+}
+
+static bool __io_napi_do_busy_loop(struct io_ring_ctx *ctx,
+ void *loop_end_arg)
+{
+ struct io_napi_entry *e;
+ bool (*loop_end)(void *, unsigned long) = NULL;
+ bool is_stale = false;
+
+ if (loop_end_arg)
+ loop_end = io_napi_busy_loop_should_end;
+
+ list_for_each_entry_rcu(e, &ctx->napi_list, list) {
+ napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg,
+ ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET);
+
+ if (time_after(jiffies, e->timeout))
+ is_stale = true;
+ }
+
+ return is_stale;
+}
+
+static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx,
+ struct io_wait_queue *iowq)
+{
+ unsigned long start_time = busy_loop_current_time();
+ void *loop_end_arg = NULL;
+ bool is_stale = false;
+
+ /* Singular lists use a different napi loop end check function and are
+ * only executed once.
+ */
+ if (list_is_singular(&ctx->napi_list))
+ loop_end_arg = iowq;
+
+ rcu_read_lock();
+ do {
+ is_stale = __io_napi_do_busy_loop(ctx, loop_end_arg);
+ } while (!io_napi_busy_loop_should_end(iowq, start_time) && !loop_end_arg);
+ rcu_read_unlock();
+
+ io_napi_remove_stale(ctx, is_stale);
+}
+
+/*
+ * io_napi_init() - Init napi settings
+ * @ctx: pointer to io-uring context structure
+ *
+ * Init napi settings in the io-uring context.
+ */
+void io_napi_init(struct io_ring_ctx *ctx)
+{
+ INIT_LIST_HEAD(&ctx->napi_list);
+ spin_lock_init(&ctx->napi_lock);
+ ctx->napi_prefer_busy_poll = false;
+ ctx->napi_busy_poll_to = READ_ONCE(sysctl_net_busy_poll);
+}
+
+/*
+ * io_napi_free() - Deallocate napi
+ * @ctx: pointer to io-uring context structure
+ *
+ * Free the napi list and the hash table in the io-uring context.
+ */
+void io_napi_free(struct io_ring_ctx *ctx)
+{
+ struct io_napi_entry *e;
+ LIST_HEAD(napi_list);
+ unsigned int i;
+
+ spin_lock(&ctx->napi_lock);
+ hash_for_each(ctx->napi_ht, i, e, node) {
+ hash_del_rcu(&e->node);
+ kfree_rcu(e, rcu);
+ }
+ spin_unlock(&ctx->napi_lock);
+}
+
+/*
+ * io_napi_register() - Register napi with io-uring
+ * @ctx: pointer to io-uring context structure
+ * @arg: pointer to io_uring_napi structure
+ *
+ * Register napi in the io-uring context.
+ */
+int io_register_napi(struct io_ring_ctx *ctx, void __user *arg)
+{
+ const struct io_uring_napi curr = {
+ .busy_poll_to = ctx->napi_busy_poll_to,
+ .prefer_busy_poll = ctx->napi_prefer_busy_poll
+ };
+ struct io_uring_napi napi;
+
+ if (copy_from_user(&napi, arg, sizeof(napi)))
+ return -EFAULT;
+ if (napi.pad[0] || napi.pad[1] || napi.pad[2] || napi.resv)
+ return -EINVAL;
+
+ if (copy_to_user(arg, &curr, sizeof(curr)))
+ return -EFAULT;
+
+ WRITE_ONCE(ctx->napi_busy_poll_to, napi.busy_poll_to);
+ WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi.prefer_busy_poll);
+ WRITE_ONCE(ctx->napi_enabled, true);
+ return 0;
+}
+
+/*
+ * io_napi_unregister() - Unregister napi with io-uring
+ * @ctx: pointer to io-uring context structure
+ * @arg: pointer to io_uring_napi structure
+ *
+ * Unregister napi. If arg has been specified copy the busy poll timeout and
+ * prefer busy poll setting to the passed in structure.
+ */
+int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)
+{
+ const struct io_uring_napi curr = {
+ .busy_poll_to = ctx->napi_busy_poll_to,
+ .prefer_busy_poll = ctx->napi_prefer_busy_poll
+ };
+
+ if (arg && copy_to_user(arg, &curr, sizeof(curr)))
+ return -EFAULT;
+
+ WRITE_ONCE(ctx->napi_busy_poll_to, 0);
+ WRITE_ONCE(ctx->napi_prefer_busy_poll, false);
+ WRITE_ONCE(ctx->napi_enabled, false);
+ return 0;
+}
+
+/*
+ * __io_napi_adjust_timeout() - Add napi id to the busy poll list
+ * @ctx: pointer to io-uring context structure
+ * @iowq: pointer to io wait queue
+ * @ts: pointer to timespec or NULL
+ *
+ * Adjust the busy loop timeout according to timespec and busy poll timeout.
+ */
+void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq,
+ struct timespec64 *ts)
+{
+ unsigned int poll_to = READ_ONCE(ctx->napi_busy_poll_to);
+
+ if (ts) {
+ struct timespec64 poll_to_ts = ns_to_timespec64(1000 * (s64)poll_to);
+
+ if (timespec64_compare(ts, &poll_to_ts) > 0) {
+ *ts = timespec64_sub(*ts, poll_to_ts);
+ } else {
+ u64 to = timespec64_to_ns(ts);
+
+ do_div(to, 1000);
+ ts->tv_sec = 0;
+ ts->tv_nsec = 0;
+ }
+ }
+
+ iowq->napi_busy_poll_to = poll_to;
+}
+
+/*
+ * __io_napi_busy_loop() - execute busy poll loop
+ * @ctx: pointer to io-uring context structure
+ * @iowq: pointer to io wait queue
+ *
+ * Execute the busy poll loop and merge the spliced off list.
+ */
+void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq)
+{
+ iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll);
+
+ if (!(ctx->flags & IORING_SETUP_SQPOLL) && ctx->napi_enabled)
+ io_napi_blocking_busy_loop(ctx, iowq);
+}
+
+/*
+ * io_napi_sqpoll_busy_poll() - busy poll loop for sqpoll
+ * @ctx: pointer to io-uring context structure
+ *
+ * Splice of the napi list and execute the napi busy poll loop.
+ */
+int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx)
+{
+ LIST_HEAD(napi_list);
+ bool is_stale = false;
+
+ if (!READ_ONCE(ctx->napi_busy_poll_to))
+ return 0;
+ if (list_empty_careful(&ctx->napi_list))
+ return 0;
+
+ rcu_read_lock();
+ is_stale = __io_napi_do_busy_loop(ctx, NULL);
+ rcu_read_unlock();
+
+ io_napi_remove_stale(ctx, is_stale);
+ return 1;
+}
+
+#endif
diff --git a/io_uring/napi.h b/io_uring/napi.h
new file mode 100644
index 000000000000..6fc0393d0dbe
--- /dev/null
+++ b/io_uring/napi.h
@@ -0,0 +1,104 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef IOU_NAPI_H
+#define IOU_NAPI_H
+
+#include <linux/kernel.h>
+#include <linux/io_uring.h>
+#include <net/busy_poll.h>
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+
+void io_napi_init(struct io_ring_ctx *ctx);
+void io_napi_free(struct io_ring_ctx *ctx);
+
+int io_register_napi(struct io_ring_ctx *ctx, void __user *arg);
+int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg);
+
+void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock);
+
+void __io_napi_adjust_timeout(struct io_ring_ctx *ctx,
+ struct io_wait_queue *iowq, struct timespec64 *ts);
+void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq);
+int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx);
+
+static inline bool io_napi(struct io_ring_ctx *ctx)
+{
+ return !list_empty(&ctx->napi_list);
+}
+
+static inline void io_napi_adjust_timeout(struct io_ring_ctx *ctx,
+ struct io_wait_queue *iowq,
+ struct timespec64 *ts)
+{
+ if (!io_napi(ctx))
+ return;
+ __io_napi_adjust_timeout(ctx, iowq, ts);
+}
+
+static inline void io_napi_busy_loop(struct io_ring_ctx *ctx,
+ struct io_wait_queue *iowq)
+{
+ if (!io_napi(ctx))
+ return;
+ __io_napi_busy_loop(ctx, iowq);
+}
+
+/*
+ * io_napi_add() - Add napi id to the busy poll list
+ * @req: pointer to io_kiocb request
+ *
+ * Add the napi id of the socket to the napi busy poll list and hash table.
+ */
+static inline void io_napi_add(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct socket *sock;
+
+ if (!READ_ONCE(ctx->napi_busy_poll_to))
+ return;
+
+ sock = sock_from_file(req->file);
+ if (sock)
+ __io_napi_add(ctx, sock);
+}
+
+#else
+
+static inline void io_napi_init(struct io_ring_ctx *ctx)
+{
+}
+static inline void io_napi_free(struct io_ring_ctx *ctx)
+{
+}
+static inline int io_register_napi(struct io_ring_ctx *ctx, void __user *arg)
+{
+ return -EOPNOTSUPP;
+}
+static inline int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)
+{
+ return -EOPNOTSUPP;
+}
+static inline bool io_napi(struct io_ring_ctx *ctx)
+{
+ return false;
+}
+static inline void io_napi_add(struct io_kiocb *req)
+{
+}
+static inline void io_napi_adjust_timeout(struct io_ring_ctx *ctx,
+ struct io_wait_queue *iowq,
+ struct timespec64 *ts)
+{
+}
+static inline void io_napi_busy_loop(struct io_ring_ctx *ctx,
+ struct io_wait_queue *iowq)
+{
+}
+static inline int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx)
+{
+ return 0;
+}
+#endif /* CONFIG_NET_RX_BUSY_POLL */
+
+#endif
diff --git a/io_uring/net.c b/io_uring/net.c
index 161622029147..19451f0dbf81 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -78,19 +78,6 @@ struct io_sr_msg {
*/
#define MULTISHOT_MAX_RETRY 32
-static inline bool io_check_multishot(struct io_kiocb *req,
- unsigned int issue_flags)
-{
- /*
- * When ->locked_cq is set we only allow to post CQEs from the original
- * task context. Usual request completions will be handled in other
- * generic paths but multipoll may decide to post extra cqes.
- */
- return !(issue_flags & IO_URING_F_IOWQ) ||
- !(issue_flags & IO_URING_F_MULTISHOT) ||
- !req->ctx->task_complete;
-}
-
int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
@@ -204,16 +191,130 @@ static int io_setup_async_msg(struct io_kiocb *req,
return -EAGAIN;
}
+#ifdef CONFIG_COMPAT
+static int io_compat_msg_copy_hdr(struct io_kiocb *req,
+ struct io_async_msghdr *iomsg,
+ struct compat_msghdr *msg, int ddir)
+{
+ struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
+ struct compat_iovec __user *uiov;
+ int ret;
+
+ if (copy_from_user(msg, sr->umsg_compat, sizeof(*msg)))
+ return -EFAULT;
+
+ uiov = compat_ptr(msg->msg_iov);
+ if (req->flags & REQ_F_BUFFER_SELECT) {
+ compat_ssize_t clen;
+
+ iomsg->free_iov = NULL;
+ if (msg->msg_iovlen == 0) {
+ sr->len = 0;
+ } else if (msg->msg_iovlen > 1) {
+ return -EINVAL;
+ } else {
+ if (!access_ok(uiov, sizeof(*uiov)))
+ return -EFAULT;
+ if (__get_user(clen, &uiov->iov_len))
+ return -EFAULT;
+ if (clen < 0)
+ return -EINVAL;
+ sr->len = clen;
+ }
+
+ return 0;
+ }
+
+ iomsg->free_iov = iomsg->fast_iov;
+ ret = __import_iovec(ddir, (struct iovec __user *)uiov, msg->msg_iovlen,
+ UIO_FASTIOV, &iomsg->free_iov,
+ &iomsg->msg.msg_iter, true);
+ if (unlikely(ret < 0))
+ return ret;
+
+ return 0;
+}
+#endif
+
+static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,
+ struct user_msghdr *msg, int ddir)
+{
+ struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
+ int ret;
+
+ if (!user_access_begin(sr->umsg, sizeof(*sr->umsg)))
+ return -EFAULT;
+
+ ret = -EFAULT;
+ unsafe_get_user(msg->msg_name, &sr->umsg->msg_name, ua_end);
+ unsafe_get_user(msg->msg_namelen, &sr->umsg->msg_namelen, ua_end);
+ unsafe_get_user(msg->msg_iov, &sr->umsg->msg_iov, ua_end);
+ unsafe_get_user(msg->msg_iovlen, &sr->umsg->msg_iovlen, ua_end);
+ unsafe_get_user(msg->msg_control, &sr->umsg->msg_control, ua_end);
+ unsafe_get_user(msg->msg_controllen, &sr->umsg->msg_controllen, ua_end);
+ msg->msg_flags = 0;
+
+ if (req->flags & REQ_F_BUFFER_SELECT) {
+ if (msg->msg_iovlen == 0) {
+ sr->len = iomsg->fast_iov[0].iov_len = 0;
+ iomsg->fast_iov[0].iov_base = NULL;
+ iomsg->free_iov = NULL;
+ } else if (msg->msg_iovlen > 1) {
+ ret = -EINVAL;
+ goto ua_end;
+ } else {
+ /* we only need the length for provided buffers */
+ if (!access_ok(&msg->msg_iov[0].iov_len, sizeof(__kernel_size_t)))
+ goto ua_end;
+ unsafe_get_user(iomsg->fast_iov[0].iov_len,
+ &msg->msg_iov[0].iov_len, ua_end);
+ sr->len = iomsg->fast_iov[0].iov_len;
+ iomsg->free_iov = NULL;
+ }
+ ret = 0;
+ua_end:
+ user_access_end();
+ return ret;
+ }
+
+ user_access_end();
+ iomsg->free_iov = iomsg->fast_iov;
+ ret = __import_iovec(ddir, msg->msg_iov, msg->msg_iovlen, UIO_FASTIOV,
+ &iomsg->free_iov, &iomsg->msg.msg_iter, false);
+ if (unlikely(ret < 0))
+ return ret;
+
+ return 0;
+}
+
static int io_sendmsg_copy_hdr(struct io_kiocb *req,
struct io_async_msghdr *iomsg)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
+ struct user_msghdr msg;
int ret;
iomsg->msg.msg_name = &iomsg->addr;
- iomsg->free_iov = iomsg->fast_iov;
- ret = sendmsg_copy_msghdr(&iomsg->msg, sr->umsg, sr->msg_flags,
- &iomsg->free_iov);
+ iomsg->msg.msg_iter.nr_segs = 0;
+
+#ifdef CONFIG_COMPAT
+ if (unlikely(req->ctx->compat)) {
+ struct compat_msghdr cmsg;
+
+ ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_SOURCE);
+ if (unlikely(ret))
+ return ret;
+
+ return __get_compat_msghdr(&iomsg->msg, &cmsg, NULL);
+ }
+#endif
+
+ ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_SOURCE);
+ if (unlikely(ret))
+ return ret;
+
+ ret = __copy_msghdr(&iomsg->msg, &msg, NULL);
+
/* save msg_control as sys_sendmsg() overwrites it */
sr->msg_control = iomsg->msg.msg_control_user;
return ret;
@@ -273,6 +374,8 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
+ sr->done_io = 0;
+
if (req->opcode == IORING_OP_SEND) {
if (READ_ONCE(sqe->__pad3[0]))
return -EINVAL;
@@ -295,10 +398,20 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (req->ctx->compat)
sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
- sr->done_io = 0;
return 0;
}
+static void io_req_msg_cleanup(struct io_kiocb *req,
+ struct io_async_msghdr *kmsg,
+ unsigned int issue_flags)
+{
+ req->flags &= ~REQ_F_NEED_CLEANUP;
+ /* fast path, check for non-NULL to avoid function call */
+ if (kmsg->free_iov)
+ kfree(kmsg->free_iov);
+ io_netmsg_recycle(req, issue_flags);
+}
+
int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
@@ -341,18 +454,14 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
kmsg->msg.msg_controllen = 0;
kmsg->msg.msg_control = NULL;
sr->done_io += ret;
- req->flags |= REQ_F_PARTIAL_IO;
+ req->flags |= REQ_F_BL_NO_RECYCLE;
return io_setup_async_msg(req, kmsg, issue_flags);
}
if (ret == -ERESTARTSYS)
ret = -EINTR;
req_set_fail(req);
}
- /* fast path, check for non-NULL to avoid function call */
- if (kmsg->free_iov)
- kfree(kmsg->free_iov);
- req->flags &= ~REQ_F_NEED_CLEANUP;
- io_netmsg_recycle(req, issue_flags);
+ io_req_msg_cleanup(req, kmsg, issue_flags);
if (ret >= 0)
ret += sr->done_io;
else if (sr->done_io)
@@ -420,7 +529,7 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
sr->len -= ret;
sr->buf += ret;
sr->done_io += ret;
- req->flags |= REQ_F_PARTIAL_IO;
+ req->flags |= REQ_F_BL_NO_RECYCLE;
return io_setup_async_addr(req, &__address, issue_flags);
}
if (ret == -ERESTARTSYS)
@@ -435,142 +544,77 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
return IOU_OK;
}
-static bool io_recvmsg_multishot_overflow(struct io_async_msghdr *iomsg)
+static int io_recvmsg_mshot_prep(struct io_kiocb *req,
+ struct io_async_msghdr *iomsg,
+ int namelen, size_t controllen)
{
- int hdr;
-
- if (iomsg->namelen < 0)
- return true;
- if (check_add_overflow((int)sizeof(struct io_uring_recvmsg_out),
- iomsg->namelen, &hdr))
- return true;
- if (check_add_overflow(hdr, (int)iomsg->controllen, &hdr))
- return true;
+ if ((req->flags & (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) ==
+ (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) {
+ int hdr;
+
+ if (unlikely(namelen < 0))
+ return -EOVERFLOW;
+ if (check_add_overflow(sizeof(struct io_uring_recvmsg_out),
+ namelen, &hdr))
+ return -EOVERFLOW;
+ if (check_add_overflow(hdr, controllen, &hdr))
+ return -EOVERFLOW;
+
+ iomsg->namelen = namelen;
+ iomsg->controllen = controllen;
+ return 0;
+ }
- return false;
+ return 0;
}
-static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
- struct io_async_msghdr *iomsg)
+static int io_recvmsg_copy_hdr(struct io_kiocb *req,
+ struct io_async_msghdr *iomsg)
{
- struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
struct user_msghdr msg;
int ret;
- if (copy_from_user(&msg, sr->umsg, sizeof(*sr->umsg)))
- return -EFAULT;
-
- ret = __copy_msghdr(&iomsg->msg, &msg, &iomsg->uaddr);
- if (ret)
- return ret;
-
- if (req->flags & REQ_F_BUFFER_SELECT) {
- if (msg.msg_iovlen == 0) {
- sr->len = iomsg->fast_iov[0].iov_len = 0;
- iomsg->fast_iov[0].iov_base = NULL;
- iomsg->free_iov = NULL;
- } else if (msg.msg_iovlen > 1) {
- return -EINVAL;
- } else {
- if (copy_from_user(iomsg->fast_iov, msg.msg_iov, sizeof(*msg.msg_iov)))
- return -EFAULT;
- sr->len = iomsg->fast_iov[0].iov_len;
- iomsg->free_iov = NULL;
- }
-
- if (req->flags & REQ_F_APOLL_MULTISHOT) {
- iomsg->namelen = msg.msg_namelen;
- iomsg->controllen = msg.msg_controllen;
- if (io_recvmsg_multishot_overflow(iomsg))
- return -EOVERFLOW;
- }
- } else {
- iomsg->free_iov = iomsg->fast_iov;
- ret = __import_iovec(ITER_DEST, msg.msg_iov, msg.msg_iovlen, UIO_FASTIOV,
- &iomsg->free_iov, &iomsg->msg.msg_iter,
- false);
- if (ret > 0)
- ret = 0;
- }
-
- return ret;
-}
+ iomsg->msg.msg_name = &iomsg->addr;
+ iomsg->msg.msg_iter.nr_segs = 0;
#ifdef CONFIG_COMPAT
-static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
- struct io_async_msghdr *iomsg)
-{
- struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
- struct compat_msghdr msg;
- struct compat_iovec __user *uiov;
- int ret;
+ if (unlikely(req->ctx->compat)) {
+ struct compat_msghdr cmsg;
- if (copy_from_user(&msg, sr->umsg_compat, sizeof(msg)))
- return -EFAULT;
-
- ret = __get_compat_msghdr(&iomsg->msg, &msg, &iomsg->uaddr);
- if (ret)
- return ret;
-
- uiov = compat_ptr(msg.msg_iov);
- if (req->flags & REQ_F_BUFFER_SELECT) {
- compat_ssize_t clen;
-
- iomsg->free_iov = NULL;
- if (msg.msg_iovlen == 0) {
- sr->len = 0;
- } else if (msg.msg_iovlen > 1) {
- return -EINVAL;
- } else {
- if (!access_ok(uiov, sizeof(*uiov)))
- return -EFAULT;
- if (__get_user(clen, &uiov->iov_len))
- return -EFAULT;
- if (clen < 0)
- return -EINVAL;
- sr->len = clen;
- }
+ ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_DEST);
+ if (unlikely(ret))
+ return ret;
- if (req->flags & REQ_F_APOLL_MULTISHOT) {
- iomsg->namelen = msg.msg_namelen;
- iomsg->controllen = msg.msg_controllen;
- if (io_recvmsg_multishot_overflow(iomsg))
- return -EOVERFLOW;
- }
- } else {
- iomsg->free_iov = iomsg->fast_iov;
- ret = __import_iovec(ITER_DEST, (struct iovec __user *)uiov, msg.msg_iovlen,
- UIO_FASTIOV, &iomsg->free_iov,
- &iomsg->msg.msg_iter, true);
- if (ret < 0)
+ ret = __get_compat_msghdr(&iomsg->msg, &cmsg, &iomsg->uaddr);
+ if (unlikely(ret))
return ret;
- }
- return 0;
-}
+ return io_recvmsg_mshot_prep(req, iomsg, cmsg.msg_namelen,
+ cmsg.msg_controllen);
+ }
#endif
-static int io_recvmsg_copy_hdr(struct io_kiocb *req,
- struct io_async_msghdr *iomsg)
-{
- iomsg->msg.msg_name = &iomsg->addr;
- iomsg->msg.msg_iter.nr_segs = 0;
+ ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST);
+ if (unlikely(ret))
+ return ret;
-#ifdef CONFIG_COMPAT
- if (req->ctx->compat)
- return __io_compat_recvmsg_copy_hdr(req, iomsg);
-#endif
+ ret = __copy_msghdr(&iomsg->msg, &msg, &iomsg->uaddr);
+ if (unlikely(ret))
+ return ret;
- return __io_recvmsg_copy_hdr(req, iomsg);
+ return io_recvmsg_mshot_prep(req, iomsg, msg.msg_namelen,
+ msg.msg_controllen);
}
int io_recvmsg_prep_async(struct io_kiocb *req)
{
+ struct io_async_msghdr *iomsg;
int ret;
if (!io_msg_alloc_async_prep(req))
return -ENOMEM;
- ret = io_recvmsg_copy_hdr(req, req->async_data);
+ iomsg = req->async_data;
+ ret = io_recvmsg_copy_hdr(req, iomsg);
if (!ret)
req->flags |= REQ_F_NEED_CLEANUP;
return ret;
@@ -582,6 +626,8 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
+ sr->done_io = 0;
+
if (unlikely(sqe->file_index || sqe->addr2))
return -EINVAL;
@@ -618,7 +664,6 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (req->ctx->compat)
sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
- sr->done_io = 0;
sr->nr_multishot_loops = 0;
return 0;
}
@@ -627,6 +672,7 @@ static inline void io_recv_prep_retry(struct io_kiocb *req)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
+ req->flags &= ~REQ_F_BL_EMPTY;
sr->done_io = 0;
sr->len = 0; /* get from the provided buffer */
req->buf_index = sr->buf_group;
@@ -645,30 +691,22 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
unsigned int cflags;
cflags = io_put_kbuf(req, issue_flags);
- if (msg->msg_inq && msg->msg_inq != -1)
+ if (msg->msg_inq > 0)
cflags |= IORING_CQE_F_SOCK_NONEMPTY;
- if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
- io_req_set_res(req, *ret, cflags);
- *ret = IOU_OK;
- return true;
- }
-
- if (mshot_finished)
- goto finish;
-
/*
* Fill CQE for this receive and see if we should keep trying to
* receive from this socket.
*/
- if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER,
+ if ((req->flags & REQ_F_APOLL_MULTISHOT) && !mshot_finished &&
+ io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER,
*ret, cflags | IORING_CQE_F_MORE)) {
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
int mshot_retry_ret = IOU_ISSUE_SKIP_COMPLETE;
io_recv_prep_retry(req);
/* Known not-empty or unknown state, retry */
- if (cflags & IORING_CQE_F_SOCK_NONEMPTY || msg->msg_inq == -1) {
+ if (cflags & IORING_CQE_F_SOCK_NONEMPTY || msg->msg_inq < 0) {
if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY)
return false;
/* mshot retries exceeded, force a requeue */
@@ -681,8 +719,8 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
*ret = -EAGAIN;
return true;
}
- /* Otherwise stop multishot but use the current result. */
-finish:
+
+ /* Finish the request / stop multishot. */
io_req_set_res(req, *ret, cflags);
if (issue_flags & IO_URING_F_MULTISHOT)
@@ -803,8 +841,9 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
(sr->flags & IORING_RECVSEND_POLL_FIRST))
return io_setup_async_msg(req, kmsg, issue_flags);
- if (!io_check_multishot(req, issue_flags))
- return io_setup_async_msg(req, kmsg, issue_flags);
+ flags = sr->msg_flags;
+ if (force_nonblock)
+ flags |= MSG_DONTWAIT;
retry_multishot:
if (io_do_buffer_select(req)) {
@@ -826,10 +865,6 @@ retry_multishot:
iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_DEST, buf, len);
}
- flags = sr->msg_flags;
- if (force_nonblock)
- flags |= MSG_DONTWAIT;
-
kmsg->msg.msg_get_inq = 1;
kmsg->msg.msg_inq = -1;
if (req->flags & REQ_F_APOLL_MULTISHOT) {
@@ -855,7 +890,7 @@ retry_multishot:
}
if (ret > 0 && io_net_retry(sock, flags)) {
sr->done_io += ret;
- req->flags |= REQ_F_PARTIAL_IO;
+ req->flags |= REQ_F_BL_NO_RECYCLE;
return io_setup_async_msg(req, kmsg, issue_flags);
}
if (ret == -ERESTARTSYS)
@@ -875,13 +910,10 @@ retry_multishot:
if (!io_recv_finish(req, &ret, &kmsg->msg, mshot_finished, issue_flags))
goto retry_multishot;
- if (mshot_finished) {
- /* fast path, check for non-NULL to avoid function call */
- if (kmsg->free_iov)
- kfree(kmsg->free_iov);
- io_netmsg_recycle(req, issue_flags);
- req->flags &= ~REQ_F_NEED_CLEANUP;
- }
+ if (mshot_finished)
+ io_req_msg_cleanup(req, kmsg, issue_flags);
+ else if (ret == -EAGAIN)
+ return io_setup_async_msg(req, kmsg, issue_flags);
return ret;
}
@@ -900,9 +932,6 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags)
(sr->flags & IORING_RECVSEND_POLL_FIRST))
return -EAGAIN;
- if (!io_check_multishot(req, issue_flags))
- return -EAGAIN;
-
sock = sock_from_file(req->file);
if (unlikely(!sock))
return -ENOTSOCK;
@@ -915,6 +944,10 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags)
msg.msg_iocb = NULL;
msg.msg_ubuf = NULL;
+ flags = sr->msg_flags;
+ if (force_nonblock)
+ flags |= MSG_DONTWAIT;
+
retry_multishot:
if (io_do_buffer_select(req)) {
void __user *buf;
@@ -933,9 +966,6 @@ retry_multishot:
msg.msg_inq = -1;
msg.msg_flags = 0;
- flags = sr->msg_flags;
- if (force_nonblock)
- flags |= MSG_DONTWAIT;
if (flags & MSG_WAITALL)
min_ret = iov_iter_count(&msg.msg_iter);
@@ -953,7 +983,7 @@ retry_multishot:
sr->len -= ret;
sr->buf += ret;
sr->done_io += ret;
- req->flags |= REQ_F_PARTIAL_IO;
+ req->flags |= REQ_F_BL_NO_RECYCLE;
return -EAGAIN;
}
if (ret == -ERESTARTSYS)
@@ -1003,6 +1033,8 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *notif;
+ zc->done_io = 0;
+
if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3)))
return -EINVAL;
/* we don't support IOSQE_CQE_SKIP_SUCCESS just yet */
@@ -1055,8 +1087,6 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (zc->msg_flags & MSG_DONTWAIT)
req->flags |= REQ_F_NOWAIT;
- zc->done_io = 0;
-
#ifdef CONFIG_COMPAT
if (req->ctx->compat)
zc->msg_flags |= MSG_CMSG_COMPAT;
@@ -1196,7 +1226,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
zc->len -= ret;
zc->buf += ret;
zc->done_io += ret;
- req->flags |= REQ_F_PARTIAL_IO;
+ req->flags |= REQ_F_BL_NO_RECYCLE;
return io_setup_async_addr(req, &__address, issue_flags);
}
if (ret == -ERESTARTSYS)
@@ -1266,7 +1296,7 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
if (ret > 0 && io_net_retry(sock, flags)) {
sr->done_io += ret;
- req->flags |= REQ_F_PARTIAL_IO;
+ req->flags |= REQ_F_BL_NO_RECYCLE;
return io_setup_async_msg(req, kmsg, issue_flags);
}
if (ret == -ERESTARTSYS)
@@ -1301,7 +1331,7 @@ void io_sendrecv_fail(struct io_kiocb *req)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
- if (req->flags & REQ_F_PARTIAL_IO)
+ if (sr->done_io)
req->cqe.res = sr->done_io;
if ((req->flags & REQ_F_NEED_CLEANUP) &&
@@ -1351,8 +1381,6 @@ int io_accept(struct io_kiocb *req, unsigned int issue_flags)
struct file *file;
int ret, fd;
- if (!io_check_multishot(req, issue_flags))
- return -EAGAIN;
retry:
if (!fixed) {
fd = __get_unused_fd_flags(accept->flags, accept->nofile);
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index b1ee3a9c3807..9c080aadc5a6 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -35,6 +35,7 @@
#include "rw.h"
#include "waitid.h"
#include "futex.h"
+#include "truncate.h"
static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags)
{
@@ -474,6 +475,12 @@ const struct io_issue_def io_issue_defs[] = {
.prep = io_install_fixed_fd_prep,
.issue = io_install_fixed_fd,
},
+ [IORING_OP_FTRUNCATE] = {
+ .needs_file = 1,
+ .hash_reg_file = 1,
+ .prep = io_ftruncate_prep,
+ .issue = io_ftruncate,
+ },
};
const struct io_cold_def io_cold_defs[] = {
@@ -712,6 +719,9 @@ const struct io_cold_def io_cold_defs[] = {
[IORING_OP_FIXED_FD_INSTALL] = {
.name = "FIXED_FD_INSTALL",
},
+ [IORING_OP_FTRUNCATE] = {
+ .name = "FTRUNCATE",
+ },
};
const char *io_uring_get_opcode(u8 opcode)
diff --git a/io_uring/poll.c b/io_uring/poll.c
index 7513afc7b702..5f779139cae1 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -15,6 +15,7 @@
#include "io_uring.h"
#include "refs.h"
+#include "napi.h"
#include "opdef.h"
#include "kbuf.h"
#include "poll.h"
@@ -343,8 +344,8 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts)
* Release all references, retry if someone tried to restart
* task_work while we were executing it.
*/
- } while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs) &
- IO_POLL_REF_MASK);
+ v &= IO_POLL_REF_MASK;
+ } while (atomic_sub_return(v, &req->poll_refs) & IO_POLL_REF_MASK);
return IOU_POLL_NO_ACTION;
}
@@ -539,14 +540,6 @@ static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt,
poll->wait.private = (void *) wqe_private;
if (poll->events & EPOLLEXCLUSIVE) {
- /*
- * Exclusive waits may only wake a limited amount of entries
- * rather than all of them, this may interfere with lazy
- * wake if someone does wait(events > 1). Ensure we don't do
- * lazy wake for those, as we need to process each one as they
- * come in.
- */
- req->flags |= REQ_F_POLL_NO_LAZY;
add_wait_queue_exclusive(head, &poll->wait);
} else {
add_wait_queue(head, &poll->wait);
@@ -588,10 +581,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
struct io_poll_table *ipt, __poll_t mask,
unsigned issue_flags)
{
- struct io_ring_ctx *ctx = req->ctx;
-
INIT_HLIST_NODE(&req->hash_node);
- req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
io_init_poll_iocb(poll, mask);
poll->file = req->file;
req->apoll_events = poll->events;
@@ -618,6 +608,17 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
if (issue_flags & IO_URING_F_UNLOCKED)
req->flags &= ~REQ_F_HASH_LOCKED;
+
+ /*
+ * Exclusive waits may only wake a limited amount of entries
+ * rather than all of them, this may interfere with lazy
+ * wake if someone does wait(events > 1). Ensure we don't do
+ * lazy wake for those, as we need to process each one as they
+ * come in.
+ */
+ if (poll->events & EPOLLEXCLUSIVE)
+ req->flags |= REQ_F_POLL_NO_LAZY;
+
mask = vfs_poll(req->file, &ipt->pt) & poll->events;
if (unlikely(ipt->error || !ipt->nr_entries)) {
@@ -652,6 +653,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
__io_poll_execute(req, mask);
return 0;
}
+ io_napi_add(req);
if (ipt->owning) {
/*
@@ -727,7 +729,7 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
if (!def->pollin && !def->pollout)
return IO_APOLL_ABORTED;
- if (!file_can_poll(req->file))
+ if (!io_file_can_poll(req))
return IO_APOLL_ABORTED;
if (!(req->flags & REQ_F_APOLL_MULTISHOT))
mask |= EPOLLONESHOT;
@@ -818,9 +820,8 @@ static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only,
if (poll_only && req->opcode != IORING_OP_POLL_ADD)
continue;
if (cd->flags & IORING_ASYNC_CANCEL_ALL) {
- if (cd->seq == req->work.cancel_seq)
+ if (io_cancel_match_sequence(req, cd->seq))
continue;
- req->work.cancel_seq = cd->seq;
}
*out_bucket = hb;
return req;
diff --git a/io_uring/register.c b/io_uring/register.c
index 5e62c1208996..99c37775f974 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -26,6 +26,7 @@
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
+#include "napi.h"
#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
IORING_REGISTER_LAST + IORING_OP_LAST)
@@ -550,6 +551,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
break;
ret = io_register_pbuf_status(ctx, arg);
break;
+ case IORING_REGISTER_NAPI:
+ ret = -EINVAL;
+ if (!arg || nr_args != 1)
+ break;
+ ret = io_register_napi(ctx, arg);
+ break;
+ case IORING_UNREGISTER_NAPI:
+ ret = -EINVAL;
+ if (nr_args != 1)
+ break;
+ ret = io_unregister_napi(ctx, arg);
+ break;
default:
ret = -EINVAL;
break;
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index c6f199bbee28..e21000238954 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -2,8 +2,6 @@
#ifndef IOU_RSRC_H
#define IOU_RSRC_H
-#include <net/af_unix.h>
-
#include "alloc_cache.h"
#define IO_NODE_ALLOC_CACHE_MAX 32
diff --git a/io_uring/rw.c b/io_uring/rw.c
index d5e79d9bdc71..47e097ab5d7e 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -11,6 +11,7 @@
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring/cmd.h>
+#include <linux/indirect_call_wrapper.h>
#include <uapi/linux/io_uring.h>
@@ -274,7 +275,7 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res)
* current cycle.
*/
io_req_io_end(req);
- req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO;
+ req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE;
return true;
}
req_set_fail(req);
@@ -341,7 +342,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
io_req_end_write(req);
if (unlikely(res != req->cqe.res)) {
if (res == -EAGAIN && io_rw_should_reissue(req)) {
- req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO;
+ req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE;
return;
}
req->cqe.res = res;
@@ -682,7 +683,7 @@ static bool io_rw_should_retry(struct io_kiocb *req)
* just use poll if we can, and don't attempt if the fs doesn't
* support callback based unlocks
*/
- if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
+ if (io_file_can_poll(req) || !(req->file->f_mode & FMODE_BUF_RASYNC))
return false;
wait->wait.func = io_async_buf_func;
@@ -721,7 +722,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
struct file *file = req->file;
int ret;
- if (unlikely(!file || !(file->f_mode & mode)))
+ if (unlikely(!(file->f_mode & mode)))
return -EBADF;
if (!(req->flags & REQ_F_FIXED_FILE))
@@ -831,7 +832,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
* If we can poll, just do that. For a vectored read, we'll
* need to copy state first.
*/
- if (file_can_poll(req->file) && !io_issue_defs[req->opcode].vectored)
+ if (io_file_can_poll(req) && !io_issue_defs[req->opcode].vectored)
return -EAGAIN;
/* IOPOLL retry should happen for io-wq threads */
if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
@@ -930,7 +931,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
/*
* Multishot MUST be used on a pollable file
*/
- if (!file_can_poll(req->file))
+ if (!io_file_can_poll(req))
return -EBADFD;
ret = __io_read(req, issue_flags);
diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index 65b5dbe3c850..363052b4ea76 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -15,9 +15,11 @@
#include <uapi/linux/io_uring.h>
#include "io_uring.h"
+#include "napi.h"
#include "sqpoll.h"
#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8
+#define IORING_TW_CAP_ENTRIES_VALUE 8
enum {
IO_SQ_THREAD_SHOULD_STOP = 0,
@@ -193,6 +195,9 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
ret = io_submit_sqes(ctx, to_submit);
mutex_unlock(&ctx->uring_lock);
+ if (io_napi(ctx))
+ ret += io_napi_sqpoll_busy_poll(ctx);
+
if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))
wake_up(&ctx->sqo_sq_wait);
if (creds)
@@ -219,10 +224,52 @@ static bool io_sqd_handle_event(struct io_sq_data *sqd)
return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
}
+/*
+ * Run task_work, processing the retry_list first. The retry_list holds
+ * entries that we passed on in the previous run, if we had more task_work
+ * than we were asked to process. Newly queued task_work isn't run until the
+ * retry list has been fully processed.
+ */
+static unsigned int io_sq_tw(struct llist_node **retry_list, int max_entries)
+{
+ struct io_uring_task *tctx = current->io_uring;
+ unsigned int count = 0;
+
+ if (*retry_list) {
+ *retry_list = io_handle_tw_list(*retry_list, &count, max_entries);
+ if (count >= max_entries)
+ return count;
+ max_entries -= count;
+ }
+
+ *retry_list = tctx_task_work_run(tctx, max_entries, &count);
+ return count;
+}
+
+static bool io_sq_tw_pending(struct llist_node *retry_list)
+{
+ struct io_uring_task *tctx = current->io_uring;
+
+ return retry_list || !llist_empty(&tctx->task_list);
+}
+
+static void io_sq_update_worktime(struct io_sq_data *sqd, struct rusage *start)
+{
+ struct rusage end;
+
+ getrusage(current, RUSAGE_SELF, &end);
+ end.ru_stime.tv_sec -= start->ru_stime.tv_sec;
+ end.ru_stime.tv_usec -= start->ru_stime.tv_usec;
+
+ sqd->work_time += end.ru_stime.tv_usec + end.ru_stime.tv_sec * 1000000;
+}
+
static int io_sq_thread(void *data)
{
+ struct llist_node *retry_list = NULL;
struct io_sq_data *sqd = data;
struct io_ring_ctx *ctx;
+ struct rusage start;
unsigned long timeout = 0;
char buf[TASK_COMM_LEN];
DEFINE_WAIT(wait);
@@ -251,18 +298,21 @@ static int io_sq_thread(void *data)
}
cap_entries = !list_is_singular(&sqd->ctx_list);
+ getrusage(current, RUSAGE_SELF, &start);
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
int ret = __io_sq_thread(ctx, cap_entries);
if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list)))
sqt_spin = true;
}
- if (io_run_task_work())
+ if (io_sq_tw(&retry_list, IORING_TW_CAP_ENTRIES_VALUE))
sqt_spin = true;
if (sqt_spin || !time_after(jiffies, timeout)) {
- if (sqt_spin)
+ if (sqt_spin) {
+ io_sq_update_worktime(sqd, &start);
timeout = jiffies + sqd->sq_thread_idle;
+ }
if (unlikely(need_resched())) {
mutex_unlock(&sqd->lock);
cond_resched();
@@ -273,7 +323,7 @@ static int io_sq_thread(void *data)
}
prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
- if (!io_sqd_events_pending(sqd) && !task_work_pending(current)) {
+ if (!io_sqd_events_pending(sqd) && !io_sq_tw_pending(retry_list)) {
bool needs_sched = true;
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
@@ -312,6 +362,9 @@ static int io_sq_thread(void *data)
timeout = jiffies + sqd->sq_thread_idle;
}
+ if (retry_list)
+ io_sq_tw(&retry_list, UINT_MAX);
+
io_uring_cancel_generic(true, sqd);
sqd->thread = NULL;
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
diff --git a/io_uring/sqpoll.h b/io_uring/sqpoll.h
index 8df37e8c9149..4171666b1cf4 100644
--- a/io_uring/sqpoll.h
+++ b/io_uring/sqpoll.h
@@ -16,6 +16,7 @@ struct io_sq_data {
pid_t task_pid;
pid_t task_tgid;
+ u64 work_time;
unsigned long state;
struct completion exited;
};
diff --git a/io_uring/truncate.c b/io_uring/truncate.c
new file mode 100644
index 000000000000..62ee73d34d72
--- /dev/null
+++ b/io_uring/truncate.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/syscalls.h>
+#include <linux/io_uring.h>
+
+#include <uapi/linux/io_uring.h>
+
+#include "../fs/internal.h"
+
+#include "io_uring.h"
+#include "truncate.h"
+
+struct io_ftrunc {
+ struct file *file;
+ loff_t len;
+};
+
+int io_ftruncate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ struct io_ftrunc *ft = io_kiocb_to_cmd(req, struct io_ftrunc);
+
+ if (sqe->rw_flags || sqe->addr || sqe->len || sqe->buf_index ||
+ sqe->splice_fd_in || sqe->addr3)
+ return -EINVAL;
+
+ ft->len = READ_ONCE(sqe->off);
+
+ req->flags |= REQ_F_FORCE_ASYNC;
+ return 0;
+}
+
+int io_ftruncate(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_ftrunc *ft = io_kiocb_to_cmd(req, struct io_ftrunc);
+ int ret;
+
+ WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
+
+ ret = do_ftruncate(req->file, ft->len, 1);
+
+ io_req_set_res(req, ret, 0);
+ return IOU_OK;
+}
diff --git a/io_uring/truncate.h b/io_uring/truncate.h
new file mode 100644
index 000000000000..ec088293a478
--- /dev/null
+++ b/io_uring/truncate.h
@@ -0,0 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
+
+int io_ftruncate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_ftruncate(struct io_kiocb *req, unsigned int issue_flags);
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index c33fca585dde..42f63adfa54a 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -5,6 +5,7 @@
#include <linux/io_uring/cmd.h>
#include <linux/security.h>
#include <linux/nospec.h>
+#include <net/sock.h>
#include <uapi/linux/io_uring.h>
#include <asm/ioctls.h>
diff --git a/io_uring/xattr.c b/io_uring/xattr.c
index e1c810e0b85a..44905b82eea8 100644
--- a/io_uring/xattr.c
+++ b/io_uring/xattr.c
@@ -112,7 +112,7 @@ int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags)
WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
- ret = do_getxattr(mnt_idmap(req->file->f_path.mnt),
+ ret = do_getxattr(file_mnt_idmap(req->file),
req->file->f_path.dentry,
&ix->ctx);