author    Linus Torvalds <torvalds@linux-foundation.org>  2024-03-11 11:35:31 -0700
committer Linus Torvalds <torvalds@linux-foundation.org>  2024-03-11 11:35:31 -0700
commit    d2c84bdce25a678c1e1f116d65b58790bd241af0 (patch)
tree      45499e5ded0bec5bc0ac7e305ee19198374a02c4 /include
parent    0f1a876682f0979d6a1e5f86861dd562d1758936 (diff)
parent    606559dc4fa36a954a51fbf1c6c0cc320f551fe0 (diff)
Merge tag 'for-6.9/io_uring-20240310' of git://git.kernel.dk/linux
Pull io_uring updates from Jens Axboe:

 - Make running of task_work internal loops more fair, and unify how
   the different methods deal with them (me)

 - Support for per-ring NAPI. The two minor networking patches are in
   a shared branch with netdev (Stefan)

 - Add support for truncate (Tony)

 - Export SQPOLL utilization stats (Xiaobing)

 - Multishot fixes (Pavel)

 - Fix for a race in manipulating the request flags via poll (Pavel)

 - Cleanup the multishot checking by making it generic, moving it out
   of opcode handlers (Pavel)

 - Various tweaks and cleanups (me, Kunwu, Alexander)

* tag 'for-6.9/io_uring-20240310' of git://git.kernel.dk/linux: (53 commits)
  io_uring: Fix sqpoll utilization check racing with dying sqpoll
  io_uring/net: dedup io_recv_finish req completion
  io_uring: refactor DEFER_TASKRUN multishot checks
  io_uring: fix mshot io-wq checks
  io_uring/net: add io_req_msg_cleanup() helper
  io_uring/net: simplify msghd->msg_inq checking
  io_uring/kbuf: rename REQ_F_PARTIAL_IO to REQ_F_BL_NO_RECYCLE
  io_uring/net: remove dependency on REQ_F_PARTIAL_IO for sr->done_io
  io_uring/net: correctly handle multishot recvmsg retry setup
  io_uring/net: clear REQ_F_BL_EMPTY in the multishot retry handler
  io_uring: fix io_queue_proc modifying req->flags
  io_uring: fix mshot read defer taskrun cqe posting
  io_uring/net: fix overflow check in io_recvmsg_mshot_prep()
  io_uring/net: correct the type of variable
  io_uring/sqpoll: statistics of the true utilization of sq threads
  io_uring/net: move recv/recvmsg flags out of retry loop
  io_uring/kbuf: flag request if buffer pool is empty after buffer pick
  io_uring/net: improve the usercopy for sendmsg/recvmsg
  io_uring/net: move receive multishot out of the generic msghdr path
  io_uring/net: unify how recvmsg and sendmsg copy in the msghdr
  ...
Diffstat (limited to 'include')
-rw-r--r--  include/linux/io_uring_types.h   | 137
-rw-r--r--  include/net/busy_poll.h          |   4
-rw-r--r--  include/trace/events/io_uring.h  |  30
-rw-r--r--  include/uapi/linux/io_uring.h    |  13
4 files changed, 114 insertions(+), 70 deletions(-)
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 854ad67a5f70..e24893625085 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -2,6 +2,7 @@
#define IO_URING_TYPES_H
#include <linux/blkdev.h>
+#include <linux/hashtable.h>
#include <linux/task_work.h>
#include <linux/bitmap.h>
#include <linux/llist.h>
@@ -240,12 +241,14 @@ struct io_ring_ctx {
unsigned int poll_activated: 1;
unsigned int drain_disabled: 1;
unsigned int compat: 1;
+ unsigned int iowq_limits_set : 1;
struct task_struct *submitter_task;
struct io_rings *rings;
struct percpu_ref refs;
enum task_work_notify_mode notify_method;
+ unsigned sq_thread_idle;
} ____cacheline_aligned_in_smp;
/* submission data */
@@ -274,10 +277,20 @@ struct io_ring_ctx {
*/
struct io_rsrc_node *rsrc_node;
atomic_t cancel_seq;
+
+ /*
+ * ->iopoll_list is protected by the ctx->uring_lock for
+ * io_uring instances that don't use IORING_SETUP_SQPOLL.
+ * For SQPOLL, only the single threaded io_sq_thread() will
+ * manipulate the list, hence no extra locking is needed there.
+ */
+ bool poll_multi_queue;
+ struct io_wq_work_list iopoll_list;
+
struct io_file_table file_table;
+ struct io_mapped_ubuf **user_bufs;
unsigned nr_user_files;
unsigned nr_user_bufs;
- struct io_mapped_ubuf **user_bufs;
struct io_submit_state submit_state;
@@ -289,15 +302,6 @@ struct io_ring_ctx {
struct io_alloc_cache netmsg_cache;
/*
- * ->iopoll_list is protected by the ctx->uring_lock for
- * io_uring instances that don't use IORING_SETUP_SQPOLL.
- * For SQPOLL, only the single threaded io_sq_thread() will
- * manipulate the list, hence no extra locking is needed there.
- */
- struct io_wq_work_list iopoll_list;
- bool poll_multi_queue;
-
- /*
* Any cancelable uring_cmd is added to this list in
* ->uring_cmd() by io_uring_cmd_insert_cancelable()
*/
@@ -343,8 +347,8 @@ struct io_ring_ctx {
spinlock_t completion_lock;
/* IRQ completion list, under ->completion_lock */
- struct io_wq_work_list locked_free_list;
unsigned int locked_free_nr;
+ struct io_wq_work_list locked_free_list;
struct list_head io_buffers_comp;
struct list_head cq_overflow_list;
@@ -366,9 +370,6 @@ struct io_ring_ctx {
unsigned int file_alloc_start;
unsigned int file_alloc_end;
- struct xarray personalities;
- u32 pers_next;
-
struct list_head io_buffers_cache;
/* deferred free list, protected by ->uring_lock */
@@ -389,6 +390,9 @@ struct io_ring_ctx {
struct wait_queue_head rsrc_quiesce_wq;
unsigned rsrc_quiesce;
+ u32 pers_next;
+ struct xarray personalities;
+
/* hashed buffered write serialization */
struct io_wq_hash *hash_map;
@@ -405,11 +409,22 @@ struct io_ring_ctx {
/* io-wq management, e.g. thread count */
u32 iowq_limits[2];
- bool iowq_limits_set;
struct callback_head poll_wq_task_work;
struct list_head defer_list;
- unsigned sq_thread_idle;
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ struct list_head napi_list; /* track busy poll napi_id */
+ spinlock_t napi_lock; /* napi_list lock */
+
+ /* napi busy poll default timeout */
+ unsigned int napi_busy_poll_to;
+ bool napi_prefer_busy_poll;
+ bool napi_enabled;
+
+ DECLARE_HASHTABLE(napi_ht, 4);
+#endif
+
/* protected by ->completion_lock */
unsigned evfd_last_cq_tail;
@@ -455,7 +470,6 @@ enum {
REQ_F_SKIP_LINK_CQES_BIT,
REQ_F_SINGLE_POLL_BIT,
REQ_F_DOUBLE_POLL_BIT,
- REQ_F_PARTIAL_IO_BIT,
REQ_F_APOLL_MULTISHOT_BIT,
REQ_F_CLEAR_POLLIN_BIT,
REQ_F_HASH_LOCKED_BIT,
@@ -463,75 +477,88 @@ enum {
REQ_F_SUPPORT_NOWAIT_BIT,
REQ_F_ISREG_BIT,
REQ_F_POLL_NO_LAZY_BIT,
+ REQ_F_CANCEL_SEQ_BIT,
+ REQ_F_CAN_POLL_BIT,
+ REQ_F_BL_EMPTY_BIT,
+ REQ_F_BL_NO_RECYCLE_BIT,
/* not a real bit, just to check we're not overflowing the space */
__REQ_F_LAST_BIT,
};
+typedef u64 __bitwise io_req_flags_t;
+#define IO_REQ_FLAG(bitno) ((__force io_req_flags_t) BIT_ULL((bitno)))
+
enum {
/* ctx owns file */
- REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT),
+ REQ_F_FIXED_FILE = IO_REQ_FLAG(REQ_F_FIXED_FILE_BIT),
/* drain existing IO first */
- REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT),
+ REQ_F_IO_DRAIN = IO_REQ_FLAG(REQ_F_IO_DRAIN_BIT),
/* linked sqes */
- REQ_F_LINK = BIT(REQ_F_LINK_BIT),
+ REQ_F_LINK = IO_REQ_FLAG(REQ_F_LINK_BIT),
/* doesn't sever on completion < 0 */
- REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT),
+ REQ_F_HARDLINK = IO_REQ_FLAG(REQ_F_HARDLINK_BIT),
/* IOSQE_ASYNC */
- REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT),
+ REQ_F_FORCE_ASYNC = IO_REQ_FLAG(REQ_F_FORCE_ASYNC_BIT),
/* IOSQE_BUFFER_SELECT */
- REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),
+ REQ_F_BUFFER_SELECT = IO_REQ_FLAG(REQ_F_BUFFER_SELECT_BIT),
/* IOSQE_CQE_SKIP_SUCCESS */
- REQ_F_CQE_SKIP = BIT(REQ_F_CQE_SKIP_BIT),
+ REQ_F_CQE_SKIP = IO_REQ_FLAG(REQ_F_CQE_SKIP_BIT),
/* fail rest of links */
- REQ_F_FAIL = BIT(REQ_F_FAIL_BIT),
+ REQ_F_FAIL = IO_REQ_FLAG(REQ_F_FAIL_BIT),
/* on inflight list, should be cancelled and waited on exit reliably */
- REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT),
+ REQ_F_INFLIGHT = IO_REQ_FLAG(REQ_F_INFLIGHT_BIT),
/* read/write uses file position */
- REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),
+ REQ_F_CUR_POS = IO_REQ_FLAG(REQ_F_CUR_POS_BIT),
/* must not punt to workers */
- REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
+ REQ_F_NOWAIT = IO_REQ_FLAG(REQ_F_NOWAIT_BIT),
/* has or had linked timeout */
- REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
+ REQ_F_LINK_TIMEOUT = IO_REQ_FLAG(REQ_F_LINK_TIMEOUT_BIT),
/* needs cleanup */
- REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT),
+ REQ_F_NEED_CLEANUP = IO_REQ_FLAG(REQ_F_NEED_CLEANUP_BIT),
/* already went through poll handler */
- REQ_F_POLLED = BIT(REQ_F_POLLED_BIT),
+ REQ_F_POLLED = IO_REQ_FLAG(REQ_F_POLLED_BIT),
/* buffer already selected */
- REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
+ REQ_F_BUFFER_SELECTED = IO_REQ_FLAG(REQ_F_BUFFER_SELECTED_BIT),
/* buffer selected from ring, needs commit */
- REQ_F_BUFFER_RING = BIT(REQ_F_BUFFER_RING_BIT),
+ REQ_F_BUFFER_RING = IO_REQ_FLAG(REQ_F_BUFFER_RING_BIT),
/* caller should reissue async */
- REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT),
+ REQ_F_REISSUE = IO_REQ_FLAG(REQ_F_REISSUE_BIT),
/* supports async reads/writes */
- REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT),
+ REQ_F_SUPPORT_NOWAIT = IO_REQ_FLAG(REQ_F_SUPPORT_NOWAIT_BIT),
/* regular file */
- REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
+ REQ_F_ISREG = IO_REQ_FLAG(REQ_F_ISREG_BIT),
/* has creds assigned */
- REQ_F_CREDS = BIT(REQ_F_CREDS_BIT),
+ REQ_F_CREDS = IO_REQ_FLAG(REQ_F_CREDS_BIT),
/* skip refcounting if not set */
- REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT),
+ REQ_F_REFCOUNT = IO_REQ_FLAG(REQ_F_REFCOUNT_BIT),
/* there is a linked timeout that has to be armed */
- REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT),
+ REQ_F_ARM_LTIMEOUT = IO_REQ_FLAG(REQ_F_ARM_LTIMEOUT_BIT),
/* ->async_data allocated */
- REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT),
+ REQ_F_ASYNC_DATA = IO_REQ_FLAG(REQ_F_ASYNC_DATA_BIT),
/* don't post CQEs while failing linked requests */
- REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT),
+ REQ_F_SKIP_LINK_CQES = IO_REQ_FLAG(REQ_F_SKIP_LINK_CQES_BIT),
/* single poll may be active */
- REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT),
+ REQ_F_SINGLE_POLL = IO_REQ_FLAG(REQ_F_SINGLE_POLL_BIT),
/* double poll may active */
- REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT),
- /* request has already done partial IO */
- REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT),
+ REQ_F_DOUBLE_POLL = IO_REQ_FLAG(REQ_F_DOUBLE_POLL_BIT),
/* fast poll multishot mode */
- REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT),
+ REQ_F_APOLL_MULTISHOT = IO_REQ_FLAG(REQ_F_APOLL_MULTISHOT_BIT),
/* recvmsg special flag, clear EPOLLIN */
- REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT),
+ REQ_F_CLEAR_POLLIN = IO_REQ_FLAG(REQ_F_CLEAR_POLLIN_BIT),
/* hashed into ->cancel_hash_locked, protected by ->uring_lock */
- REQ_F_HASH_LOCKED = BIT(REQ_F_HASH_LOCKED_BIT),
+ REQ_F_HASH_LOCKED = IO_REQ_FLAG(REQ_F_HASH_LOCKED_BIT),
/* don't use lazy poll wake for this request */
- REQ_F_POLL_NO_LAZY = BIT(REQ_F_POLL_NO_LAZY_BIT),
+ REQ_F_POLL_NO_LAZY = IO_REQ_FLAG(REQ_F_POLL_NO_LAZY_BIT),
+ /* cancel sequence is set and valid */
+ REQ_F_CANCEL_SEQ = IO_REQ_FLAG(REQ_F_CANCEL_SEQ_BIT),
+ /* file is pollable */
+ REQ_F_CAN_POLL = IO_REQ_FLAG(REQ_F_CAN_POLL_BIT),
+ /* buffer list was empty after selection of buffer */
+ REQ_F_BL_EMPTY = IO_REQ_FLAG(REQ_F_BL_EMPTY_BIT),
+ /* don't recycle provided buffers for this request */
+ REQ_F_BL_NO_RECYCLE = IO_REQ_FLAG(REQ_F_BL_NO_RECYCLE_BIT),
};
typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts);
@@ -592,15 +619,17 @@ struct io_kiocb {
* and after selection it points to the buffer ID itself.
*/
u16 buf_index;
- unsigned int flags;
+
+ unsigned nr_tw;
+
+ /* REQ_F_* flags */
+ io_req_flags_t flags;
struct io_cqe cqe;
struct io_ring_ctx *ctx;
struct task_struct *task;
- struct io_rsrc_node *rsrc_node;
-
union {
/* store used ubuf, so we can prevent reloading */
struct io_mapped_ubuf *imu;
@@ -621,10 +650,12 @@ struct io_kiocb {
/* cache ->apoll->events */
__poll_t apoll_events;
};
+
+ struct io_rsrc_node *rsrc_node;
+
atomic_t refs;
atomic_t poll_refs;
struct io_task_work io_task_work;
- unsigned nr_tw;
/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
struct hlist_node hash_node;
/* internal polling, see IORING_FEAT_FAST_POLL */
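The most invasive change in this file is the switch of req->flags from a plain unsigned int to the 64-bit __bitwise io_req_flags_t: with the four new REQ_F_* bits the flag space grows past what a 32-bit BIT() shift can represent, hence the move to BIT_ULL() via the IO_REQ_FLAG() wrapper. A minimal userspace sketch of the same pattern; the DEMO_* names are illustrative, not kernel identifiers, and the __bitwise annotation is elided since it only matters under sparse:

```c
#include <stdint.h>
#include <stdio.h>

#define BIT_ULL(n)	(1ULL << (n))

/* stand-in for the kernel's io_req_flags_t / IO_REQ_FLAG() */
typedef uint64_t io_req_flags_demo_t;
#define DEMO_REQ_FLAG(bitno)	((io_req_flags_demo_t)BIT_ULL(bitno))

enum {
	DEMO_F_CAN_POLL_BIT = 33,	/* a bit index past what BIT() can hold */
};

int main(void)
{
	io_req_flags_demo_t flags = DEMO_REQ_FLAG(DEMO_F_CAN_POLL_BIT);

	/* BIT() expands to 1UL << n, which overflows where long is 32 bits;
	 * BIT_ULL() keeps the shift in 64 bits regardless of the arch. */
	printf("bit 33 -> 0x%llx\n", (unsigned long long)flags);
	return 0;
}
```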
diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
index 4dabeb6c76d3..9b09acac538e 100644
--- a/include/net/busy_poll.h
+++ b/include/net/busy_poll.h
@@ -48,6 +48,10 @@ void napi_busy_loop(unsigned int napi_id,
bool (*loop_end)(void *, unsigned long),
void *loop_end_arg, bool prefer_busy_poll, u16 budget);
+void napi_busy_loop_rcu(unsigned int napi_id,
+ bool (*loop_end)(void *, unsigned long),
+ void *loop_end_arg, bool prefer_busy_poll, u16 budget);
+
#else /* CONFIG_NET_RX_BUSY_POLL */
static inline unsigned long net_busy_loop_on(void)
{
diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index 69454f1f98b0..e948df7ce625 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -148,7 +148,7 @@ TRACE_EVENT(io_uring_queue_async_work,
__field( void *, req )
__field( u64, user_data )
__field( u8, opcode )
- __field( unsigned int, flags )
+ __field( unsigned long long, flags )
__field( struct io_wq_work *, work )
__field( int, rw )
@@ -159,7 +159,7 @@ TRACE_EVENT(io_uring_queue_async_work,
__entry->ctx = req->ctx;
__entry->req = req;
__entry->user_data = req->cqe.user_data;
- __entry->flags = req->flags;
+ __entry->flags = (__force unsigned long long) req->flags;
__entry->opcode = req->opcode;
__entry->work = &req->work;
__entry->rw = rw;
@@ -167,10 +167,10 @@ TRACE_EVENT(io_uring_queue_async_work,
__assign_str(op_str, io_uring_get_opcode(req->opcode));
),
- TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%x, %s queue, work %p",
+ TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%llx, %s queue, work %p",
__entry->ctx, __entry->req, __entry->user_data,
- __get_str(op_str),
- __entry->flags, __entry->rw ? "hashed" : "normal", __entry->work)
+ __get_str(op_str), __entry->flags,
+ __entry->rw ? "hashed" : "normal", __entry->work)
);
/**
@@ -378,7 +378,7 @@ TRACE_EVENT(io_uring_submit_req,
__field( void *, req )
__field( unsigned long long, user_data )
__field( u8, opcode )
- __field( u32, flags )
+ __field( unsigned long long, flags )
__field( bool, sq_thread )
__string( op_str, io_uring_get_opcode(req->opcode) )
@@ -389,16 +389,16 @@ TRACE_EVENT(io_uring_submit_req,
__entry->req = req;
__entry->user_data = req->cqe.user_data;
__entry->opcode = req->opcode;
- __entry->flags = req->flags;
+ __entry->flags = (__force unsigned long long) req->flags;
__entry->sq_thread = req->ctx->flags & IORING_SETUP_SQPOLL;
__assign_str(op_str, io_uring_get_opcode(req->opcode));
),
- TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%x, "
+ TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%llx, "
"sq_thread %d", __entry->ctx, __entry->req,
- __entry->user_data, __get_str(op_str),
- __entry->flags, __entry->sq_thread)
+ __entry->user_data, __get_str(op_str), __entry->flags,
+ __entry->sq_thread)
);
/*
@@ -602,29 +602,25 @@ TRACE_EVENT(io_uring_cqe_overflow,
*
* @tctx: pointer to a io_uring_task
* @count: how many functions it ran
- * @loops: how many loops it ran
*
*/
TRACE_EVENT(io_uring_task_work_run,
- TP_PROTO(void *tctx, unsigned int count, unsigned int loops),
+ TP_PROTO(void *tctx, unsigned int count),
- TP_ARGS(tctx, count, loops),
+ TP_ARGS(tctx, count),
TP_STRUCT__entry (
__field( void *, tctx )
__field( unsigned int, count )
- __field( unsigned int, loops )
),
TP_fast_assign(
__entry->tctx = tctx;
__entry->count = count;
- __entry->loops = loops;
),
- TP_printk("tctx %p, count %u, loops %u",
- __entry->tctx, __entry->count, __entry->loops)
+ TP_printk("tctx %p, count %u", __entry->tctx, __entry->count)
);
TRACE_EVENT(io_uring_short_write,
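The `(__force unsigned long long)` casts in the trace hunks above follow from the __bitwise typedef: sparse forbids mixing a __bitwise value with plain integers unless the conversion is explicitly forced. A self-contained sketch of that annotation convention; the macro definitions mirror the kernel's compiler.h, while demo_flags_t is an illustrative stand-in:

```c
/*
 * Under sparse (which defines __CHECKER__) the attributes are real and
 * mixing a __bitwise value with plain integers gets flagged; under a
 * normal compiler they expand to nothing.
 */
#ifdef __CHECKER__
#define __bitwise	__attribute__((bitwise))
#define __force		__attribute__((force))
#else
#define __bitwise
#define __force
#endif

#include <stdint.h>
#include <stdio.h>

typedef uint64_t __bitwise demo_flags_t;

int main(void)
{
	demo_flags_t f = (__force demo_flags_t)(1ULL << 40);

	/* printing (or tracing) needs an explicit __force cast back */
	printf("flags 0x%llx\n", (__force unsigned long long)f);
	return 0;
}
```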
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 7a673b52827b..7bd10201a02b 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -255,6 +255,7 @@ enum io_uring_op {
IORING_OP_FUTEX_WAKE,
IORING_OP_FUTEX_WAITV,
IORING_OP_FIXED_FD_INSTALL,
+ IORING_OP_FTRUNCATE,
/* this goes last, obviously */
IORING_OP_LAST,
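IORING_OP_FTRUNCATE is the truncate support called out in the merge message. A hedged usage sketch, assuming a 6.9+ kernel and a liburing recent enough to ship the io_uring_prep_ftruncate() helper; error handling is abbreviated:

```c
/* Sketch: truncate a file to 4 KiB through io_uring. */
#include <fcntl.h>
#include <liburing.h>
#include <stdio.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int fd = open("demo.txt", O_WRONLY | O_CREAT, 0644);

	io_uring_queue_init(8, &ring, 0);

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_ftruncate(sqe, fd, 4096);	/* fd, new length */

	io_uring_submit(&ring);
	io_uring_wait_cqe(&ring, &cqe);
	printf("ftruncate res = %d\n", cqe->res);	/* 0 on success */
	io_uring_cqe_seen(&ring, cqe);

	io_uring_queue_exit(&ring);
	return 0;
}
```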
@@ -570,6 +571,10 @@ enum {
/* return status information for a buffer group */
IORING_REGISTER_PBUF_STATUS = 26,
+ /* set/clear busy poll settings */
+ IORING_REGISTER_NAPI = 27,
+ IORING_UNREGISTER_NAPI = 28,
+
/* this goes last */
IORING_REGISTER_LAST,
@@ -703,6 +708,14 @@ struct io_uring_buf_status {
__u32 resv[8];
};
+/* argument for IORING_(UN)REGISTER_NAPI */
+struct io_uring_napi {
+ __u32 busy_poll_to;
+ __u8 prefer_busy_poll;
+ __u8 pad[3];
+ __u64 resv;
+};
+
/*
* io_uring_restriction->opcode values
*/
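The struct io_uring_napi argument above pairs with the new IORING_REGISTER_NAPI/IORING_UNREGISTER_NAPI opcodes to opt a ring into NAPI busy polling. A sketch of enabling it, assuming liburing's io_uring_register_napi()/io_uring_unregister_napi() wrappers (added alongside this kernel series); busy_poll_to is the busy-poll timeout, presumably in microseconds to match the net.core.busy_poll convention:

```c
#include <liburing.h>
#include <stdio.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_napi napi = {
		.busy_poll_to = 100,		/* busy-poll timeout */
		.prefer_busy_poll = 1,		/* prefer polling over IRQs */
	};
	int ret;

	io_uring_queue_init(8, &ring, 0);

	ret = io_uring_register_napi(&ring, &napi);
	printf("register napi: %d\n", ret);	/* 0 on success */

	/* ... submit network I/O; waits may now busy-poll NAPI ... */

	io_uring_unregister_napi(&ring, &napi);
	io_uring_queue_exit(&ring);
	return 0;
}
```

NAPI instances are tracked per ring (the napi_list/napi_ht members added to io_ring_ctx above), so the setting affects only this ring rather than the whole process.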