diff options
Diffstat (limited to 'io_uring')
56 files changed, 6086 insertions, 3989 deletions
diff --git a/io_uring/Makefile b/io_uring/Makefile index 2cdc51825405..d695b60dba4f 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -2,12 +2,18 @@ # # Makefile for io_uring -obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ - sync.o advise.o filetable.o \ - openclose.o uring_cmd.o epoll.o \ - statx.o net.o msg_ring.o timeout.o \ - sqpoll.o fdinfo.o tctx.o poll.o \ - cancel.o kbuf.o rsrc.o rw.o opdef.o \ - notif.o waitid.o register.o +ifdef CONFIG_GCOV_PROFILE_URING +GCOV_PROFILE := y +endif + +obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ + tctx.o filetable.o rw.o net.o poll.o \ + eventfd.o uring_cmd.o openclose.o \ + sqpoll.o xattr.o nop.o fs.o splice.o \ + sync.o msg_ring.o advise.o openclose.o \ + epoll.o statx.o timeout.o fdinfo.o \ + cancel.o waitid.o register.o \ + truncate.o memmap.o alloc_cache.o obj-$(CONFIG_IO_WQ) += io-wq.o obj-$(CONFIG_FUTEX) += futex.o +obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o diff --git a/io_uring/advise.c b/io_uring/advise.c index 7085804c513c..cb7b881665e5 100644 --- a/io_uring/advise.c +++ b/io_uring/advise.c @@ -17,14 +17,14 @@ struct io_fadvise { struct file *file; u64 offset; - u32 len; + u64 len; u32 advice; }; struct io_madvise { struct file *file; u64 addr; - u32 len; + u64 len; u32 advice; }; @@ -33,11 +33,13 @@ int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) struct io_madvise *ma = io_kiocb_to_cmd(req, struct io_madvise); - if (sqe->buf_index || sqe->off || sqe->splice_fd_in) + if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; ma->addr = READ_ONCE(sqe->addr); - ma->len = READ_ONCE(sqe->len); + ma->len = READ_ONCE(sqe->off); + if (!ma->len) + ma->len = READ_ONCE(sqe->len); ma->advice = READ_ONCE(sqe->fadvise_advice); req->flags |= REQ_F_FORCE_ASYNC; return 0; @@ -78,11 +80,13 @@ int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_fadvise *fa = 
io_kiocb_to_cmd(req, struct io_fadvise); - if (sqe->buf_index || sqe->addr || sqe->splice_fd_in) + if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; fa->offset = READ_ONCE(sqe->off); - fa->len = READ_ONCE(sqe->len); + fa->len = READ_ONCE(sqe->addr); + if (!fa->len) + fa->len = READ_ONCE(sqe->len); fa->advice = READ_ONCE(sqe->fadvise_advice); if (io_fadvise_force_async(fa)) req->flags |= REQ_F_FORCE_ASYNC; diff --git a/io_uring/alloc_cache.c b/io_uring/alloc_cache.c new file mode 100644 index 000000000000..58423888b736 --- /dev/null +++ b/io_uring/alloc_cache.c @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "alloc_cache.h" + +void io_alloc_cache_free(struct io_alloc_cache *cache, + void (*free)(const void *)) +{ + void *entry; + + if (!cache->entries) + return; + + while ((entry = io_alloc_cache_get(cache)) != NULL) + free(entry); + + kvfree(cache->entries); + cache->entries = NULL; +} + +/* returns false if the cache was initialized properly */ +bool io_alloc_cache_init(struct io_alloc_cache *cache, + unsigned max_nr, unsigned int size, + unsigned int init_bytes) +{ + cache->entries = kvmalloc_array(max_nr, sizeof(void *), GFP_KERNEL); + if (!cache->entries) + return true; + + cache->nr_cached = 0; + cache->max_cached = max_nr; + cache->elem_size = size; + cache->init_clear = init_bytes; + return false; +} + +void *io_cache_alloc_new(struct io_alloc_cache *cache, gfp_t gfp) +{ + void *obj; + + obj = kmalloc(cache->elem_size, gfp); + if (obj && cache->init_clear) + memset(obj, 0, cache->init_clear); + return obj; +} diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h index bf2fb26a6539..0dd17d8ba93a 100644 --- a/io_uring/alloc_cache.h +++ b/io_uring/alloc_cache.h @@ -1,66 +1,71 @@ #ifndef IOU_ALLOC_CACHE_H #define IOU_ALLOC_CACHE_H +#include <linux/io_uring_types.h> + /* * Don't allow the cache to grow beyond this size. 
*/ -#define IO_ALLOC_CACHE_MAX 512 +#define IO_ALLOC_CACHE_MAX 128 + +void io_alloc_cache_free(struct io_alloc_cache *cache, + void (*free)(const void *)); +bool io_alloc_cache_init(struct io_alloc_cache *cache, + unsigned max_nr, unsigned int size, + unsigned int init_bytes); -struct io_cache_entry { - struct io_wq_work_node node; -}; +void *io_cache_alloc_new(struct io_alloc_cache *cache, gfp_t gfp); + +static inline void io_alloc_cache_kasan(struct iovec **iov, int *nr) +{ + if (IS_ENABLED(CONFIG_KASAN)) { + kfree(*iov); + *iov = NULL; + *nr = 0; + } +} static inline bool io_alloc_cache_put(struct io_alloc_cache *cache, - struct io_cache_entry *entry) + void *entry) { if (cache->nr_cached < cache->max_cached) { - cache->nr_cached++; - wq_stack_add_head(&entry->node, &cache->list); - kasan_mempool_poison_object(entry); + if (!kasan_mempool_poison_object(entry)) + return false; + cache->entries[cache->nr_cached++] = entry; return true; } return false; } -static inline bool io_alloc_cache_empty(struct io_alloc_cache *cache) +static inline void *io_alloc_cache_get(struct io_alloc_cache *cache) { - return !cache->list.next; -} + if (cache->nr_cached) { + void *entry = cache->entries[--cache->nr_cached]; -static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *cache) -{ - if (cache->list.next) { - struct io_cache_entry *entry; - - entry = container_of(cache->list.next, struct io_cache_entry, node); + /* + * If KASAN is enabled, always clear the initial bytes that + * must be zeroed post alloc, in case any of them overlap + * with KASAN storage. 
+ */ +#if defined(CONFIG_KASAN) kasan_mempool_unpoison_object(entry, cache->elem_size); - cache->list.next = cache->list.next->next; - cache->nr_cached--; + if (cache->init_clear) + memset(entry, 0, cache->init_clear); +#endif return entry; } return NULL; } -static inline void io_alloc_cache_init(struct io_alloc_cache *cache, - unsigned max_nr, size_t size) +static inline void *io_cache_alloc(struct io_alloc_cache *cache, gfp_t gfp) { - cache->list.next = NULL; - cache->nr_cached = 0; - cache->max_cached = max_nr; - cache->elem_size = size; -} + void *obj; -static inline void io_alloc_cache_free(struct io_alloc_cache *cache, - void (*free)(struct io_cache_entry *)) -{ - while (1) { - struct io_cache_entry *entry = io_alloc_cache_get(cache); - - if (!entry) - break; - free(entry); - } - cache->nr_cached = 0; + obj = io_alloc_cache_get(cache); + if (obj) + return obj; + return io_cache_alloc_new(cache, gfp); } + #endif diff --git a/io_uring/cancel.c b/io_uring/cancel.c index 8a8b07dfc444..484193567839 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -58,9 +58,8 @@ bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd) return false; if (cd->flags & IORING_ASYNC_CANCEL_ALL) { check_seq: - if (cd->seq == req->work.cancel_seq) + if (io_cancel_match_sequence(req, cd->seq)) return false; - req->work.cancel_seq = cd->seq; } return true; @@ -185,9 +184,7 @@ static int __io_async_cancel(struct io_cancel_data *cd, io_ring_submit_lock(ctx, issue_flags); ret = -ENOENT; list_for_each_entry(node, &ctx->tctx_list, ctx_node) { - struct io_uring_task *tctx = node->task->io_uring; - - ret = io_async_cancel_one(tctx, cd); + ret = io_async_cancel_one(node->task->io_uring, cd); if (ret != -ENOENT) { if (!all) break; @@ -208,7 +205,7 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) .opcode = cancel->opcode, .seq = atomic_inc_return(&req->ctx->cancel_seq), }; - struct io_uring_task *tctx = req->task->io_uring; + struct io_uring_task *tctx = 
req->tctx; int ret; if (cd.flags & IORING_ASYNC_CANCEL_FD) { @@ -235,16 +232,6 @@ done: return IOU_OK; } -void init_hash_table(struct io_hash_table *table, unsigned size) -{ - unsigned int i; - - for (i = 0; i < size; i++) { - spin_lock_init(&table->hbs[i].lock); - INIT_HLIST_HEAD(&table->hbs[i].list); - } -} - static int __io_sync_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd, int fd) { @@ -253,10 +240,12 @@ static int __io_sync_cancel(struct io_uring_task *tctx, /* fixed must be grabbed every time since we drop the uring_lock */ if ((cd->flags & IORING_ASYNC_CANCEL_FD) && (cd->flags & IORING_ASYNC_CANCEL_FD_FIXED)) { - if (unlikely(fd >= ctx->nr_user_files)) + struct io_rsrc_node *node; + + node = io_rsrc_node_lookup(&ctx->file_table.data, fd); + if (unlikely(!node)) return -EBADF; - fd = array_index_nospec(fd, ctx->nr_user_files); - cd->file = io_file_from_index(&ctx->file_table, fd); + cd->file = io_slot_file(node); if (!cd->file) return -EBADF; } diff --git a/io_uring/cancel.h b/io_uring/cancel.h index c0a8e7c520b6..bbfea2cd00ea 100644 --- a/io_uring/cancel.h +++ b/io_uring/cancel.h @@ -20,9 +20,18 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags); int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd, unsigned int issue_flags); -void init_hash_table(struct io_hash_table *table, unsigned size); int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg); bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd); +static inline bool io_cancel_match_sequence(struct io_kiocb *req, int sequence) +{ + if (req->cancel_seq_set && sequence == req->work.cancel_seq) + return true; + + req->cancel_seq_set = true; + req->work.cancel_seq = sequence; + return false; +} + #endif diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c new file mode 100644 index 000000000000..100d5da94cb9 --- /dev/null +++ b/io_uring/eventfd.c @@ -0,0 +1,206 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> 
+#include <linux/errno.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/eventfd.h> +#include <linux/eventpoll.h> +#include <linux/io_uring.h> +#include <linux/io_uring_types.h> + +#include "io-wq.h" +#include "eventfd.h" + +struct io_ev_fd { + struct eventfd_ctx *cq_ev_fd; + unsigned int eventfd_async; + /* protected by ->completion_lock */ + unsigned last_cq_tail; + refcount_t refs; + atomic_t ops; + struct rcu_head rcu; +}; + +enum { + IO_EVENTFD_OP_SIGNAL_BIT, +}; + +static void io_eventfd_free(struct rcu_head *rcu) +{ + struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); + + eventfd_ctx_put(ev_fd->cq_ev_fd); + kfree(ev_fd); +} + +static void io_eventfd_put(struct io_ev_fd *ev_fd) +{ + if (refcount_dec_and_test(&ev_fd->refs)) + call_rcu(&ev_fd->rcu, io_eventfd_free); +} + +static void io_eventfd_do_signal(struct rcu_head *rcu) +{ + struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); + + eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); + io_eventfd_put(ev_fd); +} + +static void io_eventfd_release(struct io_ev_fd *ev_fd, bool put_ref) +{ + if (put_ref) + io_eventfd_put(ev_fd); + rcu_read_unlock(); +} + +/* + * Returns true if the caller should put the ev_fd reference, false if not. + */ +static bool __io_eventfd_signal(struct io_ev_fd *ev_fd) +{ + if (eventfd_signal_allowed()) { + eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); + return true; + } + if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) { + call_rcu_hurry(&ev_fd->rcu, io_eventfd_do_signal); + return false; + } + return true; +} + +/* + * Trigger if eventfd_async isn't set, or if it's set and the caller is + * an async worker. If ev_fd isn't valid, obviously return false. + */ +static bool io_eventfd_trigger(struct io_ev_fd *ev_fd) +{ + if (ev_fd) + return !ev_fd->eventfd_async || io_wq_current_is_worker(); + return false; +} + +/* + * On success, returns with an ev_fd reference grabbed and the RCU read + * lock held. 
+ */ +static struct io_ev_fd *io_eventfd_grab(struct io_ring_ctx *ctx) +{ + struct io_ev_fd *ev_fd; + + if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) + return NULL; + + rcu_read_lock(); + + /* + * rcu_dereference ctx->io_ev_fd once and use it for both for checking + * and eventfd_signal + */ + ev_fd = rcu_dereference(ctx->io_ev_fd); + + /* + * Check again if ev_fd exists in case an io_eventfd_unregister call + * completed between the NULL check of ctx->io_ev_fd at the start of + * the function and rcu_read_lock. + */ + if (io_eventfd_trigger(ev_fd) && refcount_inc_not_zero(&ev_fd->refs)) + return ev_fd; + + rcu_read_unlock(); + return NULL; +} + +void io_eventfd_signal(struct io_ring_ctx *ctx) +{ + struct io_ev_fd *ev_fd; + + ev_fd = io_eventfd_grab(ctx); + if (ev_fd) + io_eventfd_release(ev_fd, __io_eventfd_signal(ev_fd)); +} + +void io_eventfd_flush_signal(struct io_ring_ctx *ctx) +{ + struct io_ev_fd *ev_fd; + + ev_fd = io_eventfd_grab(ctx); + if (ev_fd) { + bool skip, put_ref = true; + + /* + * Eventfd should only get triggered when at least one event + * has been posted. Some applications rely on the eventfd + * notification count only changing IFF a new CQE has been + * added to the CQ ring. There's no dependency on 1:1 + * relationship between how many times this function is called + * (and hence the eventfd count) and number of CQEs posted to + * the CQ ring. 
+ */ + spin_lock(&ctx->completion_lock); + skip = ctx->cached_cq_tail == ev_fd->last_cq_tail; + ev_fd->last_cq_tail = ctx->cached_cq_tail; + spin_unlock(&ctx->completion_lock); + + if (!skip) + put_ref = __io_eventfd_signal(ev_fd); + + io_eventfd_release(ev_fd, put_ref); + } +} + +int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, + unsigned int eventfd_async) +{ + struct io_ev_fd *ev_fd; + __s32 __user *fds = arg; + int fd; + + ev_fd = rcu_dereference_protected(ctx->io_ev_fd, + lockdep_is_held(&ctx->uring_lock)); + if (ev_fd) + return -EBUSY; + + if (copy_from_user(&fd, fds, sizeof(*fds))) + return -EFAULT; + + ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL); + if (!ev_fd) + return -ENOMEM; + + ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd); + if (IS_ERR(ev_fd->cq_ev_fd)) { + int ret = PTR_ERR(ev_fd->cq_ev_fd); + + kfree(ev_fd); + return ret; + } + + spin_lock(&ctx->completion_lock); + ev_fd->last_cq_tail = ctx->cached_cq_tail; + spin_unlock(&ctx->completion_lock); + + ev_fd->eventfd_async = eventfd_async; + ctx->has_evfd = true; + refcount_set(&ev_fd->refs, 1); + atomic_set(&ev_fd->ops, 0); + rcu_assign_pointer(ctx->io_ev_fd, ev_fd); + return 0; +} + +int io_eventfd_unregister(struct io_ring_ctx *ctx) +{ + struct io_ev_fd *ev_fd; + + ev_fd = rcu_dereference_protected(ctx->io_ev_fd, + lockdep_is_held(&ctx->uring_lock)); + if (ev_fd) { + ctx->has_evfd = false; + rcu_assign_pointer(ctx->io_ev_fd, NULL); + io_eventfd_put(ev_fd); + return 0; + } + + return -ENXIO; +} diff --git a/io_uring/eventfd.h b/io_uring/eventfd.h new file mode 100644 index 000000000000..d394f49c6321 --- /dev/null +++ b/io_uring/eventfd.h @@ -0,0 +1,8 @@ + +struct io_ring_ctx; +int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, + unsigned int eventfd_async); +int io_eventfd_unregister(struct io_ring_ctx *ctx); + +void io_eventfd_flush_signal(struct io_ring_ctx *ctx); +void io_eventfd_signal(struct io_ring_ctx *ctx); diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index 
976e9500f651..f60d0a9d505e 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -46,15 +46,56 @@ static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id, return 0; } +#ifdef CONFIG_NET_RX_BUSY_POLL +static __cold void common_tracking_show_fdinfo(struct io_ring_ctx *ctx, + struct seq_file *m, + const char *tracking_strategy) +{ + seq_puts(m, "NAPI:\tenabled\n"); + seq_printf(m, "napi tracking:\t%s\n", tracking_strategy); + seq_printf(m, "napi_busy_poll_dt:\t%llu\n", ctx->napi_busy_poll_dt); + if (ctx->napi_prefer_busy_poll) + seq_puts(m, "napi_prefer_busy_poll:\ttrue\n"); + else + seq_puts(m, "napi_prefer_busy_poll:\tfalse\n"); +} + +static __cold void napi_show_fdinfo(struct io_ring_ctx *ctx, + struct seq_file *m) +{ + unsigned int mode = READ_ONCE(ctx->napi_track_mode); + + switch (mode) { + case IO_URING_NAPI_TRACKING_INACTIVE: + seq_puts(m, "NAPI:\tdisabled\n"); + break; + case IO_URING_NAPI_TRACKING_DYNAMIC: + common_tracking_show_fdinfo(ctx, m, "dynamic"); + break; + case IO_URING_NAPI_TRACKING_STATIC: + common_tracking_show_fdinfo(ctx, m, "static"); + break; + default: + seq_printf(m, "NAPI:\tunknown mode (%u)\n", mode); + } +} +#else +static inline void napi_show_fdinfo(struct io_ring_ctx *ctx, + struct seq_file *m) +{ +} +#endif + /* * Caller holds a reference to the file already, we don't need to do * anything else to get an extra reference. 
*/ -__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f) +__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) { - struct io_ring_ctx *ctx = f->private_data; + struct io_ring_ctx *ctx = file->private_data; struct io_overflow_cqe *ocqe; struct io_rings *r = ctx->rings; + struct rusage sq_usage; unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1; unsigned int sq_head = READ_ONCE(r->sq.head); unsigned int sq_tail = READ_ONCE(r->sq.tail); @@ -64,6 +105,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f) unsigned int sq_shift = 0; unsigned int sq_entries, cq_entries; int sq_pid = -1, sq_cpu = -1; + u64 sq_total_time = 0, sq_work_time = 0; bool has_lock; unsigned int i; @@ -145,27 +187,46 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f) if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) { struct io_sq_data *sq = ctx->sq_data; - sq_pid = sq->task_pid; - sq_cpu = sq->sq_cpu; + /* + * sq->thread might be NULL if we raced with the sqpoll + * thread termination. 
+ */ + if (sq->thread) { + sq_pid = sq->task_pid; + sq_cpu = sq->sq_cpu; + getrusage(sq->thread, RUSAGE_SELF, &sq_usage); + sq_total_time = (sq_usage.ru_stime.tv_sec * 1000000 + + sq_usage.ru_stime.tv_usec); + sq_work_time = sq->work_time; + } } seq_printf(m, "SqThread:\t%d\n", sq_pid); seq_printf(m, "SqThreadCpu:\t%d\n", sq_cpu); - seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files); - for (i = 0; has_lock && i < ctx->nr_user_files; i++) { - struct file *f = io_file_from_index(&ctx->file_table, i); + seq_printf(m, "SqTotalTime:\t%llu\n", sq_total_time); + seq_printf(m, "SqWorkTime:\t%llu\n", sq_work_time); + seq_printf(m, "UserFiles:\t%u\n", ctx->file_table.data.nr); + for (i = 0; has_lock && i < ctx->file_table.data.nr; i++) { + struct file *f = NULL; - if (f) - seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname); - else - seq_printf(m, "%5u: <none>\n", i); + if (ctx->file_table.data.nodes[i]) + f = io_slot_file(ctx->file_table.data.nodes[i]); + if (f) { + seq_printf(m, "%5u: ", i); + seq_file_path(m, f, " \t\n\\"); + seq_puts(m, "\n"); + } } - seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs); - for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) { - struct io_mapped_ubuf *buf = ctx->user_bufs[i]; - unsigned int len = buf->ubuf_end - buf->ubuf; + seq_printf(m, "UserBufs:\t%u\n", ctx->buf_table.nr); + for (i = 0; has_lock && i < ctx->buf_table.nr; i++) { + struct io_mapped_ubuf *buf = NULL; - seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len); + if (ctx->buf_table.nodes[i]) + buf = ctx->buf_table.nodes[i]->buf; + if (buf) + seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, buf->len); + else + seq_printf(m, "%5u: <none>\n", i); } if (has_lock && !xa_empty(&ctx->personalities)) { unsigned long index; @@ -177,22 +238,13 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f) } seq_puts(m, "PollList:\n"); - for (i = 0; i < (1U << ctx->cancel_table.hash_bits); i++) { + for (i = 0; has_lock && i < (1U << ctx->cancel_table.hash_bits); i++) { 
struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i]; - struct io_hash_bucket *hbl = &ctx->cancel_table_locked.hbs[i]; struct io_kiocb *req; - spin_lock(&hb->lock); hlist_for_each_entry(req, &hb->list, hash_node) seq_printf(m, " op=%d, task_works=%d\n", req->opcode, - task_work_pending(req->task)); - spin_unlock(&hb->lock); - - if (!has_lock) - continue; - hlist_for_each_entry(req, &hbl->list, hash_node) - seq_printf(m, " op=%d, task_works=%d\n", req->opcode, - task_work_pending(req->task)); + task_work_pending(req->tctx->task)); } if (has_lock) @@ -207,7 +259,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f) cqe->user_data, cqe->res, cqe->flags); } - spin_unlock(&ctx->completion_lock); + napi_show_fdinfo(ctx, m); } #endif diff --git a/io_uring/filetable.c b/io_uring/filetable.c index 6e86e6188dbe..dd8eeec97acf 100644 --- a/io_uring/filetable.c +++ b/io_uring/filetable.c @@ -36,27 +36,22 @@ static int io_file_bitmap_get(struct io_ring_ctx *ctx) return -ENFILE; } -bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files) +bool io_alloc_file_tables(struct io_ring_ctx *ctx, struct io_file_table *table, + unsigned nr_files) { - table->files = kvcalloc(nr_files, sizeof(table->files[0]), - GFP_KERNEL_ACCOUNT); - if (unlikely(!table->files)) + if (io_rsrc_data_alloc(&table->data, nr_files)) return false; - table->bitmap = bitmap_zalloc(nr_files, GFP_KERNEL_ACCOUNT); - if (unlikely(!table->bitmap)) { - kvfree(table->files); - return false; - } - - return true; + if (table->bitmap) + return true; + io_rsrc_data_free(ctx, &table->data); + return false; } -void io_free_file_tables(struct io_file_table *table) +void io_free_file_tables(struct io_ring_ctx *ctx, struct io_file_table *table) { - kvfree(table->files); + io_rsrc_data_free(ctx, &table->data); bitmap_free(table->bitmap); - table->files = NULL; table->bitmap = NULL; } @@ -64,32 +59,24 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file, u32 
slot_index) __must_hold(&req->ctx->uring_lock) { - struct io_fixed_file *file_slot; - int ret; + struct io_rsrc_node *node; if (io_is_uring_fops(file)) return -EBADF; - if (!ctx->file_data) + if (!ctx->file_table.data.nr) return -ENXIO; - if (slot_index >= ctx->nr_user_files) + if (slot_index >= ctx->file_table.data.nr) return -EINVAL; - slot_index = array_index_nospec(slot_index, ctx->nr_user_files); - file_slot = io_fixed_file_slot(&ctx->file_table, slot_index); + node = io_rsrc_node_alloc(IORING_RSRC_FILE); + if (!node) + return -ENOMEM; - if (file_slot->file_ptr) { - ret = io_queue_rsrc_removal(ctx->file_data, slot_index, - io_slot_file(file_slot)); - if (ret) - return ret; + if (!io_reset_rsrc_node(ctx, &ctx->file_table.data, slot_index)) + io_file_bitmap_set(&ctx->file_table, slot_index); - file_slot->file_ptr = 0; - io_file_bitmap_clear(&ctx->file_table, slot_index); - } - - *io_get_tag_slot(ctx->file_data, slot_index) = 0; - io_fixed_file_set(file_slot, file); - io_file_bitmap_set(&ctx->file_table, slot_index); + ctx->file_table.data.nodes[slot_index] = node; + io_fixed_file_set(node, file); return 0; } @@ -134,25 +121,17 @@ int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset) { - struct io_fixed_file *file_slot; - int ret; + struct io_rsrc_node *node; - if (unlikely(!ctx->file_data)) + if (unlikely(!ctx->file_table.data.nr)) return -ENXIO; - if (offset >= ctx->nr_user_files) + if (offset >= ctx->file_table.data.nr) return -EINVAL; - offset = array_index_nospec(offset, ctx->nr_user_files); - file_slot = io_fixed_file_slot(&ctx->file_table, offset); - if (!file_slot->file_ptr) + node = io_rsrc_node_lookup(&ctx->file_table.data, offset); + if (!node) return -EBADF; - - ret = io_queue_rsrc_removal(ctx->file_data, offset, - io_slot_file(file_slot)); - if (ret) - return ret; - - file_slot->file_ptr = 0; + io_reset_rsrc_node(ctx, &ctx->file_table.data, offset); 
io_file_bitmap_clear(&ctx->file_table, offset); return 0; } @@ -167,7 +146,7 @@ int io_register_file_alloc_range(struct io_ring_ctx *ctx, return -EFAULT; if (check_add_overflow(range.off, range.len, &end)) return -EOVERFLOW; - if (range.resv || end > ctx->nr_user_files) + if (range.resv || end > ctx->file_table.data.nr) return -EINVAL; io_file_table_set_alloc_range(ctx, range.off, range.len); diff --git a/io_uring/filetable.h b/io_uring/filetable.h index b47adf170c31..7717ea9efd0e 100644 --- a/io_uring/filetable.h +++ b/io_uring/filetable.h @@ -4,9 +4,10 @@ #include <linux/file.h> #include <linux/io_uring_types.h> +#include "rsrc.h" -bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files); -void io_free_file_tables(struct io_file_table *table); +bool io_alloc_file_tables(struct io_ring_ctx *ctx, struct io_file_table *table, unsigned nr_files); +void io_free_file_tables(struct io_ring_ctx *ctx, struct io_file_table *table); int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, struct file *file, unsigned int file_slot); @@ -17,7 +18,7 @@ int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset); int io_register_file_alloc_range(struct io_ring_ctx *ctx, struct io_uring_file_index_range __user *arg); -unsigned int io_file_get_flags(struct file *file); +io_req_flags_t io_file_get_flags(struct file *file); static inline void io_file_bitmap_clear(struct io_file_table *table, int bit) { @@ -33,50 +34,34 @@ static inline void io_file_bitmap_set(struct io_file_table *table, int bit) table->alloc_hint = bit + 1; } -static inline struct io_fixed_file * -io_fixed_file_slot(struct io_file_table *table, unsigned i) -{ - return &table->files[i]; -} - #define FFS_NOWAIT 0x1UL #define FFS_ISREG 0x2UL #define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG) -static inline unsigned int io_slot_flags(struct io_fixed_file *slot) +static inline unsigned int io_slot_flags(struct io_rsrc_node *node) { - return (slot->file_ptr & ~FFS_MASK) << 
REQ_F_SUPPORT_NOWAIT_BIT; -} -static inline struct file *io_slot_file(struct io_fixed_file *slot) -{ - return (struct file *)(slot->file_ptr & FFS_MASK); + return (node->file_ptr & ~FFS_MASK) << REQ_F_SUPPORT_NOWAIT_BIT; } -static inline struct file *io_file_from_index(struct io_file_table *table, - int index) +static inline struct file *io_slot_file(struct io_rsrc_node *node) { - return io_slot_file(io_fixed_file_slot(table, index)); + return (struct file *)(node->file_ptr & FFS_MASK); } -static inline void io_fixed_file_set(struct io_fixed_file *file_slot, +static inline void io_fixed_file_set(struct io_rsrc_node *node, struct file *file) { - file_slot->file_ptr = (unsigned long)file | + node->file_ptr = (unsigned long)file | (io_file_get_flags(file) >> REQ_F_SUPPORT_NOWAIT_BIT); } -static inline void io_reset_alloc_hint(struct io_ring_ctx *ctx) -{ - ctx->file_table.alloc_hint = ctx->file_alloc_start; -} - static inline void io_file_table_set_alloc_range(struct io_ring_ctx *ctx, unsigned off, unsigned len) { ctx->file_alloc_start = off; ctx->file_alloc_end = off + len; - io_reset_alloc_hint(ctx); + ctx->file_table.alloc_hint = ctx->file_alloc_start; } #endif diff --git a/io_uring/futex.c b/io_uring/futex.c index 3c3575303c3d..43e2143255f5 100644 --- a/io_uring/futex.c +++ b/io_uring/futex.c @@ -9,7 +9,7 @@ #include "../kernel/futex/futex.h" #include "io_uring.h" -#include "rsrc.h" +#include "alloc_cache.h" #include "futex.h" struct io_futex { @@ -27,27 +27,21 @@ struct io_futex { }; struct io_futex_data { - union { - struct futex_q q; - struct io_cache_entry cache; - }; + struct futex_q q; struct io_kiocb *req; }; -void io_futex_cache_init(struct io_ring_ctx *ctx) -{ - io_alloc_cache_init(&ctx->futex_cache, IO_NODE_ALLOC_CACHE_MAX, - sizeof(struct io_futex_data)); -} +#define IO_FUTEX_ALLOC_CACHE_MAX 32 -static void io_futex_cache_entry_free(struct io_cache_entry *entry) +bool io_futex_cache_init(struct io_ring_ctx *ctx) { - kfree(container_of(entry, struct 
io_futex_data, cache)); + return io_alloc_cache_init(&ctx->futex_cache, IO_FUTEX_ALLOC_CACHE_MAX, + sizeof(struct io_futex_data), 0); } void io_futex_cache_free(struct io_ring_ctx *ctx) { - io_alloc_cache_free(&ctx->futex_cache, io_futex_cache_entry_free); + io_alloc_cache_free(&ctx->futex_cache, kfree); } static void __io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts) @@ -63,7 +57,7 @@ static void io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts) struct io_ring_ctx *ctx = req->ctx; io_tw_lock(ctx, ts); - if (!io_alloc_cache_put(&ctx->futex_cache, &ifd->cache)) + if (!io_alloc_cache_put(&ctx->futex_cache, ifd)) kfree(ifd); __io_futex_complete(req, ts); } @@ -147,7 +141,7 @@ int io_futex_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, return -ENOENT; } -bool io_futex_remove_all(struct io_ring_ctx *ctx, struct task_struct *task, +bool io_futex_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all) { struct hlist_node *tmp; @@ -157,8 +151,9 @@ bool io_futex_remove_all(struct io_ring_ctx *ctx, struct task_struct *task, lockdep_assert_held(&ctx->uring_lock); hlist_for_each_entry_safe(req, tmp, &ctx->futex_list, hash_node) { - if (!io_match_task_safe(req, task, cancel_all)) + if (!io_match_task_safe(req, tctx, cancel_all)) continue; + hlist_del_init(&req->hash_node); __io_futex_cancel(ctx, req); found = true; } @@ -256,17 +251,6 @@ static void io_futex_wake_fn(struct wake_q_head *wake_q, struct futex_q *q) io_req_task_work_add(req); } -static struct io_futex_data *io_alloc_ifd(struct io_ring_ctx *ctx) -{ - struct io_cache_entry *entry; - - entry = io_alloc_cache_get(&ctx->futex_cache); - if (entry) - return container_of(entry, struct io_futex_data, cache); - - return kmalloc(sizeof(struct io_futex_data), GFP_NOWAIT); -} - int io_futexv_wait(struct io_kiocb *req, unsigned int issue_flags) { struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); @@ -336,7 +320,7 @@ int io_futex_wait(struct io_kiocb 
*req, unsigned int issue_flags) } io_ring_submit_lock(ctx, issue_flags); - ifd = io_alloc_ifd(ctx); + ifd = io_cache_alloc(&ctx->futex_cache, GFP_NOWAIT); if (!ifd) { ret = -ENOMEM; goto done_unlock; @@ -354,7 +338,7 @@ int io_futex_wait(struct io_kiocb *req, unsigned int issue_flags) hlist_add_head(&req->hash_node, &ctx->futex_list); io_ring_submit_unlock(ctx, issue_flags); - futex_queue(&ifd->q, hb); + futex_queue(&ifd->q, hb, NULL); return IOU_ISSUE_SKIP_COMPLETE; } diff --git a/io_uring/futex.h b/io_uring/futex.h index 0847e9e8a127..d789fcf715e3 100644 --- a/io_uring/futex.h +++ b/io_uring/futex.h @@ -11,9 +11,9 @@ int io_futex_wake(struct io_kiocb *req, unsigned int issue_flags); #if defined(CONFIG_FUTEX) int io_futex_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, unsigned int issue_flags); -bool io_futex_remove_all(struct io_ring_ctx *ctx, struct task_struct *task, +bool io_futex_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all); -void io_futex_cache_init(struct io_ring_ctx *ctx); +bool io_futex_cache_init(struct io_ring_ctx *ctx); void io_futex_cache_free(struct io_ring_ctx *ctx); #else static inline int io_futex_cancel(struct io_ring_ctx *ctx, @@ -23,12 +23,13 @@ static inline int io_futex_cancel(struct io_ring_ctx *ctx, return 0; } static inline bool io_futex_remove_all(struct io_ring_ctx *ctx, - struct task_struct *task, bool cancel_all) + struct io_uring_task *tctx, bool cancel_all) { return false; } -static inline void io_futex_cache_init(struct io_ring_ctx *ctx) +static inline bool io_futex_cache_init(struct io_ring_ctx *ctx) { + return false; } static inline void io_futex_cache_free(struct io_ring_ctx *ctx) { diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 522196dfb0ff..91019b4d0308 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -13,6 +13,7 @@ #include <linux/slab.h> #include <linux/rculist_nulls.h> #include <linux/cpu.h> +#include <linux/cpuset.h> #include <linux/task_work.h> #include 
<linux/audit.h> #include <linux/mmu_context.h> @@ -23,12 +24,13 @@ #include "io_uring.h" #define WORKER_IDLE_TIMEOUT (5 * HZ) +#define WORKER_INIT_LIMIT 3 enum { - IO_WORKER_F_UP = 1, /* up and active */ - IO_WORKER_F_RUNNING = 2, /* account as running */ - IO_WORKER_F_FREE = 4, /* worker on free list */ - IO_WORKER_F_BOUND = 8, /* is doing bounded work */ + IO_WORKER_F_UP = 0, /* up and active */ + IO_WORKER_F_RUNNING = 1, /* account as running */ + IO_WORKER_F_FREE = 2, /* worker on free list */ + IO_WORKER_F_BOUND = 3, /* is doing bounded work */ }; enum { @@ -44,25 +46,25 @@ enum { */ struct io_worker { refcount_t ref; - unsigned flags; + int create_index; + unsigned long flags; struct hlist_nulls_node nulls_node; struct list_head all_list; struct task_struct *task; struct io_wq *wq; struct io_wq_work *cur_work; - struct io_wq_work *next_work; raw_spinlock_t lock; struct completion ref_done; unsigned long create_state; struct callback_head create_work; - int create_index; + int init_retries; union { struct rcu_head rcu; - struct work_struct work; + struct delayed_work work; }; }; @@ -160,12 +162,12 @@ static inline struct io_wq_acct *io_get_acct(struct io_wq *wq, bool bound) static inline struct io_wq_acct *io_work_get_acct(struct io_wq *wq, struct io_wq_work *work) { - return io_get_acct(wq, !(work->flags & IO_WQ_WORK_UNBOUND)); + return io_get_acct(wq, !(atomic_read(&work->flags) & IO_WQ_WORK_UNBOUND)); } static inline struct io_wq_acct *io_wq_get_acct(struct io_worker *worker) { - return io_get_acct(worker->wq, worker->flags & IO_WORKER_F_BOUND); + return io_get_acct(worker->wq, test_bit(IO_WORKER_F_BOUND, &worker->flags)); } static void io_worker_ref_put(struct io_wq *wq) @@ -225,7 +227,7 @@ static void io_worker_exit(struct io_worker *worker) wait_for_completion(&worker->ref_done); raw_spin_lock(&wq->lock); - if (worker->flags & IO_WORKER_F_FREE) + if (test_bit(IO_WORKER_F_FREE, &worker->flags)) hlist_nulls_del_rcu(&worker->nulls_node); 
list_del_rcu(&worker->all_list); raw_spin_unlock(&wq->lock); @@ -410,7 +412,7 @@ static void io_wq_dec_running(struct io_worker *worker) struct io_wq_acct *acct = io_wq_get_acct(worker); struct io_wq *wq = worker->wq; - if (!(worker->flags & IO_WORKER_F_UP)) + if (!test_bit(IO_WORKER_F_UP, &worker->flags)) return; if (!atomic_dec_and_test(&acct->nr_running)) @@ -430,8 +432,8 @@ static void io_wq_dec_running(struct io_worker *worker) */ static void __io_worker_busy(struct io_wq *wq, struct io_worker *worker) { - if (worker->flags & IO_WORKER_F_FREE) { - worker->flags &= ~IO_WORKER_F_FREE; + if (test_bit(IO_WORKER_F_FREE, &worker->flags)) { + clear_bit(IO_WORKER_F_FREE, &worker->flags); raw_spin_lock(&wq->lock); hlist_nulls_del_init_rcu(&worker->nulls_node); raw_spin_unlock(&wq->lock); @@ -444,15 +446,15 @@ static void __io_worker_busy(struct io_wq *wq, struct io_worker *worker) static void __io_worker_idle(struct io_wq *wq, struct io_worker *worker) __must_hold(wq->lock) { - if (!(worker->flags & IO_WORKER_F_FREE)) { - worker->flags |= IO_WORKER_F_FREE; + if (!test_bit(IO_WORKER_F_FREE, &worker->flags)) { + set_bit(IO_WORKER_F_FREE, &worker->flags); hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list); } } static inline unsigned int io_get_work_hash(struct io_wq_work *work) { - return work->flags >> IO_WQ_HASH_SHIFT; + return atomic_read(&work->flags) >> IO_WQ_HASH_SHIFT; } static bool io_wait_on_hash(struct io_wq *wq, unsigned int hash) @@ -539,7 +541,6 @@ static void io_assign_current_work(struct io_worker *worker, raw_spin_lock(&worker->lock); worker->cur_work = work; - worker->next_work = NULL; raw_spin_unlock(&worker->lock); } @@ -564,10 +565,7 @@ static void io_worker_handle_work(struct io_wq_acct *acct, * clear the stalled flag. */ work = io_get_next_work(acct, worker); - raw_spin_unlock(&acct->lock); if (work) { - __io_worker_busy(wq, worker); - /* * Make sure cancelation can find this, even before * it becomes the active work. 
That avoids a window @@ -576,11 +574,17 @@ static void io_worker_handle_work(struct io_wq_acct *acct, * current work item for this worker. */ raw_spin_lock(&worker->lock); - worker->next_work = work; + worker->cur_work = work; raw_spin_unlock(&worker->lock); - } else { - break; } + + raw_spin_unlock(&acct->lock); + + if (!work) + break; + + __io_worker_busy(wq, worker); + io_assign_current_work(worker, work); __set_current_state(TASK_RUNNING); @@ -591,8 +595,9 @@ static void io_worker_handle_work(struct io_wq_acct *acct, next_hashed = wq_next_work(work); - if (unlikely(do_kill) && (work->flags & IO_WQ_WORK_UNBOUND)) - work->flags |= IO_WQ_WORK_CANCEL; + if (do_kill && + (atomic_read(&work->flags) & IO_WQ_WORK_UNBOUND)) + atomic_or(IO_WQ_WORK_CANCEL, &work->flags); wq->do_work(work); io_assign_current_work(worker, NULL); @@ -629,9 +634,10 @@ static int io_wq_worker(void *data) struct io_wq_acct *acct = io_wq_get_acct(worker); struct io_wq *wq = worker->wq; bool exit_mask = false, last_timeout = false; - char buf[TASK_COMM_LEN]; + char buf[TASK_COMM_LEN] = {}; - worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING); + set_mask_bits(&worker->flags, 0, + BIT(IO_WORKER_F_UP) | BIT(IO_WORKER_F_RUNNING)); snprintf(buf, sizeof(buf), "iou-wrk-%d", wq->task->pid); set_task_comm(current, buf); @@ -695,11 +701,11 @@ void io_wq_worker_running(struct task_struct *tsk) if (!worker) return; - if (!(worker->flags & IO_WORKER_F_UP)) + if (!test_bit(IO_WORKER_F_UP, &worker->flags)) return; - if (worker->flags & IO_WORKER_F_RUNNING) + if (test_bit(IO_WORKER_F_RUNNING, &worker->flags)) return; - worker->flags |= IO_WORKER_F_RUNNING; + set_bit(IO_WORKER_F_RUNNING, &worker->flags); io_wq_inc_running(worker); } @@ -713,12 +719,12 @@ void io_wq_worker_sleeping(struct task_struct *tsk) if (!worker) return; - if (!(worker->flags & IO_WORKER_F_UP)) + if (!test_bit(IO_WORKER_F_UP, &worker->flags)) return; - if (!(worker->flags & IO_WORKER_F_RUNNING)) + if (!test_bit(IO_WORKER_F_RUNNING, 
&worker->flags)) return; - worker->flags &= ~IO_WORKER_F_RUNNING; + clear_bit(IO_WORKER_F_RUNNING, &worker->flags); io_wq_dec_running(worker); } @@ -732,7 +738,7 @@ static void io_init_new_worker(struct io_wq *wq, struct io_worker *worker, raw_spin_lock(&wq->lock); hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list); list_add_tail_rcu(&worker->all_list, &wq->all_list); - worker->flags |= IO_WORKER_F_FREE; + set_bit(IO_WORKER_F_FREE, &worker->flags); raw_spin_unlock(&wq->lock); wake_up_new_task(tsk); } @@ -742,7 +748,7 @@ static bool io_wq_work_match_all(struct io_wq_work *work, void *data) return true; } -static inline bool io_should_retry_thread(long err) +static inline bool io_should_retry_thread(struct io_worker *worker, long err) { /* * Prevent perpetual task_work retry, if the task (or its group) is @@ -750,6 +756,8 @@ static inline bool io_should_retry_thread(long err) */ if (fatal_signal_pending(current)) return false; + if (worker->init_retries++ >= WORKER_INIT_LIMIT) + return false; switch (err) { case -EAGAIN: @@ -762,6 +770,18 @@ static inline bool io_should_retry_thread(long err) } } +static void queue_create_worker_retry(struct io_worker *worker) +{ + /* + * We only bother retrying because there's a chance that the + * failure to create a worker is due to some temporary condition + * in the forking task (e.g. outstanding signal); give the task + * some time to clear that condition. 
+ */ + schedule_delayed_work(&worker->work, + msecs_to_jiffies(worker->init_retries * 5)); +} + static void create_worker_cont(struct callback_head *cb) { struct io_worker *worker; @@ -776,7 +796,7 @@ static void create_worker_cont(struct callback_head *cb) io_init_new_worker(wq, worker, tsk); io_worker_release(worker); return; - } else if (!io_should_retry_thread(PTR_ERR(tsk))) { + } else if (!io_should_retry_thread(worker, PTR_ERR(tsk))) { struct io_wq_acct *acct = io_wq_get_acct(worker); atomic_dec(&acct->nr_running); @@ -801,12 +821,13 @@ static void create_worker_cont(struct callback_head *cb) /* re-create attempts grab a new worker ref, drop the existing one */ io_worker_release(worker); - schedule_work(&worker->work); + queue_create_worker_retry(worker); } static void io_workqueue_create(struct work_struct *work) { - struct io_worker *worker = container_of(work, struct io_worker, work); + struct io_worker *worker = container_of(work, struct io_worker, + work.work); struct io_wq_acct *acct = io_wq_get_acct(worker); if (!io_queue_worker_create(worker, acct, create_worker_cont)) @@ -838,17 +859,17 @@ fail: init_completion(&worker->ref_done); if (index == IO_WQ_ACCT_BOUND) - worker->flags |= IO_WORKER_F_BOUND; + set_bit(IO_WORKER_F_BOUND, &worker->flags); tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE); if (!IS_ERR(tsk)) { io_init_new_worker(wq, worker, tsk); - } else if (!io_should_retry_thread(PTR_ERR(tsk))) { + } else if (!io_should_retry_thread(worker, PTR_ERR(tsk))) { kfree(worker); goto fail; } else { - INIT_WORK(&worker->work, io_workqueue_create); - schedule_work(&worker->work); + INIT_DELAYED_WORK(&worker->work, io_workqueue_create); + queue_create_worker_retry(worker); } return true; @@ -889,7 +910,7 @@ static bool io_wq_worker_wake(struct io_worker *worker, void *data) static void io_run_cancel(struct io_wq_work *work, struct io_wq *wq) { do { - work->flags |= IO_WQ_WORK_CANCEL; + atomic_or(IO_WQ_WORK_CANCEL, &work->flags); 
wq->do_work(work); work = wq->free_work(work); } while (work); @@ -924,8 +945,12 @@ static bool io_wq_work_match_item(struct io_wq_work *work, void *data) void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) { struct io_wq_acct *acct = io_work_get_acct(wq, work); - struct io_cb_cancel_data match; - unsigned work_flags = work->flags; + unsigned int work_flags = atomic_read(&work->flags); + struct io_cb_cancel_data match = { + .fn = io_wq_work_match_item, + .data = work, + .cancel_all = false, + }; bool do_create; /* @@ -933,7 +958,7 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) * been marked as one that should not get executed, cancel it here. */ if (test_bit(IO_WQ_BIT_EXIT, &wq->state) || - (work->flags & IO_WQ_WORK_CANCEL)) { + (work_flags & IO_WQ_WORK_CANCEL)) { io_run_cancel(work, wq); return; } @@ -963,10 +988,6 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) raw_spin_unlock(&wq->lock); /* fatal condition, failed to create the first worker */ - match.fn = io_wq_work_match_item, - match.data = work, - match.cancel_all = false, - io_acct_cancel_pending_work(wq, acct, &match); } } @@ -980,7 +1001,7 @@ void io_wq_hash_work(struct io_wq_work *work, void *val) unsigned int bit; bit = hash_ptr(val, IO_WQ_HASH_ORDER); - work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT)); + atomic_or(IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT), &work->flags); } static bool __io_wq_worker_cancel(struct io_worker *worker, @@ -988,7 +1009,7 @@ static bool __io_wq_worker_cancel(struct io_worker *worker, struct io_wq_work *work) { if (work && match->fn(work, match->data)) { - work->flags |= IO_WQ_WORK_CANCEL; + atomic_or(IO_WQ_WORK_CANCEL, &work->flags); __set_notify_signal(worker->task); return true; } @@ -1005,8 +1026,7 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data) * may dereference the passed in work. 
*/ raw_spin_lock(&worker->lock); - if (__io_wq_worker_cancel(worker, match, worker->cur_work) || - __io_wq_worker_cancel(worker, match, worker->next_work)) + if (__io_wq_worker_cancel(worker, match, worker->cur_work)) match->nr_running++; raw_spin_unlock(&worker->lock); @@ -1161,7 +1181,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) if (!alloc_cpumask_var(&wq->cpu_mask, GFP_KERNEL)) goto err; - cpumask_copy(wq->cpu_mask, cpu_possible_mask); + cpuset_cpus_allowed(data->task, wq->cpu_mask); wq->acct[IO_WQ_ACCT_BOUND].max_workers = bounded; wq->acct[IO_WQ_ACCT_UNBOUND].max_workers = task_rlimit(current, RLIMIT_NPROC); @@ -1316,17 +1336,29 @@ static int io_wq_cpu_offline(unsigned int cpu, struct hlist_node *node) int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask) { + cpumask_var_t allowed_mask; + int ret = 0; + if (!tctx || !tctx->io_wq) return -EINVAL; + if (!alloc_cpumask_var(&allowed_mask, GFP_KERNEL)) + return -ENOMEM; + rcu_read_lock(); - if (mask) - cpumask_copy(tctx->io_wq->cpu_mask, mask); - else - cpumask_copy(tctx->io_wq->cpu_mask, cpu_possible_mask); + cpuset_cpus_allowed(tctx->io_wq->task, allowed_mask); + if (mask) { + if (cpumask_subset(mask, allowed_mask)) + cpumask_copy(tctx->io_wq->cpu_mask, mask); + else + ret = -EINVAL; + } else { + cpumask_copy(tctx->io_wq->cpu_mask, allowed_mask); + } rcu_read_unlock(); - return 0; + free_cpumask_var(allowed_mask); + return ret; } /* diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h index 2b2a6406dd8e..b3b004a7b625 100644 --- a/io_uring/io-wq.h +++ b/io_uring/io-wq.h @@ -56,7 +56,7 @@ bool io_wq_worker_stopped(void); static inline bool io_wq_is_hashed(struct io_wq_work *work) { - return work->flags & IO_WQ_WORK_HASHED; + return atomic_read(&work->flags) & IO_WQ_WORK_HASHED; } typedef bool (work_cancel_fn)(struct io_wq_work *, void *); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index cd9a137ad6ce..f7acae5f7e1d 100644 --- a/io_uring/io_uring.c +++ 
b/io_uring/io_uring.c @@ -51,7 +51,6 @@ #include <linux/sched/signal.h> #include <linux/fs.h> #include <linux/file.h> -#include <linux/fdtable.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/percpu.h> @@ -59,12 +58,10 @@ #include <linux/bvec.h> #include <linux/net.h> #include <net/sock.h> -#include <net/af_unix.h> #include <linux/anon_inodes.h> #include <linux/sched/mm.h> #include <linux/uaccess.h> #include <linux/nospec.h> -#include <linux/highmem.h> #include <linux/fsnotify.h> #include <linux/fadvise.h> #include <linux/task_work.h> @@ -72,6 +69,7 @@ #include <linux/io_uring/cmd.h> #include <linux/audit.h> #include <linux/security.h> +#include <linux/jump_label.h> #include <asm/shmparam.h> #define CREATE_TRACE_POINTS @@ -95,14 +93,16 @@ #include "notif.h" #include "waitid.h" #include "futex.h" +#include "napi.h" +#include "uring_cmd.h" +#include "msg_ring.h" +#include "memmap.h" #include "timeout.h" #include "poll.h" #include "rw.h" #include "alloc_cache.h" - -#define IORING_MAX_ENTRIES 32768 -#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) +#include "eventfd.h" #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ IOSQE_IO_HARDLINK | IOSQE_ASYNC) @@ -115,17 +115,13 @@ REQ_F_ASYNC_DATA) #define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\ - IO_REQ_CLEAN_FLAGS) + REQ_F_REISSUE | IO_REQ_CLEAN_FLAGS) #define IO_TCTX_REFS_CACHE_NR (1U << 10) #define IO_COMPL_BATCH 32 #define IO_REQ_ALLOC_BATCH 8 - -enum { - IO_CHECK_CQ_OVERFLOW_BIT, - IO_CHECK_CQ_DROPPED_BIT, -}; +#define IO_LOCAL_TW_DEFAULT_MAX 20 struct io_defer_entry { struct list_head list; @@ -146,18 +142,22 @@ struct io_defer_entry { #define IO_CQ_WAKE_FORCE (IO_CQ_WAKE_INIT >> 1) static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, - struct task_struct *task, - bool cancel_all); + struct io_uring_task *tctx, + bool cancel_all, + bool is_sqpoll_thread); static void io_queue_sqe(struct io_kiocb *req); +static __read_mostly 
DEFINE_STATIC_KEY_FALSE(io_key_has_sqarray); + struct kmem_cache *req_cachep; +static struct workqueue_struct *iou_wq __ro_after_init; static int __read_mostly sysctl_io_uring_disabled; static int __read_mostly sysctl_io_uring_group = -1; #ifdef CONFIG_SYSCTL -static struct ctl_table kernel_io_uring_disabled_table[] = { +static const struct ctl_table kernel_io_uring_disabled_table[] = { { .procname = "io_uring_disabled", .data = &sysctl_io_uring_disabled, @@ -174,17 +174,9 @@ static struct ctl_table kernel_io_uring_disabled_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - {}, }; #endif -static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) -{ - if (!wq_list_empty(&ctx->submit_state.compl_reqs) || - ctx->submit_state.cqes_count) - __io_submit_flush_completions(ctx); -} - static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) { return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); @@ -210,12 +202,12 @@ static bool io_match_linked(struct io_kiocb *head) * As io_match_task() but protected against racing with linked timeouts. * User must not hold timeout_lock. 
*/ -bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, +bool io_match_task_safe(struct io_kiocb *head, struct io_uring_task *tctx, bool cancel_all) { bool matched; - if (task && head->task != task) + if (tctx && head->tctx != tctx) return false; if (cancel_all) return true; @@ -224,9 +216,9 @@ bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, struct io_ring_ctx *ctx = head->ctx; /* protect against races with linked timeouts */ - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); matched = io_match_linked(head); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); } else { matched = io_match_linked(head); } @@ -257,14 +249,12 @@ static __cold void io_fallback_req_func(struct work_struct *work) fallback_work.work); struct llist_node *node = llist_del_all(&ctx->fallback_llist); struct io_kiocb *req, *tmp; - struct io_tw_state ts = { .locked = true, }; + struct io_tw_state ts = {}; percpu_ref_get(&ctx->refs); mutex_lock(&ctx->uring_lock); llist_for_each_entry_safe(req, tmp, node, io_task_work.node) req->io_task_work.func(req, &ts); - if (WARN_ON_ONCE(!ts.locked)) - return; io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); percpu_ref_put(&ctx->refs); @@ -272,15 +262,23 @@ static __cold void io_fallback_req_func(struct work_struct *work) static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits) { - unsigned hash_buckets = 1U << bits; - size_t hash_size = hash_buckets * sizeof(table->hbs[0]); + unsigned int hash_buckets; + int i; - table->hbs = kmalloc(hash_size, GFP_KERNEL); - if (!table->hbs) - return -ENOMEM; + do { + hash_buckets = 1U << bits; + table->hbs = kvmalloc_array(hash_buckets, sizeof(table->hbs[0]), + GFP_KERNEL_ACCOUNT); + if (table->hbs) + break; + if (bits == 1) + return -ENOMEM; + bits--; + } while (1); table->hash_bits = bits; - init_hash_table(table, hash_buckets); + for (i = 0; i < hash_buckets; i++) + 
INIT_HLIST_HEAD(&table->hbs[i].list); return 0; } @@ -288,6 +286,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; int hash_bits; + bool ret; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) @@ -304,44 +303,48 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) hash_bits = clamp(hash_bits, 1, 8); if (io_alloc_hash_table(&ctx->cancel_table, hash_bits)) goto err; - if (io_alloc_hash_table(&ctx->cancel_table_locked, hash_bits)) - goto err; if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 0, GFP_KERNEL)) goto err; ctx->flags = p->flags; + ctx->hybrid_poll_time = LLONG_MAX; atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT); init_waitqueue_head(&ctx->sqo_sq_wait); INIT_LIST_HEAD(&ctx->sqd_list); INIT_LIST_HEAD(&ctx->cq_overflow_list); INIT_LIST_HEAD(&ctx->io_buffers_cache); - INIT_HLIST_HEAD(&ctx->io_buf_list); - io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX, - sizeof(struct io_rsrc_node)); - io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX, - sizeof(struct async_poll)); - io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX, - sizeof(struct io_async_msghdr)); - io_futex_cache_init(ctx); + ret = io_alloc_cache_init(&ctx->apoll_cache, IO_POLL_ALLOC_CACHE_MAX, + sizeof(struct async_poll), 0); + ret |= io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX, + sizeof(struct io_async_msghdr), + offsetof(struct io_async_msghdr, clear)); + ret |= io_alloc_cache_init(&ctx->rw_cache, IO_ALLOC_CACHE_MAX, + sizeof(struct io_async_rw), + offsetof(struct io_async_rw, clear)); + ret |= io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX, + sizeof(struct io_uring_cmd_data), 0); + spin_lock_init(&ctx->msg_lock); + ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX, + sizeof(struct io_kiocb), 0); + ret |= io_futex_cache_init(ctx); + if (ret) + goto free_ref; init_completion(&ctx->ref_comp); xa_init_flags(&ctx->personalities, 
XA_FLAGS_ALLOC1); mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->cq_wait); init_waitqueue_head(&ctx->poll_wq); - init_waitqueue_head(&ctx->rsrc_quiesce_wq); spin_lock_init(&ctx->completion_lock); - spin_lock_init(&ctx->timeout_lock); + raw_spin_lock_init(&ctx->timeout_lock); INIT_WQ_LIST(&ctx->iopoll_list); INIT_LIST_HEAD(&ctx->io_buffers_comp); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); INIT_LIST_HEAD(&ctx->ltimeout_list); - INIT_LIST_HEAD(&ctx->rsrc_ref_list); init_llist_head(&ctx->work_llist); INIT_LIST_HEAD(&ctx->tctx_list); ctx->submit_state.free_list.next = NULL; - INIT_WQ_LIST(&ctx->locked_free_list); INIT_HLIST_HEAD(&ctx->waitid_list); #ifdef CONFIG_FUTEX INIT_HLIST_HEAD(&ctx->futex_list); @@ -349,11 +352,21 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func); INIT_WQ_LIST(&ctx->submit_state.compl_reqs); INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd); + io_napi_init(ctx); + mutex_init(&ctx->mmap_lock); + return ctx; + +free_ref: + percpu_ref_exit(&ctx->refs); err: - kfree(ctx->cancel_table.hbs); - kfree(ctx->cancel_table_locked.hbs); - kfree(ctx->io_bl); + io_alloc_cache_free(&ctx->apoll_cache, kfree); + io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); + io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); + io_alloc_cache_free(&ctx->uring_cache, kfree); + io_alloc_cache_free(&ctx->msg_cache, kfree); + io_futex_cache_free(ctx); + kvfree(ctx->cancel_table.hbs); xa_destroy(&ctx->io_bl_xa); kfree(ctx); return NULL; @@ -382,7 +395,7 @@ static void io_clean_op(struct io_kiocb *req) { if (req->flags & REQ_F_BUFFER_SELECTED) { spin_lock(&req->ctx->completion_lock); - io_put_kbuf_comp(req); + io_kbuf_drop(req); spin_unlock(&req->ctx->completion_lock); } @@ -397,11 +410,8 @@ static void io_clean_op(struct io_kiocb *req) kfree(req->apoll); req->apoll = NULL; } - if (req->flags & REQ_F_INFLIGHT) { - struct io_uring_task *tctx = 
req->task->io_uring; - - atomic_dec(&tctx->inflight_tracked); - } + if (req->flags & REQ_F_INFLIGHT) + atomic_dec(&req->tctx->inflight_tracked); if (req->flags & REQ_F_CREDS) put_cred(req->creds); if (req->flags & REQ_F_ASYNC_DATA) { @@ -415,7 +425,7 @@ static inline void io_req_track_inflight(struct io_kiocb *req) { if (!(req->flags & REQ_F_INFLIGHT)) { req->flags |= REQ_F_INFLIGHT; - atomic_inc(&req->task->io_uring->inflight_tracked); + atomic_inc(&req->tctx->inflight_tracked); } } @@ -462,10 +472,9 @@ static void io_prep_async_work(struct io_kiocb *req) } req->work.list.next = NULL; - req->work.flags = 0; - req->work.cancel_seq = atomic_read(&ctx->cancel_seq); + atomic_set(&req->work.flags, 0); if (req->flags & REQ_F_FORCE_ASYNC) - req->work.flags |= IO_WQ_WORK_CONCURRENT; + atomic_or(IO_WQ_WORK_CONCURRENT, &req->work.flags); if (req->file && !(req->flags & REQ_F_FIXED_FILE)) req->flags |= io_file_get_flags(req->file); @@ -475,13 +484,13 @@ static void io_prep_async_work(struct io_kiocb *req) /* don't serialize this request if the fs doesn't need it */ if (should_hash && (req->file->f_flags & O_DIRECT) && - (req->file->f_mode & FMODE_DIO_PARALLEL_WRITE)) + (req->file->f_op->fop_flags & FOP_DIO_PARALLEL_WRITE)) should_hash = false; if (should_hash || (ctx->flags & IORING_SETUP_IOPOLL)) io_wq_hash_work(&req->work, file_inode(req->file)); } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) { if (def->unbound_nonreg_file) - req->work.flags |= IO_WQ_WORK_UNBOUND; + atomic_or(IO_WQ_WORK_UNBOUND, &req->work.flags); } } @@ -492,23 +501,27 @@ static void io_prep_async_link(struct io_kiocb *req) if (req->flags & REQ_F_LINK_TIMEOUT) { struct io_ring_ctx *ctx = req->ctx; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); io_for_each_link(cur, req) io_prep_async_work(cur); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); } else { io_for_each_link(cur, req) io_prep_async_work(cur); } } -void 
io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use) +static void io_queue_iowq(struct io_kiocb *req) { struct io_kiocb *link = io_prep_linked_timeout(req); - struct io_uring_task *tctx = req->task->io_uring; + struct io_uring_task *tctx = req->tctx; BUG_ON(!tctx); - BUG_ON(!tctx->io_wq); + + if ((current->flags & PF_KTHREAD) || !tctx->io_wq) { + io_req_task_queue_fail(req, -ECANCELED); + return; + } /* init ->work of the whole link before punting */ io_prep_async_link(req); @@ -520,8 +533,8 @@ void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use) * procedure rather than attempt to run this request (or create a new * worker for it). */ - if (WARN_ON_ONCE(!same_thread_group(req->task, current))) - req->work.flags |= IO_WQ_WORK_CANCEL; + if (WARN_ON_ONCE(!same_thread_group(tctx->task, current))) + atomic_or(IO_WQ_WORK_CANCEL, &req->work.flags); trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work)); io_wq_enqueue(tctx->io_wq, &req->work); @@ -529,8 +542,20 @@ void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use) io_queue_linked_timeout(link); } -static __cold void io_queue_deferred(struct io_ring_ctx *ctx) +static void io_req_queue_iowq_tw(struct io_kiocb *req, struct io_tw_state *ts) +{ + io_queue_iowq(req); +} + +void io_req_queue_iowq(struct io_kiocb *req) { + req->io_task_work.func = io_req_queue_iowq_tw; + io_req_task_work_add(req); +} + +static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx) +{ + spin_lock(&ctx->completion_lock); while (!list_empty(&ctx->defer_list)) { struct io_defer_entry *de = list_first_entry(&ctx->defer_list, struct io_defer_entry, list); @@ -541,84 +566,7 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx) io_req_task_queue(de->req); kfree(de); } -} - -void io_eventfd_ops(struct rcu_head *rcu) -{ - struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); - int ops = atomic_xchg(&ev_fd->ops, 0); - - if (ops & 
BIT(IO_EVENTFD_OP_SIGNAL_BIT)) - eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); - - /* IO_EVENTFD_OP_FREE_BIT may not be set here depending on callback - * ordering in a race but if references are 0 we know we have to free - * it regardless. - */ - if (atomic_dec_and_test(&ev_fd->refs)) { - eventfd_ctx_put(ev_fd->cq_ev_fd); - kfree(ev_fd); - } -} - -static void io_eventfd_signal(struct io_ring_ctx *ctx) -{ - struct io_ev_fd *ev_fd = NULL; - - rcu_read_lock(); - /* - * rcu_dereference ctx->io_ev_fd once and use it for both for checking - * and eventfd_signal - */ - ev_fd = rcu_dereference(ctx->io_ev_fd); - - /* - * Check again if ev_fd exists incase an io_eventfd_unregister call - * completed between the NULL check of ctx->io_ev_fd at the start of - * the function and rcu_read_lock. - */ - if (unlikely(!ev_fd)) - goto out; - if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) - goto out; - if (ev_fd->eventfd_async && !io_wq_current_is_worker()) - goto out; - - if (likely(eventfd_signal_allowed())) { - eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); - } else { - atomic_inc(&ev_fd->refs); - if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) - call_rcu_hurry(&ev_fd->rcu, io_eventfd_ops); - else - atomic_dec(&ev_fd->refs); - } - -out: - rcu_read_unlock(); -} - -static void io_eventfd_flush_signal(struct io_ring_ctx *ctx) -{ - bool skip; - - spin_lock(&ctx->completion_lock); - - /* - * Eventfd should only get triggered when at least one event has been - * posted. Some applications rely on the eventfd notification count - * only changing IFF a new CQE has been added to the CQ ring. There's - * no depedency on 1:1 relationship between how many times this - * function is called (and hence the eventfd count) and number of CQEs - * posted to the CQ ring. 
- */ - skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail; - ctx->evfd_last_cq_tail = ctx->cached_cq_tail; spin_unlock(&ctx->completion_lock); - if (skip) - return; - - io_eventfd_signal(ctx); } void __io_commit_cqring_flush(struct io_ring_ctx *ctx) @@ -627,11 +575,8 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx) io_poll_wq_wake(ctx); if (ctx->off_timeout_used) io_flush_timeouts(ctx); - if (ctx->drain_active) { - spin_lock(&ctx->completion_lock); + if (ctx->drain_active) io_queue_deferred(ctx); - spin_unlock(&ctx->completion_lock); - } if (ctx->has_evfd) io_eventfd_flush_signal(ctx); } @@ -670,29 +615,14 @@ static void io_cq_unlock_post(struct io_ring_ctx *ctx) io_commit_cqring_flush(ctx); } -/* Returns true if there are no backlogged entries after the flush */ -static void io_cqring_overflow_kill(struct io_ring_ctx *ctx) -{ - struct io_overflow_cqe *ocqe; - LIST_HEAD(list); - - spin_lock(&ctx->completion_lock); - list_splice_init(&ctx->cq_overflow_list, &list); - clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); - spin_unlock(&ctx->completion_lock); - - while (!list_empty(&list)) { - ocqe = list_first_entry(&list, struct io_overflow_cqe, list); - list_del(&ocqe->list); - kfree(ocqe); - } -} - -static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx) +static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying) { size_t cqe_size = sizeof(struct io_uring_cqe); - if (__io_cqring_events(ctx) == ctx->cq_entries) + lockdep_assert_held(&ctx->uring_lock); + + /* don't abort if we're dying, entries must get freed */ + if (!dying && __io_cqring_events(ctx) == ctx->cq_entries) return; if (ctx->flags & IORING_SETUP_CQE32) @@ -703,13 +633,31 @@ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx) struct io_uring_cqe *cqe; struct io_overflow_cqe *ocqe; - if (!io_get_cqe_overflow(ctx, &cqe, true)) - break; ocqe = list_first_entry(&ctx->cq_overflow_list, struct io_overflow_cqe, list); - memcpy(cqe, &ocqe->cqe, cqe_size); + + if 
(!dying) { + if (!io_get_cqe_overflow(ctx, &cqe, true)) + break; + memcpy(cqe, &ocqe->cqe, cqe_size); + } list_del(&ocqe->list); kfree(ocqe); + + /* + * For silly syzbot cases that deliberately overflow by huge + * amounts, check if we need to resched and drop and + * reacquire the locks if so. Nothing real would ever hit this. + * Ideally we'd have a non-posting unlock for this, but hard + * to care for a non-real case. + */ + if (need_resched()) { + io_cq_unlock_post(ctx); + mutex_unlock(&ctx->uring_lock); + cond_resched(); + mutex_lock(&ctx->uring_lock); + io_cq_lock(ctx); + } } if (list_empty(&ctx->cq_overflow_list)) { @@ -719,46 +667,32 @@ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx) io_cq_unlock_post(ctx); } -static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx) -{ - /* iopoll syncs against uring_lock, not completion_lock */ - if (ctx->flags & IORING_SETUP_IOPOLL) - mutex_lock(&ctx->uring_lock); - __io_cqring_overflow_flush(ctx); - if (ctx->flags & IORING_SETUP_IOPOLL) - mutex_unlock(&ctx->uring_lock); -} - -static void io_cqring_overflow_flush(struct io_ring_ctx *ctx) -{ - if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) - io_cqring_do_overflow_flush(ctx); -} - -/* can be called by any task */ -static void io_put_task_remote(struct task_struct *task) +static void io_cqring_overflow_kill(struct io_ring_ctx *ctx) { - struct io_uring_task *tctx = task->io_uring; - - percpu_counter_sub(&tctx->inflight, 1); - if (unlikely(atomic_read(&tctx->in_cancel))) - wake_up(&tctx->wait); - put_task_struct(task); + if (ctx->rings) + __io_cqring_overflow_flush(ctx, true); } -/* used by a task to put its own references */ -static void io_put_task_local(struct task_struct *task) +static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx) { - task->io_uring->cached_refs++; + mutex_lock(&ctx->uring_lock); + __io_cqring_overflow_flush(ctx, false); + mutex_unlock(&ctx->uring_lock); } /* must to be called somewhat shortly after putting a 
request */ -static inline void io_put_task(struct task_struct *task) +static inline void io_put_task(struct io_kiocb *req) { - if (likely(task == current)) - io_put_task_local(task); - else - io_put_task_remote(task); + struct io_uring_task *tctx = req->tctx; + + if (likely(tctx->task == current)) { + tctx->cached_refs++; + } else { + percpu_counter_sub(&tctx->inflight, 1); + if (unlikely(atomic_read(&tctx->in_cancel))) + wake_up(&tctx->wait); + put_task_struct(tctx->task); + } } void io_task_refs_refill(struct io_uring_task *tctx) @@ -822,7 +756,7 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, return true; } -void io_req_cqe_overflow(struct io_kiocb *req) +static void io_req_cqe_overflow(struct io_kiocb *req) { io_cqring_event_overflow(req->ctx, req->cqe.user_data, req->cqe.res, req->cqe.flags, @@ -880,8 +814,6 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, * the ring. */ if (likely(io_get_cqe(ctx, &cqe))) { - trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0); - WRITE_ONCE(cqe->user_data, user_data); WRITE_ONCE(cqe->res, res); WRITE_ONCE(cqe->flags, cflags); @@ -890,154 +822,101 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, WRITE_ONCE(cqe->big_cqe[0], 0); WRITE_ONCE(cqe->big_cqe[1], 0); } + + trace_io_uring_complete(ctx, NULL, cqe); return true; } return false; } -static void __io_flush_post_cqes(struct io_ring_ctx *ctx) - __must_hold(&ctx->uring_lock) +static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, + u32 cflags) { - struct io_submit_state *state = &ctx->submit_state; - unsigned int i; + bool filled; - lockdep_assert_held(&ctx->uring_lock); - for (i = 0; i < state->cqes_count; i++) { - struct io_uring_cqe *cqe = &ctx->completion_cqes[i]; + filled = io_fill_cqe_aux(ctx, user_data, res, cflags); + if (!filled) + filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); - if (!io_fill_cqe_aux(ctx, 
cqe->user_data, cqe->res, cqe->flags)) { - if (ctx->lockless_cq) { - spin_lock(&ctx->completion_lock); - io_cqring_event_overflow(ctx, cqe->user_data, - cqe->res, cqe->flags, 0, 0); - spin_unlock(&ctx->completion_lock); - } else { - io_cqring_event_overflow(ctx, cqe->user_data, - cqe->res, cqe->flags, 0, 0); - } - } - } - state->cqes_count = 0; + return filled; } -static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, - bool allow_overflow) +bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { bool filled; io_cq_lock(ctx); - filled = io_fill_cqe_aux(ctx, user_data, res, cflags); - if (!filled && allow_overflow) - filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); - + filled = __io_post_aux_cqe(ctx, user_data, res, cflags); io_cq_unlock_post(ctx); return filled; } -bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) +/* + * Must be called from inline task_work so we know a flush will happen later, + * and obviously with ctx->uring_lock held (tw always has that). + */ +void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { - return __io_post_aux_cqe(ctx, user_data, res, cflags, true); + if (!io_fill_cqe_aux(ctx, user_data, res, cflags)) { + spin_lock(&ctx->completion_lock); + io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); + spin_unlock(&ctx->completion_lock); + } + ctx->submit_state.cq_flush = true; } /* * A helper for multishot requests posting additional CQEs. * Should only be used from a task_work including IO_URING_F_MULTISHOT. 
*/ -bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags) +bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags) { struct io_ring_ctx *ctx = req->ctx; - u64 user_data = req->cqe.user_data; - struct io_uring_cqe *cqe; - - if (!defer) - return __io_post_aux_cqe(ctx, user_data, res, cflags, false); + bool posted; + lockdep_assert(!io_wq_current_is_worker()); lockdep_assert_held(&ctx->uring_lock); - if (ctx->submit_state.cqes_count == ARRAY_SIZE(ctx->completion_cqes)) { - __io_cq_lock(ctx); - __io_flush_post_cqes(ctx); - /* no need to flush - flush is deferred */ - __io_cq_unlock_post(ctx); - } - - /* For defered completions this is not as strict as it is otherwise, - * however it's main job is to prevent unbounded posted completions, - * and in that it works just as well. - */ - if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) - return false; - - cqe = &ctx->completion_cqes[ctx->submit_state.cqes_count++]; - cqe->user_data = user_data; - cqe->res = res; - cqe->flags = cflags; - return true; + __io_cq_lock(ctx); + posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags); + ctx->submit_state.cq_flush = true; + __io_cq_unlock_post(ctx); + return posted; } -static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) +static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; - struct io_rsrc_node *rsrc_node = NULL; + + /* + * All execution paths but io-wq use the deferred completions by + * passing IO_URING_F_COMPLETE_DEFER and thus should not end up here. + */ + if (WARN_ON_ONCE(!(issue_flags & IO_URING_F_IOWQ))) + return; + + /* + * Handle special CQ sync cases via task_work. DEFER_TASKRUN requires + * the submitter task context, IOPOLL protects with uring_lock. 
+ */ + if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL)) { + req->io_task_work.func = io_req_task_complete; + io_req_task_work_add(req); + return; + } io_cq_lock(ctx); if (!(req->flags & REQ_F_CQE_SKIP)) { if (!io_fill_cqe_req(ctx, req)) io_req_cqe_overflow(req); } + io_cq_unlock_post(ctx); /* - * If we're the last reference to this request, add to our locked - * free_list cache. + * We don't free the request here because we know it's called from + * io-wq only, which holds a reference, so it cannot be the last put. */ - if (req_ref_put_and_test(req)) { - if (req->flags & IO_REQ_LINK_FLAGS) { - if (req->flags & IO_DISARM_MASK) - io_disarm_next(req); - if (req->link) { - io_req_task_queue(req->link); - req->link = NULL; - } - } - io_put_kbuf_comp(req); - if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS)) - io_clean_op(req); - io_put_file(req); - - rsrc_node = req->rsrc_node; - /* - * Selected buffer deallocation in io_clean_op() assumes that - * we don't hold ->completion_lock. Clean them here to avoid - * deadlocks. 
- */ - io_put_task_remote(req->task); - wq_list_add_head(&req->comp_list, &ctx->locked_free_list); - ctx->locked_free_nr++; - } - io_cq_unlock_post(ctx); - - if (rsrc_node) { - io_ring_submit_lock(ctx, issue_flags); - io_put_rsrc_node(ctx, rsrc_node); - io_ring_submit_unlock(ctx, issue_flags); - } -} - -void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) -{ - if (req->ctx->task_complete && req->ctx->submitter_task != current) { - req->io_task_work.func = io_req_task_complete; - io_req_task_work_add(req); - } else if (!(issue_flags & IO_URING_F_UNLOCKED) || - !(req->ctx->flags & IORING_SETUP_IOPOLL)) { - __io_req_complete_post(req, issue_flags); - } else { - struct io_ring_ctx *ctx = req->ctx; - - mutex_lock(&ctx->uring_lock); - __io_req_complete_post(req, issue_flags & ~IO_URING_F_UNLOCKED); - mutex_unlock(&ctx->uring_lock); - } + req_ref_put(req); } void io_req_defer_failed(struct io_kiocb *req, s32 res) @@ -1048,7 +927,7 @@ void io_req_defer_failed(struct io_kiocb *req, s32 res) lockdep_assert_held(&req->ctx->uring_lock); req_set_fail(req); - io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED)); + io_req_set_res(req, res, io_put_kbuf(req, res, IO_URING_F_UNLOCKED)); if (def->fail) def->fail(req); io_req_complete_defer(req); @@ -1061,6 +940,8 @@ void io_req_defer_failed(struct io_kiocb *req, s32 res) static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) { req->ctx = ctx; + req->buf_node = NULL; + req->file_node = NULL; req->link = NULL; req->async_data = NULL; /* not necessary, but safer to zero */ @@ -1068,15 +949,6 @@ static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) memset(&req->big_cqe, 0, sizeof(req->big_cqe)); } -static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, - struct io_submit_state *state) -{ - spin_lock(&ctx->completion_lock); - wq_list_splice(&ctx->locked_free_list, &state->free_list); - ctx->locked_free_nr = 0; - spin_unlock(&ctx->completion_lock); -} - /* * A 
request might get retired back into the request caches even before opcode * handlers and io_issue_sqe() are done with it, e.g. inline completion path. @@ -1088,18 +960,7 @@ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) { gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; void *reqs[IO_REQ_ALLOC_BATCH]; - int ret, i; - - /* - * If we have more than a batch's worth of requests in our IRQ side - * locked cache, grab the lock and move them over to our submission - * side cache. - */ - if (data_race(ctx->locked_free_nr) > IO_COMPL_BATCH) { - io_flush_cached_locked_reqs(ctx, &ctx->submit_state); - if (!io_req_cache_empty(ctx)) - return true; - } + int ret; ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs); @@ -1115,8 +976,8 @@ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) } percpu_ref_get_many(&ctx->refs, ret); - for (i = 0; i < ret; i++) { - struct io_kiocb *req = reqs[i]; + while (ret--) { + struct io_kiocb *req = reqs[ret]; io_preinit_req(req, ctx); io_req_add_to_cache(req, ctx); @@ -1166,83 +1027,53 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, struct io_tw_state *ts) return; if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); - if (ts->locked) { - io_submit_flush_completions(ctx); - mutex_unlock(&ctx->uring_lock); - ts->locked = false; - } + + io_submit_flush_completions(ctx); + mutex_unlock(&ctx->uring_lock); percpu_ref_put(&ctx->refs); } -static unsigned int handle_tw_list(struct llist_node *node, - struct io_ring_ctx **ctx, - struct io_tw_state *ts, - struct llist_node *last) +/* + * Run queued task_work, returning the number of entries processed in *count. + * If more entries than max_entries are available, stop processing once this + * is reached and return the rest of the list. 
+ */ +struct llist_node *io_handle_tw_list(struct llist_node *node, + unsigned int *count, + unsigned int max_entries) { - unsigned int count = 0; + struct io_ring_ctx *ctx = NULL; + struct io_tw_state ts = { }; - while (node && node != last) { + do { struct llist_node *next = node->next; struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); - prefetch(container_of(next, struct io_kiocb, io_task_work.node)); - - if (req->ctx != *ctx) { - ctx_flush_and_put(*ctx, ts); - *ctx = req->ctx; - /* if not contended, grab and improve batching */ - ts->locked = mutex_trylock(&(*ctx)->uring_lock); - percpu_ref_get(&(*ctx)->refs); + if (req->ctx != ctx) { + ctx_flush_and_put(ctx, &ts); + ctx = req->ctx; + mutex_lock(&ctx->uring_lock); + percpu_ref_get(&ctx->refs); } INDIRECT_CALL_2(req->io_task_work.func, io_poll_task_func, io_req_rw_complete, - req, ts); + req, &ts); node = next; - count++; + (*count)++; if (unlikely(need_resched())) { - ctx_flush_and_put(*ctx, ts); - *ctx = NULL; + ctx_flush_and_put(ctx, &ts); + ctx = NULL; cond_resched(); } - } - - return count; -} + } while (node && *count < max_entries); -/** - * io_llist_xchg - swap all entries in a lock-less list - * @head: the head of lock-less list to delete all entries - * @new: new entry as the head of the list - * - * If list is empty, return NULL, otherwise, return the pointer to the first entry. - * The order of entries returned is from the newest to the oldest added one. - */ -static inline struct llist_node *io_llist_xchg(struct llist_head *head, - struct llist_node *new) -{ - return xchg(&head->first, new); -} - -/** - * io_llist_cmpxchg - possibly swap all entries in a lock-less list - * @head: the head of lock-less list to delete all entries - * @old: expected old value of the first entry of the list - * @new: new entry as the head of the list - * - * perform a cmpxchg on the first entry of the list. 
- */ - -static inline struct llist_node *io_llist_cmpxchg(struct llist_head *head, - struct llist_node *old, - struct llist_node *new) -{ - return cmpxchg(&head->first, old, new); + ctx_flush_and_put(ctx, &ts); + return node; } -static __cold void io_fallback_tw(struct io_uring_task *tctx, bool sync) +static __cold void __io_fallback_tw(struct llist_node *node, bool sync) { - struct llist_node *node = llist_del_all(&tctx->task_list); struct io_ring_ctx *last_ctx = NULL; struct io_kiocb *req; @@ -1268,50 +1099,54 @@ static __cold void io_fallback_tw(struct io_uring_task *tctx, bool sync) } } -void tctx_task_work(struct callback_head *cb) +static void io_fallback_tw(struct io_uring_task *tctx, bool sync) +{ + struct llist_node *node = llist_del_all(&tctx->task_list); + + __io_fallback_tw(node, sync); +} + +struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, + unsigned int max_entries, + unsigned int *count) { - struct io_tw_state ts = {}; - struct io_ring_ctx *ctx = NULL; - struct io_uring_task *tctx = container_of(cb, struct io_uring_task, - task_work); - struct llist_node fake = {}; struct llist_node *node; - unsigned int loops = 0; - unsigned int count = 0; if (unlikely(current->flags & PF_EXITING)) { io_fallback_tw(tctx, true); - return; + return NULL; } - do { - loops++; - node = io_llist_xchg(&tctx->task_list, &fake); - count += handle_tw_list(node, &ctx, &ts, &fake); - - /* skip expensive cmpxchg if there are items in the list */ - if (READ_ONCE(tctx->task_list.first) != &fake) - continue; - if (ts.locked && !wq_list_empty(&ctx->submit_state.compl_reqs)) { - io_submit_flush_completions(ctx); - if (READ_ONCE(tctx->task_list.first) != &fake) - continue; - } - node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL); - } while (node != &fake); - - ctx_flush_and_put(ctx, &ts); + node = llist_del_all(&tctx->task_list); + if (node) { + node = llist_reverse_order(node); + node = io_handle_tw_list(node, count, max_entries); + } /* relaxed read is enough as 
only the task itself sets ->in_cancel */ if (unlikely(atomic_read(&tctx->in_cancel))) io_uring_drop_tctx_refs(current); - trace_io_uring_task_work_run(tctx, count, loops); + trace_io_uring_task_work_run(tctx, *count); + return node; } -static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) +void tctx_task_work(struct callback_head *cb) +{ + struct io_uring_task *tctx; + struct llist_node *ret; + unsigned int count = 0; + + tctx = container_of(cb, struct io_uring_task, task_work); + ret = tctx_task_work_run(tctx, UINT_MAX, &count); + /* can't happen */ + WARN_ON_ONCE(ret); +} + +static inline void io_req_local_work_add(struct io_kiocb *req, + struct io_ring_ctx *ctx, + unsigned flags) { - struct io_ring_ctx *ctx = req->ctx; unsigned nr_wait, nr_tw, nr_tw_prev; struct llist_node *head; @@ -1325,6 +1160,8 @@ static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) flags &= ~IOU_F_TWQ_LAZY_WAKE; + guard(rcu)(); + head = READ_ONCE(ctx->work_llist.first); do { nr_tw_prev = 0; @@ -1379,7 +1216,7 @@ static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) static void io_req_normal_work_add(struct io_kiocb *req) { - struct io_uring_task *tctx = req->task->io_uring; + struct io_uring_task *tctx = req->tctx; struct io_ring_ctx *ctx = req->ctx; /* task_work already pending, we're done */ @@ -1389,7 +1226,13 @@ static void io_req_normal_work_add(struct io_kiocb *req) if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); - if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method))) + /* SQPOLL doesn't need the task_work added, it'll run it itself */ + if (ctx->flags & IORING_SETUP_SQPOLL) { + __set_notify_signal(tctx->task); + return; + } + + if (likely(!task_work_add(tctx->task, &tctx->task_work, ctx->notify_method))) return; io_fallback_tw(tctx, false); @@ -1397,30 +1240,64 @@ static void 
io_req_normal_work_add(struct io_kiocb *req) void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) { - if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN) { - rcu_read_lock(); - io_req_local_work_add(req, flags); - rcu_read_unlock(); - } else { + if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN) + io_req_local_work_add(req, req->ctx, flags); + else io_req_normal_work_add(req); - } +} + +void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx, + unsigned flags) +{ + if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))) + return; + io_req_local_work_add(req, ctx, flags); } static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) { - struct llist_node *node; + struct llist_node *node = llist_del_all(&ctx->work_llist); - node = llist_del_all(&ctx->work_llist); - while (node) { - struct io_kiocb *req = container_of(node, struct io_kiocb, - io_task_work.node); + __io_fallback_tw(node, false); + node = llist_del_all(&ctx->retry_llist); + __io_fallback_tw(node, false); +} - node = node->next; - io_req_normal_work_add(req); +static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events, + int min_events) +{ + if (!io_local_work_pending(ctx)) + return false; + if (events < min_events) + return true; + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); + return false; +} + +static int __io_run_local_work_loop(struct llist_node **node, + struct io_tw_state *ts, + int events) +{ + int ret = 0; + + while (*node) { + struct llist_node *next = (*node)->next; + struct io_kiocb *req = container_of(*node, struct io_kiocb, + io_task_work.node); + INDIRECT_CALL_2(req->io_task_work.func, + io_poll_task_func, io_req_rw_complete, + req, ts); + *node = next; + if (++ret >= events) + break; } + + return ret; } -static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts) +static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts, + int 
min_events, int max_events) { struct llist_node *node; unsigned int loops = 0; @@ -1431,60 +1308,51 @@ static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts) if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); again: + min_events -= ret; + ret = __io_run_local_work_loop(&ctx->retry_llist.first, ts, max_events); + if (ctx->retry_llist.first) + goto retry_done; + /* * llists are in reverse order, flip it back the right way before * running the pending items. */ - node = llist_reverse_order(io_llist_xchg(&ctx->work_llist, NULL)); - while (node) { - struct llist_node *next = node->next; - struct io_kiocb *req = container_of(node, struct io_kiocb, - io_task_work.node); - prefetch(container_of(next, struct io_kiocb, io_task_work.node)); - INDIRECT_CALL_2(req->io_task_work.func, - io_poll_task_func, io_req_rw_complete, - req, ts); - ret++; - node = next; - } + node = llist_reverse_order(llist_del_all(&ctx->work_llist)); + ret += __io_run_local_work_loop(&node, ts, max_events - ret); + ctx->retry_llist.first = node; loops++; - if (!llist_empty(&ctx->work_llist)) + if (io_run_local_work_continue(ctx, ret, min_events)) goto again; - if (ts->locked) { - io_submit_flush_completions(ctx); - if (!llist_empty(&ctx->work_llist)) - goto again; - } +retry_done: + io_submit_flush_completions(ctx); + if (io_run_local_work_continue(ctx, ret, min_events)) + goto again; + trace_io_uring_local_work_run(ctx, ret, loops); return ret; } -static inline int io_run_local_work_locked(struct io_ring_ctx *ctx) +static inline int io_run_local_work_locked(struct io_ring_ctx *ctx, + int min_events) { - struct io_tw_state ts = { .locked = true, }; - int ret; + struct io_tw_state ts = {}; - if (llist_empty(&ctx->work_llist)) + if (!io_local_work_pending(ctx)) return 0; - - ret = __io_run_local_work(ctx, &ts); - /* shouldn't happen! 
*/ - if (WARN_ON_ONCE(!ts.locked)) - mutex_lock(&ctx->uring_lock); - return ret; + return __io_run_local_work(ctx, &ts, min_events, + max(IO_LOCAL_TW_DEFAULT_MAX, min_events)); } -static int io_run_local_work(struct io_ring_ctx *ctx) +static int io_run_local_work(struct io_ring_ctx *ctx, int min_events, + int max_events) { struct io_tw_state ts = {}; int ret; - ts.locked = mutex_trylock(&ctx->uring_lock); - ret = __io_run_local_work(ctx, &ts); - if (ts.locked) - mutex_unlock(&ctx->uring_lock); - + mutex_lock(&ctx->uring_lock); + ret = __io_run_local_work(ctx, &ts, min_events, max_events); + mutex_unlock(&ctx->uring_lock); return ret; } @@ -1497,11 +1365,10 @@ static void io_req_task_cancel(struct io_kiocb *req, struct io_tw_state *ts) void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts) { io_tw_lock(req->ctx, ts); - /* req->task == current here, checking PF_EXITING is safe */ - if (unlikely(req->task->flags & PF_EXITING)) + if (unlikely(io_should_terminate_tw())) io_req_defer_failed(req, -EFAULT); else if (req->flags & REQ_F_FORCE_ASYNC) - io_queue_iowq(req, ts); + io_queue_iowq(req); else io_queue_sqe(req); } @@ -1536,6 +1403,12 @@ static void io_free_batch_list(struct io_ring_ctx *ctx, comp_list); if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) { + if (req->flags & REQ_F_REISSUE) { + node = req->comp_list.next; + req->flags &= ~REQ_F_REISSUE; + io_queue_iowq(req); + continue; + } if (req->flags & REQ_F_REFCOUNT) { node = req->comp_list.next; if (!req_ref_put_and_test(req)) @@ -1546,7 +1419,7 @@ static void io_free_batch_list(struct io_ring_ctx *ctx, if (apoll->double_poll) kfree(apoll->double_poll); - if (!io_alloc_cache_put(&ctx->apoll_cache, &apoll->cache)) + if (!io_alloc_cache_put(&ctx->apoll_cache, apoll)) kfree(apoll); req->flags &= ~REQ_F_POLLED; } @@ -1556,10 +1429,9 @@ static void io_free_batch_list(struct io_ring_ctx *ctx, io_clean_op(req); } io_put_file(req); + io_req_put_rsrc_nodes(req); + io_put_task(req); - 
io_req_put_rsrc_locked(req, ctx); - - io_put_task(req->task); node = req->comp_list.next; io_req_add_to_cache(req, ctx); } while (node); @@ -1572,14 +1444,16 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) struct io_wq_work_node *node; __io_cq_lock(ctx); - /* must come first to preserve CQE ordering in failure cases */ - if (state->cqes_count) - __io_flush_post_cqes(ctx); __wq_list_for_each(node, &state->compl_reqs) { struct io_kiocb *req = container_of(node, struct io_kiocb, comp_list); - if (!(req->flags & REQ_F_CQE_SKIP) && + /* + * Requests marked with REISSUE should not post a CQE, they + * will go through the io-wq retry machinery and post one + * later. + */ + if (!(req->flags & (REQ_F_CQE_SKIP | REQ_F_REISSUE)) && unlikely(!io_fill_cqe_req(ctx, req))) { if (ctx->lockless_cq) { spin_lock(&ctx->completion_lock); @@ -1592,10 +1466,11 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) } __io_cq_unlock_post(ctx); - if (!wq_list_empty(&ctx->submit_state.compl_reqs)) { + if (!wq_list_empty(&state->compl_reqs)) { io_free_batch_list(ctx, state->compl_reqs.first); INIT_WQ_LIST(&state->compl_reqs); } + ctx->submit_state.cq_flush = false; } static unsigned io_cqring_events(struct io_ring_ctx *ctx) @@ -1638,13 +1513,15 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) unsigned int nr_events = 0; unsigned long check_cq; + lockdep_assert_held(&ctx->uring_lock); + if (!io_allowed_run_tw(ctx)) return -EEXIST; check_cq = READ_ONCE(ctx->check_cq); if (unlikely(check_cq)) { if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) - __io_cqring_overflow_flush(ctx); + __io_cqring_overflow_flush(ctx, false); /* * Similarly do not spin if we have not informed the user of any * dropped CQE. 
@@ -1677,7 +1554,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) io_task_work_pending(ctx)) { u32 tail = ctx->cached_cq_tail; - (void) io_run_local_work_locked(ctx); + (void) io_run_local_work_locked(ctx, min); if (task_work_pending(current) || wq_list_empty(&ctx->iopoll_list)) { @@ -1707,10 +1584,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts) { - if (ts->locked) - io_req_complete_defer(req); - else - io_req_complete_post(req, IO_URING_F_UNLOCKED); + io_req_complete_defer(req); } /* @@ -1768,9 +1642,9 @@ static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags) } } -unsigned int io_file_get_flags(struct file *file) +io_req_flags_t io_file_get_flags(struct file *file) { - unsigned int res = 0; + io_req_flags_t res = 0; if (S_ISREG(file_inode(file)->i_mode)) res |= REQ_F_ISREG; @@ -1779,36 +1653,6 @@ unsigned int io_file_get_flags(struct file *file) return res; } -bool io_alloc_async_data(struct io_kiocb *req) -{ - WARN_ON_ONCE(!io_cold_defs[req->opcode].async_size); - req->async_data = kmalloc(io_cold_defs[req->opcode].async_size, GFP_KERNEL); - if (req->async_data) { - req->flags |= REQ_F_ASYNC_DATA; - return false; - } - return true; -} - -int io_req_prep_async(struct io_kiocb *req) -{ - const struct io_cold_def *cdef = &io_cold_defs[req->opcode]; - const struct io_issue_def *def = &io_issue_defs[req->opcode]; - - /* assign early for deferred execution for non-fixed file */ - if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE) && !req->file) - req->file = io_file_get_normal(req, req->cqe.fd); - if (!cdef->prep_async) - return 0; - if (WARN_ON_ONCE(req_has_async_data(req))) - return -EFAULT; - if (!def->manual_alloc) { - if (io_alloc_async_data(req)) - return -EAGAIN; - } - return cdef->prep_async(req); -} - static u32 io_get_sequence(struct io_kiocb *req) { u32 seq = req->ctx->cached_sq_head; @@ -1955,21 +1799,44 @@ void 
io_wq_submit_work(struct io_wq_work *work) io_arm_ltimeout(req); /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ - if (work->flags & IO_WQ_WORK_CANCEL) { + if (atomic_read(&work->flags) & IO_WQ_WORK_CANCEL) { fail: io_req_task_queue_fail(req, err); return; } if (!io_assign_file(req, def, issue_flags)) { err = -EBADF; - work->flags |= IO_WQ_WORK_CANCEL; + atomic_or(IO_WQ_WORK_CANCEL, &work->flags); goto fail; } + /* + * If DEFER_TASKRUN is set, it's only allowed to post CQEs from the + * submitter task context. Final request completions are handed to the + * right context, however this is not the case of auxiliary CQEs, + * which is the main means of operation for multishot requests. + * Don't allow any multishot execution from io-wq. It's more restrictive + * than necessary and also cleaner. + */ + if (req->flags & REQ_F_APOLL_MULTISHOT) { + err = -EBADFD; + if (!io_file_can_poll(req)) + goto fail; + if (req->file->f_flags & O_NONBLOCK || + req->file->f_mode & FMODE_NOWAIT) { + err = -ECANCELED; + if (io_arm_poll_handler(req, issue_flags) != IO_APOLL_OK) + goto fail; + return; + } else { + req->flags &= ~REQ_F_APOLL_MULTISHOT; + } + } + if (req->flags & REQ_F_FORCE_ASYNC) { bool opcode_poll = def->pollin || def->pollout; - if (opcode_poll && file_can_poll(req->file)) { + if (opcode_poll && io_file_can_poll(req)) { needs_poll = true; issue_flags |= IO_URING_F_NONBLOCK; } @@ -2009,7 +1876,7 @@ fail: } while (1); /* avoid locking problems by failing it from a clean context */ - if (ret < 0) + if (ret) io_req_task_queue_fail(req, ret); } @@ -2017,20 +1884,16 @@ inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; - struct io_fixed_file *slot; + struct io_rsrc_node *node; struct file *file = NULL; io_ring_submit_lock(ctx, issue_flags); - - if (unlikely((unsigned int)fd >= ctx->nr_user_files)) - goto out; - fd = array_index_nospec(fd, ctx->nr_user_files); - slot = 
io_fixed_file_slot(&ctx->file_table, fd); - if (!req->rsrc_node) - __io_req_set_rsrc_node(req, ctx); - req->flags |= io_slot_flags(slot); - file = io_slot_file(slot); -out: + node = io_rsrc_node_lookup(&ctx->file_table.data, fd); + if (node) { + io_req_assign_rsrc_node(&req->file_node, node); + req->flags |= io_slot_flags(node); + file = io_slot_file(node); + } io_ring_submit_unlock(ctx, issue_flags); return file; } @@ -2066,7 +1929,7 @@ static void io_queue_async(struct io_kiocb *req, int ret) break; case IO_APOLL_ABORTED: io_kbuf_recycle(req, 0); - io_queue_iowq(req, NULL); + io_queue_iowq(req); break; case IO_APOLL_OK: break; @@ -2103,17 +1966,10 @@ static void io_queue_sqe_fallback(struct io_kiocb *req) req->flags |= REQ_F_LINK; io_req_defer_failed(req, req->cqe.res); } else { - int ret = io_req_prep_async(req); - - if (unlikely(ret)) { - io_req_defer_failed(req, ret); - return; - } - if (unlikely(req->ctx->drain_active)) io_drain_req(req); else - io_queue_iowq(req, NULL); + io_queue_iowq(req); } } @@ -2159,6 +2015,13 @@ static void io_init_req_drain(struct io_kiocb *req) } } +static __cold int io_init_fail_req(struct io_kiocb *req, int err) +{ + /* ensure per-opcode data is cleared if we fail before prep */ + memset(&req->cmd.data, 0, sizeof(req->cmd.data)); + return err; +} + static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_uring_sqe *sqe) __must_hold(&ctx->uring_lock) @@ -2171,37 +2034,40 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, /* req is partially pre-initialised, see io_preinit_req() */ req->opcode = opcode = READ_ONCE(sqe->opcode); /* same numerical values with corresponding REQ_F_*, safe to copy */ - req->flags = sqe_flags = READ_ONCE(sqe->flags); + sqe_flags = READ_ONCE(sqe->flags); + req->flags = (__force io_req_flags_t) sqe_flags; req->cqe.user_data = READ_ONCE(sqe->user_data); req->file = NULL; - req->rsrc_node = NULL; - req->task = current; + req->tctx = current->io_uring; + 
req->cancel_seq_set = false; if (unlikely(opcode >= IORING_OP_LAST)) { req->opcode = 0; - return -EINVAL; + return io_init_fail_req(req, -EINVAL); } + opcode = array_index_nospec(opcode, IORING_OP_LAST); + def = &io_issue_defs[opcode]; if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) { /* enforce forwards compatibility on users */ if (sqe_flags & ~SQE_VALID_FLAGS) - return -EINVAL; + return io_init_fail_req(req, -EINVAL); if (sqe_flags & IOSQE_BUFFER_SELECT) { if (!def->buffer_select) - return -EOPNOTSUPP; + return io_init_fail_req(req, -EOPNOTSUPP); req->buf_index = READ_ONCE(sqe->buf_group); } if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS) ctx->drain_disabled = true; if (sqe_flags & IOSQE_IO_DRAIN) { if (ctx->drain_disabled) - return -EOPNOTSUPP; + return io_init_fail_req(req, -EOPNOTSUPP); io_init_req_drain(req); } } if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) { if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags)) - return -EACCES; + return io_init_fail_req(req, -EACCES); /* knock it to the slow queue path, will be drained there */ if (ctx->drain_active) req->flags |= REQ_F_FORCE_ASYNC; @@ -2214,9 +2080,9 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, } if (!def->ioprio && sqe->ioprio) - return -EINVAL; + return io_init_fail_req(req, -EINVAL); if (!def->iopoll && (ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; + return io_init_fail_req(req, -EINVAL); if (def->needs_file) { struct io_submit_state *state = &ctx->submit_state; @@ -2240,12 +2106,12 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->creds = xa_load(&ctx->personalities, personality); if (!req->creds) - return -EINVAL; + return io_init_fail_req(req, -EINVAL); get_cred(req->creds); ret = security_uring_override_creds(req->creds); if (ret) { put_cred(req->creds); - return ret; + return io_init_fail_req(req, ret); } req->flags |= REQ_F_CREDS; } @@ -2311,11 +2177,7 @@ static inline int io_submit_sqe(struct io_ring_ctx 
*ctx, struct io_kiocb *req, * conditions are true (normal request), then just queue it. */ if (unlikely(link->head)) { - ret = io_req_prep_async(req); - if (unlikely(ret)) - return io_submit_fail_init(sqe, req, ret); - - trace_io_uring_link(req, link->head); + trace_io_uring_link(req, link->last); link->last->link = req; link->last = req; @@ -2396,7 +2258,8 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe) unsigned mask = ctx->sq_entries - 1; unsigned head = ctx->cached_sq_head++ & mask; - if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) { + if (static_branch_unlikely(&io_key_has_sqarray) && + (!(ctx->flags & IORING_SETUP_NO_SQARRAY))) { head = READ_ONCE(ctx->sq_array[head]); if (unlikely(head >= ctx->sq_entries)) { /* drop invalid entries */ @@ -2407,6 +2270,7 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe) READ_ONCE(ctx->rings->sq_dropped) + 1); return false; } + head = array_index_nospec(head, ctx->sq_entries); } /* @@ -2475,33 +2339,6 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) return ret; } -struct io_wait_queue { - struct wait_queue_entry wq; - struct io_ring_ctx *ctx; - unsigned cq_tail; - unsigned nr_timeouts; - ktime_t timeout; -}; - -static inline bool io_has_work(struct io_ring_ctx *ctx) -{ - return test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq) || - !llist_empty(&ctx->work_llist); -} - -static inline bool io_should_wake(struct io_wait_queue *iowq) -{ - struct io_ring_ctx *ctx = iowq->ctx; - int dist = READ_ONCE(ctx->rings->cq.tail) - (int) iowq->cq_tail; - - /* - * Wake up if we have enough events, or if a timeout occurred since we - * started waiting. For timeouts, we always want to return to userspace, - * regardless of event count. 
- */ - return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts; -} - static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, int wake_flags, void *key) { @@ -2518,9 +2355,9 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, int io_run_task_work_sig(struct io_ring_ctx *ctx) { - if (!llist_empty(&ctx->work_llist)) { + if (io_local_work_pending(ctx)) { __set_current_state(TASK_RUNNING); - if (io_run_local_work(ctx) > 0) + if (io_run_local_work(ctx, INT_MAX, IO_LOCAL_TW_DEFAULT_MAX) > 0) return 0; } if (io_run_task_work() > 0) @@ -2539,98 +2376,206 @@ static bool current_pending_io(void) return percpu_counter_read_positive(&tctx->inflight); } -/* when returns >0, the caller should retry */ -static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, - struct io_wait_queue *iowq) +static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer) { - int io_wait, ret; + struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t); - if (unlikely(READ_ONCE(ctx->check_cq))) - return 1; - if (unlikely(!llist_empty(&ctx->work_llist))) - return 1; - if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL))) - return 1; - if (unlikely(task_sigpending(current))) - return -EINTR; - if (unlikely(io_should_wake(iowq))) - return 0; + WRITE_ONCE(iowq->hit_timeout, 1); + iowq->min_timeout = 0; + wake_up_process(iowq->wq.private); + return HRTIMER_NORESTART; +} + +/* + * Doing min_timeout portion. If we saw any timeouts, events, or have work, + * wake up. If not, and we have a normal timeout, switch to that and keep + * sleeping. 
+ */ +static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer) +{ + struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t); + struct io_ring_ctx *ctx = iowq->ctx; + + /* no general timeout, or shorter (or equal), we are done */ + if (iowq->timeout == KTIME_MAX || + ktime_compare(iowq->min_timeout, iowq->timeout) >= 0) + goto out_wake; + /* work we may need to run, wake function will see if we need to wake */ + if (io_has_work(ctx)) + goto out_wake; + /* got events since we started waiting, min timeout is done */ + if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail)) + goto out_wake; + /* if we have any events and min timeout expired, we're done */ + if (io_cqring_events(ctx)) + goto out_wake; + + /* + * If using deferred task_work running and application is waiting on + * more than one request, ensure we reset it now where we are switching + * to normal sleeps. Any request completion post min_wait should wake + * the task and return. + */ + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { + atomic_set(&ctx->cq_wait_nr, 1); + smp_mb(); + if (!llist_empty(&ctx->work_llist)) + goto out_wake; + } + + iowq->t.function = io_cqring_timer_wakeup; + hrtimer_set_expires(timer, iowq->timeout); + return HRTIMER_RESTART; +out_wake: + return io_cqring_timer_wakeup(timer); +} + +static int io_cqring_schedule_timeout(struct io_wait_queue *iowq, + clockid_t clock_id, ktime_t start_time) +{ + ktime_t timeout; + + if (iowq->min_timeout) { + timeout = ktime_add_ns(iowq->min_timeout, start_time); + hrtimer_setup_on_stack(&iowq->t, io_cqring_min_timer_wakeup, clock_id, + HRTIMER_MODE_ABS); + } else { + timeout = iowq->timeout; + hrtimer_setup_on_stack(&iowq->t, io_cqring_timer_wakeup, clock_id, + HRTIMER_MODE_ABS); + } + + hrtimer_set_expires_range_ns(&iowq->t, timeout, 0); + hrtimer_start_expires(&iowq->t, HRTIMER_MODE_ABS); + + if (!READ_ONCE(iowq->hit_timeout)) + schedule(); + + hrtimer_cancel(&iowq->t); + destroy_hrtimer_on_stack(&iowq->t); 
+ __set_current_state(TASK_RUNNING); + + return READ_ONCE(iowq->hit_timeout) ? -ETIME : 0; +} + +static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx, + struct io_wait_queue *iowq, + ktime_t start_time) +{ + int ret = 0; /* * Mark us as being in io_wait if we have pending requests, so cpufreq * can take into account that the task is waiting for IO - turns out * to be important for low QD IO. */ - io_wait = current->in_iowait; if (current_pending_io()) current->in_iowait = 1; - ret = 0; - if (iowq->timeout == KTIME_MAX) + if (iowq->timeout != KTIME_MAX || iowq->min_timeout) + ret = io_cqring_schedule_timeout(iowq, ctx->clockid, start_time); + else schedule(); - else if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS)) - ret = -ETIME; - current->in_iowait = io_wait; + current->in_iowait = 0; return ret; } +/* If this returns > 0, the caller should retry */ +static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, + struct io_wait_queue *iowq, + ktime_t start_time) +{ + if (unlikely(READ_ONCE(ctx->check_cq))) + return 1; + if (unlikely(io_local_work_pending(ctx))) + return 1; + if (unlikely(task_work_pending(current))) + return 1; + if (unlikely(task_sigpending(current))) + return -EINTR; + if (unlikely(io_should_wake(iowq))) + return 0; + + return __io_cqring_wait_schedule(ctx, iowq, start_time); +} + +struct ext_arg { + size_t argsz; + struct timespec64 ts; + const sigset_t __user *sig; + ktime_t min_time; + bool ts_set; +}; + /* * Wait until events become available, if we don't already have some. The * application must reap them itself, as they reside on the shared cq ring. 
*/ -static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, - const sigset_t __user *sig, size_t sigsz, - struct __kernel_timespec __user *uts) +static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, + struct ext_arg *ext_arg) { struct io_wait_queue iowq; struct io_rings *rings = ctx->rings; + ktime_t start_time; int ret; if (!io_allowed_run_tw(ctx)) return -EEXIST; - if (!llist_empty(&ctx->work_llist)) - io_run_local_work(ctx); + if (io_local_work_pending(ctx)) + io_run_local_work(ctx, min_events, + max(IO_LOCAL_TW_DEFAULT_MAX, min_events)); io_run_task_work(); - io_cqring_overflow_flush(ctx); - /* if user messes with these they will just get an early return */ + + if (unlikely(test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))) + io_cqring_do_overflow_flush(ctx); if (__io_cqring_events_user(ctx) >= min_events) return 0; - if (sig) { -#ifdef CONFIG_COMPAT - if (in_compat_syscall()) - ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig, - sigsz); - else -#endif - ret = set_user_sigmask(sig, sigsz); - - if (ret) - return ret; - } - init_waitqueue_func_entry(&iowq.wq, io_wake_function); iowq.wq.private = current; INIT_LIST_HEAD(&iowq.wq.entry); iowq.ctx = ctx; - iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events; + iowq.cq_min_tail = READ_ONCE(ctx->rings->cq.tail); + iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); + iowq.hit_timeout = 0; + iowq.min_timeout = ext_arg->min_time; iowq.timeout = KTIME_MAX; + start_time = io_get_time(ctx); - if (uts) { - struct timespec64 ts; + if (ext_arg->ts_set) { + iowq.timeout = timespec64_to_ktime(ext_arg->ts); + if (!(flags & IORING_ENTER_ABS_TIMER)) + iowq.timeout = ktime_add(iowq.timeout, start_time); + } - if (get_timespec64(&ts, uts)) - return -EFAULT; - iowq.timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns()); + if (ext_arg->sig) { +#ifdef CONFIG_COMPAT + if (in_compat_syscall()) + ret = 
set_compat_user_sigmask((const compat_sigset_t __user *)ext_arg->sig, + ext_arg->argsz); + else +#endif + ret = set_user_sigmask(ext_arg->sig, ext_arg->argsz); + + if (ret) + return ret; } + io_napi_busy_loop(ctx, &iowq); + trace_io_uring_cqring_wait(ctx, min_events); do { unsigned long check_cq; + int nr_wait; - if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { - int nr_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail); + /* if min timeout has been hit, don't reset wait count */ + if (!iowq.hit_timeout) + nr_wait = (int) iowq.cq_tail - + READ_ONCE(ctx->rings->cq.tail); + else + nr_wait = 1; + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { atomic_set(&ctx->cq_wait_nr, nr_wait); set_current_state(TASK_INTERRUPTIBLE); } else { @@ -2638,7 +2583,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, TASK_INTERRUPTIBLE); } - ret = io_cqring_wait_schedule(ctx, &iowq); + ret = io_cqring_wait_schedule(ctx, &iowq, start_time); __set_current_state(TASK_RUNNING); atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT); @@ -2647,9 +2592,9 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, * If we got woken because of task_work being processed, run it * now rather than let the caller do another wait loop. */ + if (io_local_work_pending(ctx)) + io_run_local_work(ctx, nr_wait, nr_wait); io_run_task_work(); - if (!llist_empty(&ctx->work_llist)) - io_run_local_work(ctx); /* * Non-local task_work will be run on exit to userspace, but @@ -2688,131 +2633,16 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? 
ret : 0; } -void io_mem_free(void *ptr) -{ - if (!ptr) - return; - - folio_put(virt_to_folio(ptr)); -} - -static void io_pages_free(struct page ***pages, int npages) -{ - struct page **page_array; - int i; - - if (!pages) - return; - - page_array = *pages; - if (!page_array) - return; - - for (i = 0; i < npages; i++) - unpin_user_page(page_array[i]); - kvfree(page_array); - *pages = NULL; -} - -static void *__io_uaddr_map(struct page ***pages, unsigned short *npages, - unsigned long uaddr, size_t size) -{ - struct page **page_array; - unsigned int nr_pages; - void *page_addr; - int ret, i; - - *npages = 0; - - if (uaddr & (PAGE_SIZE - 1) || !size) - return ERR_PTR(-EINVAL); - - nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (nr_pages > USHRT_MAX) - return ERR_PTR(-EINVAL); - page_array = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); - if (!page_array) - return ERR_PTR(-ENOMEM); - - ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM, - page_array); - if (ret != nr_pages) { -err: - io_pages_free(&page_array, ret > 0 ? ret : 0); - return ret < 0 ? ERR_PTR(ret) : ERR_PTR(-EFAULT); - } - - page_addr = page_address(page_array[0]); - for (i = 0; i < nr_pages; i++) { - ret = -EINVAL; - - /* - * Can't support mapping user allocated ring memory on 32-bit - * archs where it could potentially reside in highmem. Just - * fail those with -EINVAL, just like we did on kernels that - * didn't support this feature. - */ - if (PageHighMem(page_array[i])) - goto err; - - /* - * No support for discontig pages for now, should either be a - * single normal page, or a huge page. Later on we can add - * support for remapping discontig pages, for now we will - * just fail them with EINVAL. 
- */ - if (page_address(page_array[i]) != page_addr) - goto err; - page_addr += PAGE_SIZE; - } - - *pages = page_array; - *npages = nr_pages; - return page_to_virt(page_array[0]); -} - -static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr, - size_t size) -{ - return __io_uaddr_map(&ctx->ring_pages, &ctx->n_ring_pages, uaddr, - size); -} - -static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr, - size_t size) -{ - return __io_uaddr_map(&ctx->sqe_pages, &ctx->n_sqe_pages, uaddr, - size); -} - static void io_rings_free(struct io_ring_ctx *ctx) { - if (!(ctx->flags & IORING_SETUP_NO_MMAP)) { - io_mem_free(ctx->rings); - io_mem_free(ctx->sq_sqes); - ctx->rings = NULL; - ctx->sq_sqes = NULL; - } else { - io_pages_free(&ctx->ring_pages, ctx->n_ring_pages); - ctx->n_ring_pages = 0; - io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages); - ctx->n_sqe_pages = 0; - } -} - -void *io_mem_alloc(size_t size) -{ - gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; - void *ret; - - ret = (void *) __get_free_pages(gfp, get_order(size)); - if (ret) - return ret; - return ERR_PTR(-ENOMEM); + io_free_region(ctx, &ctx->sq_region); + io_free_region(ctx, &ctx->ring_region); + ctx->rings = NULL; + ctx->sq_sqes = NULL; } -static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries, - unsigned int cq_entries, size_t *sq_offset) +unsigned long rings_size(unsigned int flags, unsigned int sq_entries, + unsigned int cq_entries, size_t *sq_offset) { struct io_rings *rings; size_t off, sq_array_size; @@ -2820,7 +2650,7 @@ static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries off = struct_size(rings, cqes, cq_entries); if (off == SIZE_MAX) return SIZE_MAX; - if (ctx->flags & IORING_SETUP_CQE32) { + if (flags & IORING_SETUP_CQE32) { if (check_shl_overflow(off, 1, &off)) return SIZE_MAX; } @@ -2831,14 +2661,12 @@ static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries return 
SIZE_MAX; #endif - if (ctx->flags & IORING_SETUP_NO_SQARRAY) { - if (sq_offset) - *sq_offset = SIZE_MAX; + if (flags & IORING_SETUP_NO_SQARRAY) { + *sq_offset = SIZE_MAX; return off; } - if (sq_offset) - *sq_offset = off; + *sq_offset = off; sq_array_size = array_size(sizeof(u32), sq_entries); if (sq_array_size == SIZE_MAX) @@ -2856,7 +2684,6 @@ static void io_req_caches_free(struct io_ring_ctx *ctx) int nr = 0; mutex_lock(&ctx->uring_lock); - io_flush_cached_locked_reqs(ctx, &ctx->submit_state); while (!io_req_cache_empty(ctx)) { req = io_extract_req(ctx); @@ -2868,58 +2695,47 @@ static void io_req_caches_free(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); } -static void io_rsrc_node_cache_free(struct io_cache_entry *entry) -{ - kfree(container_of(entry, struct io_rsrc_node, cache)); -} - static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_sq_thread_finish(ctx); - /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */ - if (WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list))) - return; mutex_lock(&ctx->uring_lock); - if (ctx->buf_data) - __io_sqe_buffers_unregister(ctx); - if (ctx->file_data) - __io_sqe_files_unregister(ctx); + io_sqe_buffers_unregister(ctx); + io_sqe_files_unregister(ctx); io_cqring_overflow_kill(ctx); io_eventfd_unregister(ctx); - io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free); + io_alloc_cache_free(&ctx->apoll_cache, kfree); io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); + io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); + io_alloc_cache_free(&ctx->uring_cache, kfree); + io_alloc_cache_free(&ctx->msg_cache, kfree); io_futex_cache_free(ctx); io_destroy_buffers(ctx); + io_free_region(ctx, &ctx->param_region); mutex_unlock(&ctx->uring_lock); if (ctx->sq_creds) put_cred(ctx->sq_creds); if (ctx->submitter_task) put_task_struct(ctx->submitter_task); - /* there are no registered resources left, nobody uses it */ - if (ctx->rsrc_node) - io_rsrc_node_destroy(ctx, ctx->rsrc_node); - - 
WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)); WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); - io_alloc_cache_free(&ctx->rsrc_node_cache, io_rsrc_node_cache_free); if (ctx->mm_account) { mmdrop(ctx->mm_account); ctx->mm_account = NULL; } io_rings_free(ctx); - io_kbuf_mmap_list_free(ctx); + + if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) + static_branch_dec(&io_key_has_sqarray); percpu_ref_exit(&ctx->refs); free_uid(ctx->user); io_req_caches_free(ctx); if (ctx->hash_map) io_wq_put_hash(ctx->hash_map); - kfree(ctx->cancel_table.hbs); - kfree(ctx->cancel_table_locked.hbs); - kfree(ctx->io_bl); + io_napi_free(ctx); + kvfree(ctx->cancel_table.hbs); xa_destroy(&ctx->io_bl_xa); kfree(ctx); } @@ -2970,13 +2786,12 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) if (unlikely(!ctx->poll_activated)) io_activate_pollwq(ctx); - - poll_wait(file, &ctx->poll_wq, wait); /* - * synchronizes with barrier from wq_has_sleeper call in - * io_commit_cqring + * provides mb() which pairs with barrier from wq_has_sleeper + * call in io_commit_cqring */ - smp_rmb(); + poll_wait(file, &ctx->poll_wq, wait); + if (!io_sqring_full(ctx)) mask |= EPOLLOUT | EPOLLWRNORM; @@ -3055,7 +2870,8 @@ static __cold void io_ring_exit_work(struct work_struct *work) if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) io_move_task_work_from_local(ctx); - while (io_uring_try_cancel_requests(ctx, NULL, true)) + /* The SQPOLL thread never reaches this path */ + while (io_uring_try_cancel_requests(ctx, NULL, true, false)) cond_resched(); if (ctx->sq_data) { @@ -3134,17 +2950,8 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) percpu_ref_kill(&ctx->refs); xa_for_each(&ctx->personalities, index, creds) io_unregister_personality(ctx, index); - if (ctx->rings) - io_poll_remove_all(ctx, NULL, true); mutex_unlock(&ctx->uring_lock); - /* - * If we failed setting up the ctx, we might not have any rings - * and therefore did not submit any requests - */ - if (ctx->rings) - 
io_kill_timeouts(ctx, NULL, true); - flush_delayed_work(&ctx->fallback_work); INIT_WORK(&ctx->exit_work, io_ring_exit_work); @@ -3154,7 +2961,7 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) * noise and overhead, there's no discernable change in runtime * over using system_wq. */ - queue_work(system_unbound_wq, &ctx->exit_work); + queue_work(iou_wq, &ctx->exit_work); } static int io_uring_release(struct inode *inode, struct file *file) @@ -3167,7 +2974,7 @@ static int io_uring_release(struct inode *inode, struct file *file) } struct io_task_cancel { - struct task_struct *task; + struct io_uring_task *tctx; bool all; }; @@ -3176,11 +2983,11 @@ static bool io_cancel_task_cb(struct io_wq_work *work, void *data) struct io_kiocb *req = container_of(work, struct io_kiocb, work); struct io_task_cancel *cancel = data; - return io_match_task_safe(req, cancel->task, cancel->all); + return io_match_task_safe(req, cancel->tctx, cancel->all); } static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx, - struct task_struct *task, + struct io_uring_task *tctx, bool cancel_all) { struct io_defer_entry *de; @@ -3188,7 +2995,7 @@ static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx, spin_lock(&ctx->completion_lock); list_for_each_entry_reverse(de, &ctx->defer_list, list) { - if (io_match_task_safe(de->req, task, cancel_all)) { + if (io_match_task_safe(de->req, tctx, cancel_all)) { list_cut_position(&list, &ctx->defer_list, &de->list); break; } @@ -3230,43 +3037,12 @@ static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) return ret; } -static bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, - struct task_struct *task, bool cancel_all) -{ - struct hlist_node *tmp; - struct io_kiocb *req; - bool ret = false; - - lockdep_assert_held(&ctx->uring_lock); - - hlist_for_each_entry_safe(req, tmp, &ctx->cancelable_uring_cmd, - hash_node) { - struct io_uring_cmd *cmd = io_kiocb_to_cmd(req, - struct io_uring_cmd); - struct 
file *file = req->file; - - if (!cancel_all && req->task != task) - continue; - - if (cmd->flags & IORING_URING_CMD_CANCELABLE) { - /* ->sqe isn't available if no async data */ - if (!req_has_async_data(req)) - cmd->sqe = NULL; - file->f_op->uring_cmd(cmd, IO_URING_F_CANCEL); - ret = true; - } - } - io_submit_flush_completions(ctx); - - return ret; -} - static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, - struct task_struct *task, - bool cancel_all) + struct io_uring_task *tctx, + bool cancel_all, + bool is_sqpoll_thread) { - struct io_task_cancel cancel = { .task = task, .all = cancel_all, }; - struct io_uring_task *tctx = task ? task->io_uring : NULL; + struct io_task_cancel cancel = { .tctx = tctx, .all = cancel_all, }; enum io_wq_cancel cret; bool ret = false; @@ -3280,9 +3056,9 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, if (!ctx->rings) return false; - if (!task) { + if (!tctx) { ret |= io_uring_try_cancel_iowq(ctx); - } else if (tctx && tctx->io_wq) { + } else if (tctx->io_wq) { /* * Cancels requests of all rings, not only @ctx, but * it's fine as the task is in exit/exec. 
@@ -3294,7 +3070,7 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, /* SQPOLL thread does its own polling */ if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) || - (ctx->sq_data && ctx->sq_data->thread == current)) { + is_sqpoll_thread) { while (!wq_list_empty(&ctx->iopoll_list)) { io_iopoll_try_reap_events(ctx); ret = true; @@ -3304,17 +3080,19 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && io_allowed_defer_tw_run(ctx)) - ret |= io_run_local_work(ctx) > 0; - ret |= io_cancel_defer_files(ctx, task, cancel_all); + ret |= io_run_local_work(ctx, INT_MAX, INT_MAX) > 0; + ret |= io_cancel_defer_files(ctx, tctx, cancel_all); mutex_lock(&ctx->uring_lock); - ret |= io_poll_remove_all(ctx, task, cancel_all); - ret |= io_waitid_remove_all(ctx, task, cancel_all); - ret |= io_futex_remove_all(ctx, task, cancel_all); - ret |= io_uring_try_cancel_uring_cmd(ctx, task, cancel_all); + ret |= io_poll_remove_all(ctx, tctx, cancel_all); + ret |= io_waitid_remove_all(ctx, tctx, cancel_all); + ret |= io_futex_remove_all(ctx, tctx, cancel_all); + ret |= io_uring_try_cancel_uring_cmd(ctx, tctx, cancel_all); mutex_unlock(&ctx->uring_lock); - ret |= io_kill_timeouts(ctx, task, cancel_all); - if (task) + ret |= io_kill_timeouts(ctx, tctx, cancel_all); + if (tctx) ret |= io_run_task_work() > 0; + else + ret |= flush_delayed_work(&ctx->fallback_work); return ret; } @@ -3350,8 +3128,11 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) bool loop = false; io_uring_drop_tctx_refs(current); + if (!tctx_inflight(tctx, !cancel_all)) + break; + /* read completions before cancelations */ - inflight = tctx_inflight(tctx, !cancel_all); + inflight = tctx_inflight(tctx, false); if (!inflight) break; @@ -3361,13 +3142,16 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) if (node->ctx->sq_data) continue; loop |= 
io_uring_try_cancel_requests(node->ctx, - current, cancel_all); + current->io_uring, + cancel_all, + false); } } else { list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) loop |= io_uring_try_cancel_requests(ctx, - current, - cancel_all); + current->io_uring, + cancel_all, + true); } if (loop) { @@ -3379,7 +3163,7 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) io_run_task_work(); io_uring_drop_tctx_refs(current); xa_for_each(&tctx->xa, index, node) { - if (!llist_empty(&node->ctx->work_llist)) { + if (io_local_work_pending(node->ctx)) { WARN_ON_ONCE(node->ctx->submitter_task && node->ctx->submitter_task != current); goto end_wait; @@ -3410,156 +3194,49 @@ end_wait: void __io_uring_cancel(bool cancel_all) { + io_uring_unreg_ringfd(); io_uring_cancel_generic(cancel_all, NULL); } -static void *io_uring_validate_mmap_request(struct file *file, - loff_t pgoff, size_t sz) +static struct io_uring_reg_wait *io_get_ext_arg_reg(struct io_ring_ctx *ctx, + const struct io_uring_getevents_arg __user *uarg) { - struct io_ring_ctx *ctx = file->private_data; - loff_t offset = pgoff << PAGE_SHIFT; - struct page *page; - void *ptr; - - switch (offset & IORING_OFF_MMAP_MASK) { - case IORING_OFF_SQ_RING: - case IORING_OFF_CQ_RING: - /* Don't allow mmap if the ring was setup without it */ - if (ctx->flags & IORING_SETUP_NO_MMAP) - return ERR_PTR(-EINVAL); - ptr = ctx->rings; - break; - case IORING_OFF_SQES: - /* Don't allow mmap if the ring was setup without it */ - if (ctx->flags & IORING_SETUP_NO_MMAP) - return ERR_PTR(-EINVAL); - ptr = ctx->sq_sqes; - break; - case IORING_OFF_PBUF_RING: { - unsigned int bgid; - - bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; - rcu_read_lock(); - ptr = io_pbuf_get_address(ctx, bgid); - rcu_read_unlock(); - if (!ptr) - return ERR_PTR(-EINVAL); - break; - } - default: - return ERR_PTR(-EINVAL); - } - - page = virt_to_head_page(ptr); - if (sz > page_size(page)) - return ERR_PTR(-EINVAL); + unsigned 
long size = sizeof(struct io_uring_reg_wait); + unsigned long offset = (uintptr_t)uarg; + unsigned long end; - return ptr; -} - -#ifdef CONFIG_MMU + if (unlikely(offset % sizeof(long))) + return ERR_PTR(-EFAULT); -static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) -{ - size_t sz = vma->vm_end - vma->vm_start; - unsigned long pfn; - void *ptr; + /* also protects from NULL ->cq_wait_arg as the size would be 0 */ + if (unlikely(check_add_overflow(offset, size, &end) || + end > ctx->cq_wait_size)) + return ERR_PTR(-EFAULT); - ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); - if (IS_ERR(ptr)) - return PTR_ERR(ptr); - - pfn = virt_to_phys(ptr) >> PAGE_SHIFT; - return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); + offset = array_index_nospec(offset, ctx->cq_wait_size - size); + return ctx->cq_wait_arg + offset; } -static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp, - unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags) +static int io_validate_ext_arg(struct io_ring_ctx *ctx, unsigned flags, + const void __user *argp, size_t argsz) { - void *ptr; + struct io_uring_getevents_arg arg; - /* - * Do not allow to map to user-provided address to avoid breaking the - * aliasing rules. Userspace is not able to guess the offset address of - * kernel kmalloc()ed memory area. - */ - if (addr) + if (!(flags & IORING_ENTER_EXT_ARG)) + return 0; + if (flags & IORING_ENTER_EXT_ARG_REG) return -EINVAL; - - ptr = io_uring_validate_mmap_request(filp, pgoff, len); - if (IS_ERR(ptr)) - return -ENOMEM; - - /* - * Some architectures have strong cache aliasing requirements. - * For such architectures we need a coherent mapping which aliases - * kernel memory *and* userspace memory. 
To achieve that: - * - use a NULL file pointer to reference physical memory, and - * - use the kernel virtual address of the shared io_uring context - * (instead of the userspace-provided address, which has to be 0UL - * anyway). - * - use the same pgoff which the get_unmapped_area() uses to - * calculate the page colouring. - * For architectures without such aliasing requirements, the - * architecture will return any suitable mapping because addr is 0. - */ - filp = NULL; - flags |= MAP_SHARED; - pgoff = 0; /* has been translated to ptr above */ -#ifdef SHM_COLOUR - addr = (uintptr_t) ptr; - pgoff = addr >> PAGE_SHIFT; -#else - addr = 0UL; -#endif - return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); -} - -#else /* !CONFIG_MMU */ - -static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) -{ - return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -EINVAL; -} - -static unsigned int io_uring_nommu_mmap_capabilities(struct file *file) -{ - return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE; -} - -static unsigned long io_uring_nommu_get_unmapped_area(struct file *file, - unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags) -{ - void *ptr; - - ptr = io_uring_validate_mmap_request(file, pgoff, len); - if (IS_ERR(ptr)) - return PTR_ERR(ptr); - - return (unsigned long) ptr; -} - -#endif /* !CONFIG_MMU */ - -static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz) -{ - if (flags & IORING_ENTER_EXT_ARG) { - struct io_uring_getevents_arg arg; - - if (argsz != sizeof(arg)) - return -EINVAL; - if (copy_from_user(&arg, argp, sizeof(arg))) - return -EFAULT; - } + if (argsz != sizeof(arg)) + return -EINVAL; + if (copy_from_user(&arg, argp, sizeof(arg))) + return -EFAULT; return 0; } -static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz, - struct __kernel_timespec __user **ts, - const sigset_t __user **sig) +static int io_get_ext_arg(struct io_ring_ctx 
*ctx, unsigned flags, + const void __user *argp, struct ext_arg *ext_arg) { + const struct io_uring_getevents_arg __user *uarg = argp; struct io_uring_getevents_arg arg; /* @@ -3567,8 +3244,29 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz * is just a pointer to the sigset_t. */ if (!(flags & IORING_ENTER_EXT_ARG)) { - *sig = (const sigset_t __user *) argp; - *ts = NULL; + ext_arg->sig = (const sigset_t __user *) argp; + return 0; + } + + if (flags & IORING_ENTER_EXT_ARG_REG) { + struct io_uring_reg_wait *w; + + if (ext_arg->argsz != sizeof(struct io_uring_reg_wait)) + return -EINVAL; + w = io_get_ext_arg_reg(ctx, argp); + if (IS_ERR(w)) + return PTR_ERR(w); + + if (w->flags & ~IORING_REG_WAIT_TS) + return -EINVAL; + ext_arg->min_time = READ_ONCE(w->min_wait_usec) * NSEC_PER_USEC; + ext_arg->sig = u64_to_user_ptr(READ_ONCE(w->sigmask)); + ext_arg->argsz = READ_ONCE(w->sigmask_sz); + if (w->flags & IORING_REG_WAIT_TS) { + ext_arg->ts.tv_sec = READ_ONCE(w->ts.tv_sec); + ext_arg->ts.tv_nsec = READ_ONCE(w->ts.tv_nsec); + ext_arg->ts_set = true; + } return 0; } @@ -3576,16 +3274,34 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz * EXT_ARG is set - ensure we agree on the size of it and copy in our * timespec and sigset_t pointers if good. 
*/ - if (*argsz != sizeof(arg)) + if (ext_arg->argsz != sizeof(arg)) return -EINVAL; - if (copy_from_user(&arg, argp, sizeof(arg))) +#ifdef CONFIG_64BIT + if (!user_access_begin(uarg, sizeof(*uarg))) return -EFAULT; - if (arg.pad) - return -EINVAL; - *sig = u64_to_user_ptr(arg.sigmask); - *argsz = arg.sigmask_sz; - *ts = u64_to_user_ptr(arg.ts); + unsafe_get_user(arg.sigmask, &uarg->sigmask, uaccess_end); + unsafe_get_user(arg.sigmask_sz, &uarg->sigmask_sz, uaccess_end); + unsafe_get_user(arg.min_wait_usec, &uarg->min_wait_usec, uaccess_end); + unsafe_get_user(arg.ts, &uarg->ts, uaccess_end); + user_access_end(); +#else + if (copy_from_user(&arg, uarg, sizeof(arg))) + return -EFAULT; +#endif + ext_arg->min_time = arg.min_wait_usec * NSEC_PER_USEC; + ext_arg->sig = u64_to_user_ptr(arg.sigmask); + ext_arg->argsz = arg.sigmask_sz; + if (arg.ts) { + if (get_timespec64(&ext_arg->ts, u64_to_user_ptr(arg.ts))) + return -EFAULT; + ext_arg->ts_set = true; + } return 0; +#ifdef CONFIG_64BIT +uaccess_end: + user_access_end(); + return -EFAULT; +#endif } SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, @@ -3598,7 +3314,9 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG | - IORING_ENTER_REGISTERED_RING))) + IORING_ENTER_REGISTERED_RING | + IORING_ENTER_ABS_TIMER | + IORING_ENTER_EXT_ARG_REG))) return -EINVAL; /* @@ -3635,8 +3353,6 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, */ ret = 0; if (ctx->flags & IORING_SETUP_SQPOLL) { - io_cqring_overflow_flush(ctx); - if (unlikely(ctx->sq_data->thread == NULL)) { ret = -EOWNERDEAD; goto out; @@ -3666,7 +3382,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, * it should handle ownership problems if any. 
*/ if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) - (void)io_run_local_work_locked(ctx); + (void)io_run_local_work_locked(ctx, min_complete); } mutex_unlock(&ctx->uring_lock); } @@ -3683,7 +3399,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, */ mutex_lock(&ctx->uring_lock); iopoll_locked: - ret2 = io_validate_ext_arg(flags, argp, argsz); + ret2 = io_validate_ext_arg(ctx, flags, argp, argsz); if (likely(!ret2)) { min_complete = min(min_complete, ctx->cq_entries); @@ -3691,15 +3407,14 @@ iopoll_locked: } mutex_unlock(&ctx->uring_lock); } else { - const sigset_t __user *sig; - struct __kernel_timespec __user *ts; + struct ext_arg ext_arg = { .argsz = argsz }; - ret2 = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); + ret2 = io_get_ext_arg(ctx, flags, argp, &ext_arg); if (likely(!ret2)) { min_complete = min(min_complete, ctx->cq_entries); - ret2 = io_cqring_wait(ctx, min_complete, sig, - argsz, ts); + ret2 = io_cqring_wait(ctx, min_complete, flags, + &ext_arg); } } @@ -3725,11 +3440,9 @@ out: static const struct file_operations io_uring_fops = { .release = io_uring_release, .mmap = io_uring_mmap, + .get_unmapped_area = io_uring_get_unmapped_area, #ifndef CONFIG_MMU - .get_unmapped_area = io_uring_nommu_get_unmapped_area, .mmap_capabilities = io_uring_nommu_mmap_capabilities, -#else - .get_unmapped_area = io_uring_mmu_get_unmapped_area, #endif .poll = io_uring_poll, #ifdef CONFIG_PROC_FS @@ -3745,27 +3458,31 @@ bool io_is_uring_fops(struct file *file) static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, struct io_uring_params *p) { + struct io_uring_region_desc rd; struct io_rings *rings; size_t size, sq_array_offset; - void *ptr; + int ret; /* make sure these are sane, as we already accounted them */ ctx->sq_entries = p->sq_entries; ctx->cq_entries = p->cq_entries; - size = rings_size(ctx, p->sq_entries, p->cq_entries, &sq_array_offset); + size = rings_size(ctx->flags, p->sq_entries, p->cq_entries, + &sq_array_offset); if (size == 
SIZE_MAX) return -EOVERFLOW; - if (!(ctx->flags & IORING_SETUP_NO_MMAP)) - rings = io_mem_alloc(size); - else - rings = io_rings_map(ctx, p->cq_off.user_addr, size); - - if (IS_ERR(rings)) - return PTR_ERR(rings); + memset(&rd, 0, sizeof(rd)); + rd.size = PAGE_ALIGN(size); + if (ctx->flags & IORING_SETUP_NO_MMAP) { + rd.user_addr = p->cq_off.user_addr; + rd.flags |= IORING_MEM_REGION_TYPE_USER; + } + ret = io_create_region(ctx, &ctx->ring_region, &rd, IORING_OFF_CQ_RING); + if (ret) + return ret; + ctx->rings = rings = io_region_get_ptr(&ctx->ring_region); - ctx->rings = rings; if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); rings->sq_ring_mask = p->sq_entries - 1; @@ -3782,17 +3499,18 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, return -EOVERFLOW; } - if (!(ctx->flags & IORING_SETUP_NO_MMAP)) - ptr = io_mem_alloc(size); - else - ptr = io_sqes_map(ctx, p->sq_off.user_addr, size); - - if (IS_ERR(ptr)) { + memset(&rd, 0, sizeof(rd)); + rd.size = PAGE_ALIGN(size); + if (ctx->flags & IORING_SETUP_NO_MMAP) { + rd.user_addr = p->sq_off.user_addr; + rd.flags |= IORING_MEM_REGION_TYPE_USER; + } + ret = io_create_region(ctx, &ctx->sq_region, &rd, IORING_OFF_SQES); + if (ret) { io_rings_free(ctx); - return PTR_ERR(ptr); + return ret; } - - ctx->sq_sqes = ptr; + ctx->sq_sqes = io_region_get_ptr(&ctx->sq_region); return 0; } @@ -3819,14 +3537,8 @@ static struct file *io_uring_get_file(struct io_ring_ctx *ctx) O_RDWR | O_CLOEXEC, NULL); } -static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, - struct io_uring_params __user *params) +int io_uring_fill_params(unsigned entries, struct io_uring_params *p) { - struct io_ring_ctx *ctx; - struct io_uring_task *tctx; - struct file *file; - int ret; - if (!entries) return -EINVAL; if (entries > IORING_MAX_ENTRIES) { @@ -3868,10 +3580,52 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, p->cq_entries = 
2 * p->sq_entries; } + p->sq_off.head = offsetof(struct io_rings, sq.head); + p->sq_off.tail = offsetof(struct io_rings, sq.tail); + p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask); + p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries); + p->sq_off.flags = offsetof(struct io_rings, sq_flags); + p->sq_off.dropped = offsetof(struct io_rings, sq_dropped); + p->sq_off.resv1 = 0; + if (!(p->flags & IORING_SETUP_NO_MMAP)) + p->sq_off.user_addr = 0; + + p->cq_off.head = offsetof(struct io_rings, cq.head); + p->cq_off.tail = offsetof(struct io_rings, cq.tail); + p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask); + p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries); + p->cq_off.overflow = offsetof(struct io_rings, cq_overflow); + p->cq_off.cqes = offsetof(struct io_rings, cqes); + p->cq_off.flags = offsetof(struct io_rings, cq_flags); + p->cq_off.resv1 = 0; + if (!(p->flags & IORING_SETUP_NO_MMAP)) + p->cq_off.user_addr = 0; + + return 0; +} + +static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, + struct io_uring_params __user *params) +{ + struct io_ring_ctx *ctx; + struct io_uring_task *tctx; + struct file *file; + int ret; + + ret = io_uring_fill_params(entries, p); + if (unlikely(ret)) + return ret; + ctx = io_ring_ctx_alloc(p); if (!ctx) return -ENOMEM; + ctx->clockid = CLOCK_MONOTONIC; + ctx->clock_offset = 0; + + if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) + static_branch_inc(&io_key_has_sqarray); + if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && !(ctx->flags & IORING_SETUP_IOPOLL) && !(ctx->flags & IORING_SETUP_SQPOLL)) @@ -3922,6 +3676,11 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, ctx->notify_method = TWA_SIGNAL; } + /* HYBRID_IOPOLL only valid with IOPOLL */ + if ((ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_HYBRID_IOPOLL)) == + IORING_SETUP_HYBRID_IOPOLL) + goto err; + /* * For DEFER_TASKRUN we require the completion task to be the 
same as the * submission task. This implies that there is only one submitter, so enforce @@ -3945,44 +3704,22 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, if (ret) goto err; - ret = io_sq_offload_create(ctx, p); - if (ret) - goto err; + if (!(p->flags & IORING_SETUP_NO_SQARRAY)) + p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings; - ret = io_rsrc_init(ctx); + ret = io_sq_offload_create(ctx, p); if (ret) goto err; - p->sq_off.head = offsetof(struct io_rings, sq.head); - p->sq_off.tail = offsetof(struct io_rings, sq.tail); - p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask); - p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries); - p->sq_off.flags = offsetof(struct io_rings, sq_flags); - p->sq_off.dropped = offsetof(struct io_rings, sq_dropped); - if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) - p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings; - p->sq_off.resv1 = 0; - if (!(ctx->flags & IORING_SETUP_NO_MMAP)) - p->sq_off.user_addr = 0; - - p->cq_off.head = offsetof(struct io_rings, cq.head); - p->cq_off.tail = offsetof(struct io_rings, cq.tail); - p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask); - p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries); - p->cq_off.overflow = offsetof(struct io_rings, cq_overflow); - p->cq_off.cqes = offsetof(struct io_rings, cqes); - p->cq_off.flags = offsetof(struct io_rings, cq_flags); - p->cq_off.resv1 = 0; - if (!(ctx->flags & IORING_SETUP_NO_MMAP)) - p->cq_off.user_addr = 0; - p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED | IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP | - IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING; + IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING | + 
IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT | + IORING_FEAT_RW_ATTR; if (copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; @@ -4050,7 +3787,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) IORING_SETUP_SQE128 | IORING_SETUP_CQE32 | IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN | IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY | - IORING_SETUP_NO_SQARRAY)) + IORING_SETUP_NO_SQARRAY | IORING_SETUP_HYBRID_IOPOLL)) return -EINVAL; return io_uring_create(entries, &p, params); @@ -4085,6 +3822,13 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries, static int __init io_uring_init(void) { + struct kmem_cache_args kmem_args = { + .useroffset = offsetof(struct io_kiocb, cmd.data), + .usersize = sizeof_field(struct io_kiocb, cmd.data), + .freeptr_offset = offsetof(struct io_kiocb, work), + .use_freeptr_offset = true, + }; + #define __BUILD_BUG_VERIFY_OFFSET_SIZE(stype, eoffset, esize, ename) do { \ BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \ BUILD_BUG_ON(sizeof_field(stype, ename) != esize); \ @@ -4136,6 +3880,8 @@ static int __init io_uring_init(void) BUILD_BUG_SQE_ELEM(46, __u16, __pad3[0]); BUILD_BUG_SQE_ELEM(48, __u64, addr3); BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd); + BUILD_BUG_SQE_ELEM(48, __u64, attr_ptr); + BUILD_BUG_SQE_ELEM(56, __u64, attr_type_mask); BUILD_BUG_SQE_ELEM(56, __u64, __pad2); BUILD_BUG_ON(sizeof(struct io_uring_files_update) != @@ -4153,7 +3899,7 @@ static int __init io_uring_init(void) BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8)); BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS); - BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int)); + BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof_field(struct io_kiocb, flags)); BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32)); @@ -4169,15 +3915,13 @@ static int __init io_uring_init(void) * range, and HARDENED_USERCOPY will complain if we haven't * correctly annotated this range. 
*/ - req_cachep = kmem_cache_create_usercopy("io_kiocb", - sizeof(struct io_kiocb), 0, - SLAB_HWCACHE_ALIGN | SLAB_PANIC | - SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU, - offsetof(struct io_kiocb, cmd.data), - sizeof_field(struct io_kiocb, cmd.data), NULL); - io_buf_cachep = kmem_cache_create("io_buffer", sizeof(struct io_buffer), 0, - SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, - NULL); + req_cachep = kmem_cache_create("io_kiocb", sizeof(struct io_kiocb), &kmem_args, + SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT | + SLAB_TYPESAFE_BY_RCU); + io_buf_cachep = KMEM_CACHE(io_buffer, + SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); + + iou_wq = alloc_workqueue("iou_exit", WQ_UNBOUND, 64); #ifdef CONFIG_SYSCTL register_sysctl_init("kernel", kernel_io_uring_disabled_table); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index d5495710c178..ab619e63ef39 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -5,11 +5,14 @@ #include <linux/lockdep.h> #include <linux/resume_user_mode.h> #include <linux/kasan.h> +#include <linux/poll.h> #include <linux/io_uring_types.h> #include <uapi/linux/eventpoll.h> +#include "alloc_cache.h" #include "io-wq.h" #include "slist.h" #include "filetable.h" +#include "opdef.h" #ifndef CREATE_TRACE_POINTS #include <trace/events/io_uring.h> @@ -34,28 +37,64 @@ enum { IOU_STOP_MULTISHOT = -ECANCELED, }; +struct io_wait_queue { + struct wait_queue_entry wq; + struct io_ring_ctx *ctx; + unsigned cq_tail; + unsigned cq_min_tail; + unsigned nr_timeouts; + int hit_timeout; + ktime_t min_timeout; + ktime_t timeout; + struct hrtimer t; + +#ifdef CONFIG_NET_RX_BUSY_POLL + ktime_t napi_busy_poll_dt; + bool napi_prefer_busy_poll; +#endif +}; + +static inline bool io_should_wake(struct io_wait_queue *iowq) +{ + struct io_ring_ctx *ctx = iowq->ctx; + int dist = READ_ONCE(ctx->rings->cq.tail) - (int) iowq->cq_tail; + + /* + * Wake up if we have enough events, or if a timeout occurred since we + * started waiting. 
For timeouts, we always want to return to userspace, + * regardless of event count. + */ + return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts; +} + +#define IORING_MAX_ENTRIES 32768 +#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) + +unsigned long rings_size(unsigned int flags, unsigned int sq_entries, + unsigned int cq_entries, size_t *sq_offset); +int io_uring_fill_params(unsigned entries, struct io_uring_params *p); bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow); -void io_req_cqe_overflow(struct io_kiocb *req); int io_run_task_work_sig(struct io_ring_ctx *ctx); void io_req_defer_failed(struct io_kiocb *req, s32 res); -void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); -bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags); +void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); +bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); -struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); - struct file *io_file_get_normal(struct io_kiocb *req, int fd); struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); void __io_req_task_work_add(struct io_kiocb *req, unsigned flags); +void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx, + unsigned flags); bool io_alloc_async_data(struct io_kiocb *req); void io_req_task_queue(struct io_kiocb *req); -void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use); void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts); void io_req_task_queue_fail(struct io_kiocb *req, int ret); void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts); +struct llist_node *io_handle_tw_list(struct llist_node *node, unsigned int *count, unsigned int max_entries); 
+struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, unsigned int max_entries, unsigned int *count); void tctx_task_work(struct callback_head *cb); __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); int io_uring_alloc_task_context(struct task_struct *task, @@ -63,12 +102,12 @@ int io_uring_alloc_task_context(struct task_struct *task, int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file, int start, int end); +void io_req_queue_iowq(struct io_kiocb *req); int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts); int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); void __io_submit_flush_completions(struct io_ring_ctx *ctx); -int io_req_prep_async(struct io_kiocb *req); struct io_wq_work *io_wq_free_work(struct io_wq_work *work); void io_wq_submit_work(struct io_wq_work *work); @@ -78,25 +117,19 @@ void io_queue_next(struct io_kiocb *req); void io_task_refs_refill(struct io_uring_task *tctx); bool __io_alloc_req_refill(struct io_ring_ctx *ctx); -bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, +bool io_match_task_safe(struct io_kiocb *head, struct io_uring_task *tctx, bool cancel_all); -void *io_mem_alloc(size_t size); -void io_mem_free(void *ptr); - -enum { - IO_EVENTFD_OP_SIGNAL_BIT, - IO_EVENTFD_OP_FREE_BIT, -}; - -void io_eventfd_ops(struct rcu_head *rcu); void io_activate_pollwq(struct io_ring_ctx *ctx); -#if defined(CONFIG_PROVE_LOCKING) static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) { +#if defined(CONFIG_PROVE_LOCKING) lockdep_assert(in_task()); + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) + lockdep_assert_held(&ctx->uring_lock); + if (ctx->flags & IORING_SETUP_IOPOLL) { lockdep_assert_held(&ctx->uring_lock); } else if (!ctx->task_complete) { @@ -108,23 +141,24 @@ static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) * Not from an SQE, as those cannot 
be submitted, but via * updating tagged resources. */ - if (ctx->submitter_task->flags & PF_EXITING) - lockdep_assert(current_work()); - else + if (!percpu_ref_is_dying(&ctx->refs)) lockdep_assert(current == ctx->submitter_task); } -} -#else -static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) -{ -} #endif +} static inline void io_req_task_work_add(struct io_kiocb *req) { __io_req_task_work_add(req, 0); } +static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) +{ + if (!wq_list_empty(&ctx->submit_state.compl_reqs) || + ctx->submit_state.cq_flush) + __io_submit_flush_completions(ctx); +} + #define io_for_each_link(pos, head) \ for (pos = (head); pos; pos = pos->link) @@ -164,16 +198,15 @@ static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx, if (unlikely(!io_get_cqe(ctx, &cqe))) return false; - if (trace_io_uring_complete_enabled()) - trace_io_uring_complete(req->ctx, req, req->cqe.user_data, - req->cqe.res, req->cqe.flags, - req->big_cqe.extra1, req->big_cqe.extra2); memcpy(cqe, &req->cqe, sizeof(*cqe)); if (ctx->flags & IORING_SETUP_CQE32) { memcpy(cqe->big_cqe, &req->big_cqe, sizeof(*cqe)); memset(&req->big_cqe, 0, sizeof(req->big_cqe)); } + + if (trace_io_uring_complete_enabled()) + trace_io_uring_complete(req->ctx, req, cqe); return true; } @@ -192,6 +225,22 @@ static inline void io_req_set_res(struct io_kiocb *req, s32 res, u32 cflags) req->cqe.flags = cflags; } +static inline void *io_uring_alloc_async_data(struct io_alloc_cache *cache, + struct io_kiocb *req) +{ + if (cache) { + req->async_data = io_cache_alloc(cache, GFP_KERNEL); + } else { + const struct io_issue_def *def = &io_issue_defs[req->opcode]; + + WARN_ON_ONCE(!def->async_size); + req->async_data = kmalloc(def->async_size, GFP_KERNEL); + } + if (req->async_data) + req->flags |= REQ_F_ASYNC_DATA; + return req->async_data; +} + static inline bool req_has_async_data(struct io_kiocb *req) { return req->flags & REQ_F_ASYNC_DATA; @@ -207,7 +256,7 @@ 
static inline void io_ring_submit_unlock(struct io_ring_ctx *ctx, unsigned issue_flags) { lockdep_assert_held(&ctx->uring_lock); - if (issue_flags & IO_URING_F_UNLOCKED) + if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) mutex_unlock(&ctx->uring_lock); } @@ -220,7 +269,7 @@ static inline void io_ring_submit_lock(struct io_ring_ctx *ctx, * The only exception is when we've detached the request and issue it * from an async worker thread, grab the lock for that case. */ - if (issue_flags & IO_URING_F_UNLOCKED) + if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) mutex_lock(&ctx->uring_lock); lockdep_assert_held(&ctx->uring_lock); } @@ -259,7 +308,14 @@ static inline bool io_sqring_full(struct io_ring_ctx *ctx) { struct io_rings *r = ctx->rings; - return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries; + /* + * SQPOLL must use the actual sqring head, as using the cached_sq_head + * is race prone if the SQPOLL thread has grabbed entries but not yet + * committed them to the ring. For !SQPOLL, this doesn't matter, but + * since this helper is just used for SQPOLL sqring waits (or POLLOUT), + * just read the actual sqring head unconditionally. + */ + return READ_ONCE(r->sq.tail) - READ_ONCE(r->sq.head) == ctx->sq_entries; } static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) @@ -274,6 +330,8 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) static inline int io_run_task_work(void) { + bool ret = false; + /* * Always check-and-clear the task_work notification signal. With how * signaling works for task_work, we can find it set with nothing to @@ -285,31 +343,42 @@ static inline int io_run_task_work(void) * PF_IO_WORKER never returns to userspace, so check here if we have * notify work that needs processing. 
*/ - if (current->flags & PF_IO_WORKER && - test_thread_flag(TIF_NOTIFY_RESUME)) { - __set_current_state(TASK_RUNNING); - resume_user_mode_work(NULL); + if (current->flags & PF_IO_WORKER) { + if (test_thread_flag(TIF_NOTIFY_RESUME)) { + __set_current_state(TASK_RUNNING); + resume_user_mode_work(NULL); + } + if (current->io_uring) { + unsigned int count = 0; + + __set_current_state(TASK_RUNNING); + tctx_task_work_run(current->io_uring, UINT_MAX, &count); + if (count) + ret = true; + } } if (task_work_pending(current)) { __set_current_state(TASK_RUNNING); task_work_run(); - return 1; + ret = true; } - return 0; + return ret; +} + +static inline bool io_local_work_pending(struct io_ring_ctx *ctx) +{ + return !llist_empty(&ctx->work_llist) || !llist_empty(&ctx->retry_llist); } static inline bool io_task_work_pending(struct io_ring_ctx *ctx) { - return task_work_pending(current) || !wq_list_empty(&ctx->work_llist); + return task_work_pending(current) || io_local_work_pending(ctx); } static inline void io_tw_lock(struct io_ring_ctx *ctx, struct io_tw_state *ts) { - if (!ts->locked) { - mutex_lock(&ctx->uring_lock); - ts->locked = true; - } + lockdep_assert_held(&ctx->uring_lock); } /* @@ -381,6 +450,19 @@ static inline bool io_allowed_run_tw(struct io_ring_ctx *ctx) ctx->submitter_task == current); } +/* + * Terminate the request if either of these conditions are true: + * + * 1) It's being executed by the original task, but that task is marked + * with PF_EXITING as it's exiting. + * 2) PF_KTHREAD is set, in which case the invoker of the task_work is + * our fallback task_work. 
+ */ +static inline bool io_should_terminate_tw(void) +{ + return current->flags & (PF_KTHREAD | PF_EXITING); +} + static inline void io_req_queue_tw_complete(struct io_kiocb *req, s32 res) { io_req_set_res(req, res, 0); @@ -398,4 +480,34 @@ static inline size_t uring_sqe_size(struct io_ring_ctx *ctx) return 2 * sizeof(struct io_uring_sqe); return sizeof(struct io_uring_sqe); } + +static inline bool io_file_can_poll(struct io_kiocb *req) +{ + if (req->flags & REQ_F_CAN_POLL) + return true; + if (req->file && file_can_poll(req->file)) { + req->flags |= REQ_F_CAN_POLL; + return true; + } + return false; +} + +static inline ktime_t io_get_time(struct io_ring_ctx *ctx) +{ + if (ctx->clockid == CLOCK_MONOTONIC) + return ktime_get(); + + return ktime_get_with_offset(ctx->clock_offset); +} + +enum { + IO_CHECK_CQ_OVERFLOW_BIT, + IO_CHECK_CQ_DROPPED_BIT, +}; + +static inline bool io_has_work(struct io_ring_ctx *ctx) +{ + return test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq) || + io_local_work_pending(ctx); +} #endif diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 18df5a9d2f5e..8e72de7712ac 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -7,6 +7,7 @@ #include <linux/slab.h> #include <linux/namei.h> #include <linux/poll.h> +#include <linux/vmalloc.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> @@ -14,10 +15,7 @@ #include "io_uring.h" #include "opdef.h" #include "kbuf.h" - -#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf)) - -#define BGID_ARRAY 64 +#include "memmap.h" /* BIDs are addressed by a 16-bit field in a CQE */ #define MAX_BIDS_PER_BGID (1 << 16) @@ -33,29 +31,12 @@ struct io_provide_buf { __u16 bid; }; -struct io_buf_free { - struct hlist_node list; - void *mem; - size_t size; - int inuse; -}; - -static struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx, - struct io_buffer_list *bl, - unsigned int bgid) -{ - if (bl && bgid < BGID_ARRAY) - return &bl[bgid]; - - return xa_load(&ctx->io_bl_xa, 
bgid); -} - static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, unsigned int bgid) { lockdep_assert_held(&ctx->uring_lock); - return __io_buffer_get_list(ctx, ctx->io_bl, bgid); + return xa_load(&ctx->io_bl_xa, bgid); } static int io_buffer_add_list(struct io_ring_ctx *ctx, @@ -64,14 +45,10 @@ static int io_buffer_add_list(struct io_ring_ctx *ctx, /* * Store buffer group ID and finally mark the list as visible. * The normal lookup doesn't care about the visibility as we're - * always under the ->uring_lock, but the RCU lookup from mmap does. + * always under the ->uring_lock, but lookups from mmap do. */ bl->bgid = bgid; - smp_store_release(&bl->is_ready, 1); - - if (bgid < BGID_ARRAY) - return 0; - + guard(mutex)(&ctx->mmap_lock); return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); } @@ -81,15 +58,6 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags) struct io_buffer_list *bl; struct io_buffer *buf; - /* - * For legacy provided buffer mode, don't recycle if we already did - * IO to this buffer. For ring-mapped provided buffer mode, we should - * increment ring->head to explicitly monopolize the buffer to avoid - * multiple use. - */ - if (req->flags & REQ_F_PARTIAL_IO) - return false; - io_ring_submit_lock(ctx, issue_flags); buf = req->kbuf; @@ -102,10 +70,8 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags) return true; } -unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags) +void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags) { - unsigned int cflags; - /* * We can add this buffer back to two lists: * @@ -118,21 +84,17 @@ unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags) * We migrate buffers from the comp_list to the issue cache list * when we need one. 
*/ - if (req->flags & REQ_F_BUFFER_RING) { - /* no buffers to recycle for this case */ - cflags = __io_put_kbuf_list(req, NULL); - } else if (issue_flags & IO_URING_F_UNLOCKED) { + if (issue_flags & IO_URING_F_UNLOCKED) { struct io_ring_ctx *ctx = req->ctx; spin_lock(&ctx->completion_lock); - cflags = __io_put_kbuf_list(req, &ctx->io_buffers_comp); + __io_put_kbuf_list(req, len, &ctx->io_buffers_comp); spin_unlock(&ctx->completion_lock); } else { lockdep_assert_held(&req->ctx->uring_lock); - cflags = __io_put_kbuf_list(req, &req->ctx->io_buffers_cache); + __io_put_kbuf_list(req, len, &req->ctx->io_buffers_cache); } - return cflags; } static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, @@ -145,6 +107,8 @@ static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, list_del(&kbuf->list); if (*len == 0 || *len > kbuf->len) *len = kbuf->len; + if (list_empty(&bl->buf_list)) + req->flags |= REQ_F_BL_EMPTY; req->flags |= REQ_F_BUFFER_SELECTED; req->kbuf = kbuf; req->buf_index = kbuf->bid; @@ -153,34 +117,46 @@ static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, return NULL; } +static int io_provided_buffers_select(struct io_kiocb *req, size_t *len, + struct io_buffer_list *bl, + struct iovec *iov) +{ + void __user *buf; + + buf = io_provided_buffer_select(req, len, bl); + if (unlikely(!buf)) + return -ENOBUFS; + + iov[0].iov_base = buf; + iov[0].iov_len = *len; + return 1; +} + static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, struct io_buffer_list *bl, unsigned int issue_flags) { struct io_uring_buf_ring *br = bl->buf_ring; + __u16 tail, head = bl->head; struct io_uring_buf *buf; - __u16 head = bl->head; + void __user *ret; - if (unlikely(smp_load_acquire(&br->tail) == head)) + tail = smp_load_acquire(&br->tail); + if (unlikely(tail == head)) return NULL; - head &= bl->mask; - /* mmaped buffers are always contig */ - if (bl->is_mmap || head < 
IO_BUFFER_LIST_BUF_PER_PAGE) { - buf = &br->bufs[head]; - } else { - int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1); - int index = head / IO_BUFFER_LIST_BUF_PER_PAGE; - buf = page_address(bl->buf_pages[index]); - buf += off; - } + if (head + 1 == tail) + req->flags |= REQ_F_BL_EMPTY; + + buf = io_ring_head_to_buf(br, head, bl->mask); if (*len == 0 || *len > buf->len) *len = buf->len; - req->flags |= REQ_F_BUFFER_RING; + req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT; req->buf_list = bl; req->buf_index = buf->bid; + ret = u64_to_user_ptr(buf->addr); - if (issue_flags & IO_URING_F_UNLOCKED || !file_can_poll(req->file)) { + if (issue_flags & IO_URING_F_UNLOCKED || !io_file_can_poll(req)) { /* * If we came in unlocked, we have no choice but to consume the * buffer here, otherwise nothing ensures that the buffer won't @@ -191,10 +167,10 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, * the transfer completes (or if we get -EAGAIN and must poll of * retry). 
*/ + io_kbuf_commit(req, bl, *len, 1); req->buf_list = NULL; - bl->head++; } - return u64_to_user_ptr(buf->addr); + return ret; } void __user *io_buffer_select(struct io_kiocb *req, size_t *len, @@ -208,7 +184,7 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len, bl = io_buffer_get_list(ctx, req->buf_index); if (likely(bl)) { - if (bl->is_mapped) + if (bl->flags & IOBL_BUF_RING) ret = io_ring_buffer_select(req, len, bl, issue_flags); else ret = io_provided_buffer_select(req, len, bl); @@ -217,40 +193,153 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len, return ret; } -static __cold int io_init_bl_list(struct io_ring_ctx *ctx) +/* cap it at a reasonable 256, will be one page even for 4K */ +#define PEEK_MAX_IMPORT 256 + +static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, + struct io_buffer_list *bl) { - struct io_buffer_list *bl; - int i; + struct io_uring_buf_ring *br = bl->buf_ring; + struct iovec *iov = arg->iovs; + int nr_iovs = arg->nr_iovs; + __u16 nr_avail, tail, head; + struct io_uring_buf *buf; - bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list), GFP_KERNEL); - if (!bl) - return -ENOMEM; + tail = smp_load_acquire(&br->tail); + head = bl->head; + nr_avail = min_t(__u16, tail - head, UIO_MAXIOV); + if (unlikely(!nr_avail)) + return -ENOBUFS; + + buf = io_ring_head_to_buf(br, head, bl->mask); + if (arg->max_len) { + u32 len = READ_ONCE(buf->len); - for (i = 0; i < BGID_ARRAY; i++) { - INIT_LIST_HEAD(&bl[i].buf_list); - bl[i].bgid = i; + if (unlikely(!len)) + return -ENOBUFS; + /* + * Limit incremental buffers to 1 segment. No point trying + * to peek ahead and map more than we need, when the buffers + * themselves should be large when setup with + * IOU_PBUF_RING_INC. 
+ */ + if (bl->flags & IOBL_INC) { + nr_avail = 1; + } else { + size_t needed; + + needed = (arg->max_len + len - 1) / len; + needed = min_not_zero(needed, (size_t) PEEK_MAX_IMPORT); + if (nr_avail > needed) + nr_avail = needed; + } } - smp_store_release(&ctx->io_bl, bl); - return 0; + /* + * only alloc a bigger array if we know we have data to map, eg not + * a speculative peek operation. + */ + if (arg->mode & KBUF_MODE_EXPAND && nr_avail > nr_iovs && arg->max_len) { + iov = kmalloc_array(nr_avail, sizeof(struct iovec), GFP_KERNEL); + if (unlikely(!iov)) + return -ENOMEM; + if (arg->mode & KBUF_MODE_FREE) + kfree(arg->iovs); + arg->iovs = iov; + nr_iovs = nr_avail; + } else if (nr_avail < nr_iovs) { + nr_iovs = nr_avail; + } + + /* set it to max, if not set, so we can use it unconditionally */ + if (!arg->max_len) + arg->max_len = INT_MAX; + + req->buf_index = buf->bid; + do { + u32 len = buf->len; + + /* truncate end piece, if needed, for non partial buffers */ + if (len > arg->max_len) { + len = arg->max_len; + if (!(bl->flags & IOBL_INC)) + buf->len = len; + } + + iov->iov_base = u64_to_user_ptr(buf->addr); + iov->iov_len = len; + iov++; + + arg->out_len += len; + arg->max_len -= len; + if (!arg->max_len) + break; + + buf = io_ring_head_to_buf(br, ++head, bl->mask); + } while (--nr_iovs); + + if (head == tail) + req->flags |= REQ_F_BL_EMPTY; + + req->flags |= REQ_F_BUFFER_RING; + req->buf_list = bl; + return iov - arg->iovs; } -/* - * Mark the given mapped range as free for reuse - */ -static void io_kbuf_mark_free(struct io_ring_ctx *ctx, struct io_buffer_list *bl) +int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, + unsigned int issue_flags) { - struct io_buf_free *ibf; + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer_list *bl; + int ret = -ENOENT; + + io_ring_submit_lock(ctx, issue_flags); + bl = io_buffer_get_list(ctx, req->buf_index); + if (unlikely(!bl)) + goto out_unlock; - hlist_for_each_entry(ibf, &ctx->io_buf_list, list) { 
- if (bl->buf_ring == ibf->mem) { - ibf->inuse = 0; - return; + if (bl->flags & IOBL_BUF_RING) { + ret = io_ring_buffers_peek(req, arg, bl); + /* + * Don't recycle these buffers if we need to go through poll. + * Nobody else can use them anyway, and holding on to provided + * buffers for a send/write operation would happen on the app + * side anyway with normal buffers. Besides, we already + * committed them, they cannot be put back in the queue. + */ + if (ret > 0) { + req->flags |= REQ_F_BUFFERS_COMMIT | REQ_F_BL_NO_RECYCLE; + io_kbuf_commit(req, bl, arg->out_len, ret); } + } else { + ret = io_provided_buffers_select(req, &arg->out_len, bl, arg->iovs); + } +out_unlock: + io_ring_submit_unlock(ctx, issue_flags); + return ret; +} + +int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer_list *bl; + int ret; + + lockdep_assert_held(&ctx->uring_lock); + + bl = io_buffer_get_list(ctx, req->buf_index); + if (unlikely(!bl)) + return -ENOENT; + + if (bl->flags & IOBL_BUF_RING) { + ret = io_ring_buffers_peek(req, arg, bl); + if (ret > 0) + req->flags |= REQ_F_BUFFERS_COMMIT; + return ret; } - /* can't happen... */ - WARN_ON_ONCE(1); + /* don't support multiple buffer selections for legacy */ + return io_provided_buffers_select(req, &arg->max_len, bl, arg->iovs); } static int __io_remove_buffers(struct io_ring_ctx *ctx, @@ -262,28 +351,12 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, if (!nbufs) return 0; - if (bl->is_mapped) { + if (bl->flags & IOBL_BUF_RING) { i = bl->buf_ring->tail - bl->head; - if (bl->is_mmap) { - /* - * io_kbuf_list_free() will free the page(s) at - * ->release() time. 
- */ - io_kbuf_mark_free(ctx, bl); - bl->buf_ring = NULL; - bl->is_mmap = 0; - } else if (bl->buf_nr_pages) { - int j; - - for (j = 0; j < bl->buf_nr_pages; j++) - unpin_user_page(bl->buf_pages[j]); - kvfree(bl->buf_pages); - bl->buf_pages = NULL; - bl->buf_nr_pages = 0; - } + io_free_region(ctx, &bl->region); /* make sure it's seen as empty */ INIT_LIST_HEAD(&bl->buf_list); - bl->is_mapped = 0; + bl->flags &= ~IOBL_BUF_RING; return i; } @@ -303,24 +376,29 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, return i; } +static void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) +{ + __io_remove_buffers(ctx, bl, -1U); + kfree(bl); +} + void io_destroy_buffers(struct io_ring_ctx *ctx) { struct io_buffer_list *bl; struct list_head *item, *tmp; struct io_buffer *buf; - unsigned long index; - int i; - for (i = 0; i < BGID_ARRAY; i++) { - if (!ctx->io_bl) - break; - __io_remove_buffers(ctx, &ctx->io_bl[i], -1U); - } + while (1) { + unsigned long index = 0; - xa_for_each(&ctx->io_bl_xa, index, bl) { - xa_erase(&ctx->io_bl_xa, bl->bgid); - __io_remove_buffers(ctx, bl, -1U); - kfree_rcu(bl, rcu); + scoped_guard(mutex, &ctx->mmap_lock) { + bl = xa_find(&ctx->io_bl_xa, &index, ULONG_MAX, XA_PRESENT); + if (bl) + xa_erase(&ctx->io_bl_xa, bl->bgid); + } + if (!bl) + break; + io_put_bl(ctx, bl); } /* @@ -337,6 +415,13 @@ void io_destroy_buffers(struct io_ring_ctx *ctx) } } +static void io_destroy_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) +{ + scoped_guard(mutex, &ctx->mmap_lock) + WARN_ON_ONCE(xa_erase(&ctx->io_bl_xa, bl->bgid) != bl); + io_put_bl(ctx, bl); +} + int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf); @@ -370,7 +455,7 @@ int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) if (bl) { ret = -EINVAL; /* can't use provide/remove buffers command on mapped buffers */ - if (!bl->is_mapped) + if (!(bl->flags & 
IOBL_BUF_RING)) ret = __io_remove_buffers(ctx, bl, p->nbufs); } io_ring_submit_unlock(ctx, issue_flags); @@ -498,12 +583,6 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) io_ring_submit_lock(ctx, issue_flags); - if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) { - ret = io_init_bl_list(ctx); - if (ret) - goto err; - } - bl = io_buffer_get_list(ctx, p->bgid); if (unlikely(!bl)) { bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT); @@ -514,21 +593,12 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) INIT_LIST_HEAD(&bl->buf_list); ret = io_buffer_add_list(ctx, bl, p->bgid); if (ret) { - /* - * Doesn't need rcu free as it was never visible, but - * let's keep it consistent throughout. Also can't - * be a lower indexed array group, as adding one - * where lookup failed cannot happen. - */ - if (p->bgid >= BGID_ARRAY) - kfree_rcu(bl, rcu); - else - WARN_ON_ONCE(1); + kfree(bl); goto err; } } /* can't add buffers via this command for a mapped buffer ring */ - if (bl->is_mapped) { + if (bl->flags & IOBL_BUF_RING) { ret = -EINVAL; goto err; } @@ -543,123 +613,14 @@ err: return IOU_OK; } -static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, - struct io_buffer_list *bl) -{ - struct io_uring_buf_ring *br; - struct page **pages; - int i, nr_pages; - - pages = io_pin_pages(reg->ring_addr, - flex_array_size(br, bufs, reg->ring_entries), - &nr_pages); - if (IS_ERR(pages)) - return PTR_ERR(pages); - - /* - * Apparently some 32-bit boxes (ARM) will return highmem pages, - * which then need to be mapped. We could support that, but it'd - * complicate the code and slowdown the common cases quite a bit. - * So just error out, returning -EINVAL just like we did on kernels - * that didn't support mapped buffer rings. 
- */ - for (i = 0; i < nr_pages; i++) - if (PageHighMem(pages[i])) - goto error_unpin; - - br = page_address(pages[0]); -#ifdef SHM_COLOUR - /* - * On platforms that have specific aliasing requirements, SHM_COLOUR - * is set and we must guarantee that the kernel and user side align - * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and - * the application mmap's the provided ring buffer. Fail the request - * if we, by chance, don't end up with aligned addresses. The app - * should use IOU_PBUF_RING_MMAP instead, and liburing will handle - * this transparently. - */ - if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) - goto error_unpin; -#endif - bl->buf_pages = pages; - bl->buf_nr_pages = nr_pages; - bl->buf_ring = br; - bl->is_mapped = 1; - bl->is_mmap = 0; - return 0; -error_unpin: - for (i = 0; i < nr_pages; i++) - unpin_user_page(pages[i]); - kvfree(pages); - return -EINVAL; -} - -/* - * See if we have a suitable region that we can reuse, rather than allocate - * both a new io_buf_free and mem region again. We leave it on the list as - * even a reused entry will need freeing at ring release. 
- */ -static struct io_buf_free *io_lookup_buf_free_entry(struct io_ring_ctx *ctx, - size_t ring_size) -{ - struct io_buf_free *ibf, *best = NULL; - size_t best_dist; - - hlist_for_each_entry(ibf, &ctx->io_buf_list, list) { - size_t dist; - - if (ibf->inuse || ibf->size < ring_size) - continue; - dist = ibf->size - ring_size; - if (!best || dist < best_dist) { - best = ibf; - if (!dist) - break; - best_dist = dist; - } - } - - return best; -} - -static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx, - struct io_uring_buf_reg *reg, - struct io_buffer_list *bl) -{ - struct io_buf_free *ibf; - size_t ring_size; - void *ptr; - - ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring); - - /* Reuse existing entry, if we can */ - ibf = io_lookup_buf_free_entry(ctx, ring_size); - if (!ibf) { - ptr = io_mem_alloc(ring_size); - if (IS_ERR(ptr)) - return PTR_ERR(ptr); - - /* Allocate and store deferred free entry */ - ibf = kmalloc(sizeof(*ibf), GFP_KERNEL_ACCOUNT); - if (!ibf) { - io_mem_free(ptr); - return -ENOMEM; - } - ibf->mem = ptr; - ibf->size = ring_size; - hlist_add_head(&ibf->list, &ctx->io_buf_list); - } - ibf->inuse = 1; - bl->buf_ring = ibf->mem; - bl->is_mapped = 1; - bl->is_mmap = 1; - return 0; -} - int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) { struct io_uring_buf_reg reg; struct io_buffer_list *bl, *free_bl = NULL; + struct io_uring_region_desc rd; + struct io_uring_buf_ring *br; + unsigned long mmap_offset; + unsigned long ring_size; int ret; lockdep_assert_held(&ctx->uring_lock); @@ -669,56 +630,68 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (reg.resv[0] || reg.resv[1] || reg.resv[2]) return -EINVAL; - if (reg.flags & ~IOU_PBUF_RING_MMAP) + if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC)) return -EINVAL; - if (!(reg.flags & IOU_PBUF_RING_MMAP)) { - if (!reg.ring_addr) - return -EFAULT; - if (reg.ring_addr & ~PAGE_MASK) - return -EINVAL; - } else { - if (reg.ring_addr) - return 
-EINVAL; - } - if (!is_power_of_2(reg.ring_entries)) return -EINVAL; - /* cannot disambiguate full vs empty due to head/tail size */ if (reg.ring_entries >= 65536) return -EINVAL; - if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) { - int ret = io_init_bl_list(ctx); - if (ret) - return ret; - } - bl = io_buffer_get_list(ctx, reg.bgid); if (bl) { /* if mapped buffer ring OR classic exists, don't allow */ - if (bl->is_mapped || !list_empty(&bl->buf_list)) + if (bl->flags & IOBL_BUF_RING || !list_empty(&bl->buf_list)) return -EEXIST; - } else { - free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL); - if (!bl) - return -ENOMEM; + io_destroy_bl(ctx, bl); } - if (!(reg.flags & IOU_PBUF_RING_MMAP)) - ret = io_pin_pbuf_ring(&reg, bl); - else - ret = io_alloc_pbuf_ring(ctx, &reg, bl); + free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL); + if (!bl) + return -ENOMEM; - if (!ret) { - bl->nr_entries = reg.ring_entries; - bl->mask = reg.ring_entries - 1; + mmap_offset = (unsigned long)reg.bgid << IORING_OFF_PBUF_SHIFT; + ring_size = flex_array_size(br, bufs, reg.ring_entries); - io_buffer_add_list(ctx, bl, reg.bgid); - return 0; + memset(&rd, 0, sizeof(rd)); + rd.size = PAGE_ALIGN(ring_size); + if (!(reg.flags & IOU_PBUF_RING_MMAP)) { + rd.user_addr = reg.ring_addr; + rd.flags |= IORING_MEM_REGION_TYPE_USER; + } + ret = io_create_region_mmap_safe(ctx, &bl->region, &rd, mmap_offset); + if (ret) + goto fail; + br = io_region_get_ptr(&bl->region); + +#ifdef SHM_COLOUR + /* + * On platforms that have specific aliasing requirements, SHM_COLOUR + * is set and we must guarantee that the kernel and user side align + * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and + * the application mmap's the provided ring buffer. Fail the request + * if we, by chance, don't end up with aligned addresses. The app + * should use IOU_PBUF_RING_MMAP instead, and liburing will handle + * this transparently. 
+ */ + if (!(reg.flags & IOU_PBUF_RING_MMAP) && + ((reg.ring_addr | (unsigned long)br) & (SHM_COLOUR - 1))) { + ret = -EINVAL; + goto fail; } +#endif - kfree_rcu(free_bl, rcu); + bl->nr_entries = reg.ring_entries; + bl->mask = reg.ring_entries - 1; + bl->flags |= IOBL_BUF_RING; + bl->buf_ring = br; + if (reg.flags & IOU_PBUF_RING_INC) + bl->flags |= IOBL_INC; + io_buffer_add_list(ctx, bl, reg.bgid); + return 0; +fail: + io_free_region(ctx, &bl->region); + kfree(free_bl); return ret; } @@ -739,14 +712,13 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) bl = io_buffer_get_list(ctx, reg.bgid); if (!bl) return -ENOENT; - if (!bl->is_mapped) + if (!(bl->flags & IOBL_BUF_RING)) return -EINVAL; - __io_remove_buffers(ctx, bl, -1U); - if (bl->bgid >= BGID_ARRAY) { + scoped_guard(mutex, &ctx->mmap_lock) xa_erase(&ctx->io_bl_xa, bl->bgid); - kfree_rcu(bl, rcu); - } + + io_put_bl(ctx, bl); return 0; } @@ -766,7 +738,7 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg) bl = io_buffer_get_list(ctx, buf_status.buf_group); if (!bl) return -ENOENT; - if (!bl->is_mapped) + if (!(bl->flags & IOBL_BUF_RING)) return -EINVAL; buf_status.head = bl->head; @@ -776,37 +748,15 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg) return 0; } -void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid) +struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx, + unsigned int bgid) { struct io_buffer_list *bl; - bl = __io_buffer_get_list(ctx, smp_load_acquire(&ctx->io_bl), bgid); + lockdep_assert_held(&ctx->mmap_lock); - if (!bl || !bl->is_mmap) - return NULL; - /* - * Ensure the list is fully setup. Only strictly needed for RCU lookup - * via mmap, and in that case only for the array indexed groups. For - * the xarray lookups, it's either visible and ready, or not at all. 
- */ - if (!smp_load_acquire(&bl->is_ready)) + bl = xa_load(&ctx->io_bl_xa, bgid); + if (!bl || !(bl->flags & IOBL_BUF_RING)) return NULL; - - return bl->buf_ring; -} - -/* - * Called at or after ->release(), free the mmap'ed buffers that we used - * for memory mapped provided buffer rings. - */ -void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx) -{ - struct io_buf_free *ibf; - struct hlist_node *tmp; - - hlist_for_each_entry_safe(ibf, tmp, &ctx->io_buf_list, list) { - hlist_del(&ibf->list); - io_mem_free(ibf->mem); - kfree(ibf); - } + return &bl->region; } diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 53dfaa71a397..bd80c44c5af1 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -3,6 +3,14 @@ #define IOU_KBUF_H #include <uapi/linux/io_uring.h> +#include <linux/io_uring_types.h> + +enum { + /* ring mapped provided buffers */ + IOBL_BUF_RING = 1, + /* buffers are consumed incrementally rather than always fully */ + IOBL_INC = 2, +}; struct io_buffer_list { /* @@ -11,11 +19,7 @@ struct io_buffer_list { */ union { struct list_head buf_list; - struct { - struct page **buf_pages; - struct io_uring_buf_ring *buf_ring; - }; - struct rcu_head rcu; + struct io_uring_buf_ring *buf_ring; }; __u16 bgid; @@ -25,12 +29,9 @@ struct io_buffer_list { __u16 head; __u16 mask; - /* ring mapped provided buffers */ - __u8 is_mapped; - /* ring mapped provided buffers, but mmap'ed by application */ - __u8 is_mmap; - /* bl is visible from an RCU point of view for lookup */ - __u8 is_ready; + __u16 flags; + + struct io_mapped_region region; }; struct io_buffer { @@ -41,8 +42,26 @@ struct io_buffer { __u16 bgid; }; +enum { + /* can alloc a bigger vec */ + KBUF_MODE_EXPAND = 1, + /* if bigger vec allocated, free old one */ + KBUF_MODE_FREE = 2, +}; + +struct buf_sel_arg { + struct iovec *iovs; + size_t out_len; + size_t max_len; + unsigned short nr_iovs; + unsigned short mode; +}; + void __user *io_buffer_select(struct io_kiocb *req, size_t *len, unsigned int issue_flags); +int 
io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, + unsigned int issue_flags); +int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg); void io_destroy_buffers(struct io_ring_ctx *ctx); int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); @@ -55,13 +74,12 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg); -void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx); - -unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags); +void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags); bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); -void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid); +struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx, + unsigned int bgid); static inline bool io_kbuf_recycle_ring(struct io_kiocb *req) { @@ -73,21 +91,9 @@ static inline bool io_kbuf_recycle_ring(struct io_kiocb *req) * to monopolize the buffer. */ if (req->buf_list) { - if (req->flags & REQ_F_PARTIAL_IO) { - /* - * If we end up here, then the io_uring_lock has - * been kept held since we retrieved the buffer. - * For the io-wq case, we already cleared - * req->buf_list when the buffer was retrieved, - * hence it cannot be set here for that case. 
- */ - req->buf_list->head++; - req->buf_list = NULL; - } else { - req->buf_index = req->buf_list->bgid; - req->flags &= ~REQ_F_BUFFER_RING; - return true; - } + req->buf_index = req->buf_list->bgid; + req->flags &= ~(REQ_F_BUFFER_RING|REQ_F_BUFFERS_COMMIT); + return true; } return false; } @@ -101,6 +107,8 @@ static inline bool io_do_buffer_select(struct io_kiocb *req) static inline bool io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) { + if (req->flags & REQ_F_BL_NO_RECYCLE) + return false; if (req->flags & REQ_F_BUFFER_SELECTED) return io_kbuf_recycle_legacy(req, issue_flags); if (req->flags & REQ_F_BUFFER_RING) @@ -108,41 +116,100 @@ static inline bool io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) return false; } -static inline unsigned int __io_put_kbuf_list(struct io_kiocb *req, - struct list_head *list) +/* Mapped buffer ring, return io_uring_buf from head */ +#define io_ring_head_to_buf(br, head, mask) &(br)->bufs[(head) & (mask)] + +static inline bool io_kbuf_commit(struct io_kiocb *req, + struct io_buffer_list *bl, int len, int nr) { - unsigned int ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); + if (unlikely(!(req->flags & REQ_F_BUFFERS_COMMIT))) + return true; - if (req->flags & REQ_F_BUFFER_RING) { - if (req->buf_list) { - req->buf_index = req->buf_list->bgid; - req->buf_list->head++; + req->flags &= ~REQ_F_BUFFERS_COMMIT; + + if (unlikely(len < 0)) + return true; + + if (bl->flags & IOBL_INC) { + struct io_uring_buf *buf; + + buf = io_ring_head_to_buf(bl->buf_ring, bl->head, bl->mask); + if (WARN_ON_ONCE(len > buf->len)) + len = buf->len; + buf->len -= len; + if (buf->len) { + buf->addr += len; + return false; } - req->flags &= ~REQ_F_BUFFER_RING; + } + + bl->head += nr; + return true; +} + +static inline bool __io_put_kbuf_ring(struct io_kiocb *req, int len, int nr) +{ + struct io_buffer_list *bl = req->buf_list; + bool ret = true; + + if (bl) { + ret = io_kbuf_commit(req, bl, len, nr); + 
req->buf_index = bl->bgid; + } + req->flags &= ~REQ_F_BUFFER_RING; + return ret; +} + +static inline void __io_put_kbuf_list(struct io_kiocb *req, int len, + struct list_head *list) +{ + if (req->flags & REQ_F_BUFFER_RING) { + __io_put_kbuf_ring(req, len, 1); } else { req->buf_index = req->kbuf->bgid; list_add(&req->kbuf->list, list); req->flags &= ~REQ_F_BUFFER_SELECTED; } - - return ret; } -static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req) +static inline void io_kbuf_drop(struct io_kiocb *req) { lockdep_assert_held(&req->ctx->completion_lock); if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) + return; + + /* len == 0 is fine here, non-ring will always drop all of it */ + __io_put_kbuf_list(req, 0, &req->ctx->io_buffers_comp); +} + +static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int len, + int nbufs, unsigned issue_flags) +{ + unsigned int ret; + + if (!(req->flags & (REQ_F_BUFFER_RING | REQ_F_BUFFER_SELECTED))) return 0; - return __io_put_kbuf_list(req, &req->ctx->io_buffers_comp); + + ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); + if (req->flags & REQ_F_BUFFER_RING) { + if (!__io_put_kbuf_ring(req, len, nbufs)) + ret |= IORING_CQE_F_BUF_MORE; + } else { + __io_put_kbuf(req, len, issue_flags); + } + return ret; } -static inline unsigned int io_put_kbuf(struct io_kiocb *req, +static inline unsigned int io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags) { + return __io_put_kbufs(req, len, 1, issue_flags); +} - if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) - return 0; - return __io_put_kbuf(req, issue_flags); +static inline unsigned int io_put_kbufs(struct io_kiocb *req, int len, + int nbufs, unsigned issue_flags) +{ + return __io_put_kbufs(req, len, nbufs, issue_flags); } #endif diff --git a/io_uring/memmap.c b/io_uring/memmap.c new file mode 100644 index 000000000000..361134544427 --- /dev/null +++ b/io_uring/memmap.c @@ -0,0 +1,416 @@ +// 
SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/errno.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/io_uring.h> +#include <linux/io_uring_types.h> +#include <asm/shmparam.h> + +#include "memmap.h" +#include "kbuf.h" +#include "rsrc.h" + +static void *io_mem_alloc_compound(struct page **pages, int nr_pages, + size_t size, gfp_t gfp) +{ + struct page *page; + int i, order; + + order = get_order(size); + if (order > MAX_PAGE_ORDER) + return ERR_PTR(-ENOMEM); + else if (order) + gfp |= __GFP_COMP; + + page = alloc_pages(gfp, order); + if (!page) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < nr_pages; i++) + pages[i] = page + i; + + return page_address(page); +} + +struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages) +{ + unsigned long start, end, nr_pages; + struct page **pages; + int ret; + + if (check_add_overflow(uaddr, len, &end)) + return ERR_PTR(-EOVERFLOW); + if (check_add_overflow(end, PAGE_SIZE - 1, &end)) + return ERR_PTR(-EOVERFLOW); + + end = end >> PAGE_SHIFT; + start = uaddr >> PAGE_SHIFT; + nr_pages = end - start; + if (WARN_ON_ONCE(!nr_pages)) + return ERR_PTR(-EINVAL); + if (WARN_ON_ONCE(nr_pages > INT_MAX)) + return ERR_PTR(-EOVERFLOW); + + pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); + if (!pages) + return ERR_PTR(-ENOMEM); + + ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM, + pages); + /* success, mapped all pages */ + if (ret == nr_pages) { + *npages = nr_pages; + return pages; + } + + /* partial map, or didn't map anything */ + if (ret >= 0) { + /* if we did partial map, release any pages we did get */ + if (ret) + unpin_user_pages(pages, ret); + ret = -EFAULT; + } + kvfree(pages); + return ERR_PTR(ret); +} + +enum { + /* memory was vmap'ed for the kernel, freeing the region vunmap's it */ + IO_REGION_F_VMAP = 1, + /* memory is provided by user 
and pinned by the kernel */ + IO_REGION_F_USER_PROVIDED = 2, + /* only the first page in the array is ref'ed */ + IO_REGION_F_SINGLE_REF = 4, +}; + +void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr) +{ + if (mr->pages) { + long nr_refs = mr->nr_pages; + + if (mr->flags & IO_REGION_F_SINGLE_REF) + nr_refs = 1; + + if (mr->flags & IO_REGION_F_USER_PROVIDED) + unpin_user_pages(mr->pages, nr_refs); + else + release_pages(mr->pages, nr_refs); + + kvfree(mr->pages); + } + if ((mr->flags & IO_REGION_F_VMAP) && mr->ptr) + vunmap(mr->ptr); + if (mr->nr_pages && ctx->user) + __io_unaccount_mem(ctx->user, mr->nr_pages); + + memset(mr, 0, sizeof(*mr)); +} + +static int io_region_init_ptr(struct io_mapped_region *mr) +{ + struct io_imu_folio_data ifd; + void *ptr; + + if (io_check_coalesce_buffer(mr->pages, mr->nr_pages, &ifd)) { + if (ifd.nr_folios == 1) { + mr->ptr = page_address(mr->pages[0]); + return 0; + } + } + ptr = vmap(mr->pages, mr->nr_pages, VM_MAP, PAGE_KERNEL); + if (!ptr) + return -ENOMEM; + + mr->ptr = ptr; + mr->flags |= IO_REGION_F_VMAP; + return 0; +} + +static int io_region_pin_pages(struct io_ring_ctx *ctx, + struct io_mapped_region *mr, + struct io_uring_region_desc *reg) +{ + unsigned long size = mr->nr_pages << PAGE_SHIFT; + struct page **pages; + int nr_pages; + + pages = io_pin_pages(reg->user_addr, size, &nr_pages); + if (IS_ERR(pages)) + return PTR_ERR(pages); + if (WARN_ON_ONCE(nr_pages != mr->nr_pages)) + return -EFAULT; + + mr->pages = pages; + mr->flags |= IO_REGION_F_USER_PROVIDED; + return 0; +} + +static int io_region_allocate_pages(struct io_ring_ctx *ctx, + struct io_mapped_region *mr, + struct io_uring_region_desc *reg, + unsigned long mmap_offset) +{ + gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN; + unsigned long size = mr->nr_pages << PAGE_SHIFT; + unsigned long nr_allocated; + struct page **pages; + void *p; + + pages = kvmalloc_array(mr->nr_pages, sizeof(*pages), gfp); + if (!pages) + return 
-ENOMEM; + + p = io_mem_alloc_compound(pages, mr->nr_pages, size, gfp); + if (!IS_ERR(p)) { + mr->flags |= IO_REGION_F_SINGLE_REF; + goto done; + } + + nr_allocated = alloc_pages_bulk_node(gfp, NUMA_NO_NODE, + mr->nr_pages, pages); + if (nr_allocated != mr->nr_pages) { + if (nr_allocated) + release_pages(pages, nr_allocated); + kvfree(pages); + return -ENOMEM; + } +done: + reg->mmap_offset = mmap_offset; + mr->pages = pages; + return 0; +} + +int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, + struct io_uring_region_desc *reg, + unsigned long mmap_offset) +{ + int nr_pages, ret; + u64 end; + + if (WARN_ON_ONCE(mr->pages || mr->ptr || mr->nr_pages)) + return -EFAULT; + if (memchr_inv(&reg->__resv, 0, sizeof(reg->__resv))) + return -EINVAL; + if (reg->flags & ~IORING_MEM_REGION_TYPE_USER) + return -EINVAL; + /* user_addr should be set IFF it's a user memory backed region */ + if ((reg->flags & IORING_MEM_REGION_TYPE_USER) != !!reg->user_addr) + return -EFAULT; + if (!reg->size || reg->mmap_offset || reg->id) + return -EINVAL; + if ((reg->size >> PAGE_SHIFT) > INT_MAX) + return -E2BIG; + if ((reg->user_addr | reg->size) & ~PAGE_MASK) + return -EINVAL; + if (check_add_overflow(reg->user_addr, reg->size, &end)) + return -EOVERFLOW; + + nr_pages = reg->size >> PAGE_SHIFT; + if (ctx->user) { + ret = __io_account_mem(ctx->user, nr_pages); + if (ret) + return ret; + } + mr->nr_pages = nr_pages; + + if (reg->flags & IORING_MEM_REGION_TYPE_USER) + ret = io_region_pin_pages(ctx, mr, reg); + else + ret = io_region_allocate_pages(ctx, mr, reg, mmap_offset); + if (ret) + goto out_free; + + ret = io_region_init_ptr(mr); + if (ret) + goto out_free; + return 0; +out_free: + io_free_region(ctx, mr); + return ret; +} + +int io_create_region_mmap_safe(struct io_ring_ctx *ctx, struct io_mapped_region *mr, + struct io_uring_region_desc *reg, + unsigned long mmap_offset) +{ + struct io_mapped_region tmp_mr; + int ret; + + memcpy(&tmp_mr, mr, sizeof(tmp_mr)); + ret = 
io_create_region(ctx, &tmp_mr, reg, mmap_offset); + if (ret) + return ret; + + /* + * Once published mmap can find it without holding only the ->mmap_lock + * and not ->uring_lock. + */ + guard(mutex)(&ctx->mmap_lock); + memcpy(mr, &tmp_mr, sizeof(tmp_mr)); + return 0; +} + +static struct io_mapped_region *io_mmap_get_region(struct io_ring_ctx *ctx, + loff_t pgoff) +{ + loff_t offset = pgoff << PAGE_SHIFT; + unsigned int bgid; + + switch (offset & IORING_OFF_MMAP_MASK) { + case IORING_OFF_SQ_RING: + case IORING_OFF_CQ_RING: + return &ctx->ring_region; + case IORING_OFF_SQES: + return &ctx->sq_region; + case IORING_OFF_PBUF_RING: + bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; + return io_pbuf_get_region(ctx, bgid); + case IORING_MAP_OFF_PARAM_REGION: + return &ctx->param_region; + } + return NULL; +} + +static void *io_region_validate_mmap(struct io_ring_ctx *ctx, + struct io_mapped_region *mr) +{ + lockdep_assert_held(&ctx->mmap_lock); + + if (!io_region_is_set(mr)) + return ERR_PTR(-EINVAL); + if (mr->flags & IO_REGION_F_USER_PROVIDED) + return ERR_PTR(-EINVAL); + + return io_region_get_ptr(mr); +} + +static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff, + size_t sz) +{ + struct io_ring_ctx *ctx = file->private_data; + struct io_mapped_region *region; + + region = io_mmap_get_region(ctx, pgoff); + if (!region) + return ERR_PTR(-EINVAL); + return io_region_validate_mmap(ctx, region); +} + +#ifdef CONFIG_MMU + +static int io_region_mmap(struct io_ring_ctx *ctx, + struct io_mapped_region *mr, + struct vm_area_struct *vma, + unsigned max_pages) +{ + unsigned long nr_pages = min(mr->nr_pages, max_pages); + + vm_flags_set(vma, VM_DONTEXPAND); + return vm_insert_pages(vma, vma->vm_start, mr->pages, &nr_pages); +} + +__cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct io_ring_ctx *ctx = file->private_data; + size_t sz = vma->vm_end - vma->vm_start; + long offset = vma->vm_pgoff << PAGE_SHIFT; + 
unsigned int page_limit = UINT_MAX; + struct io_mapped_region *region; + void *ptr; + + guard(mutex)(&ctx->mmap_lock); + + ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + + switch (offset & IORING_OFF_MMAP_MASK) { + case IORING_OFF_SQ_RING: + case IORING_OFF_CQ_RING: + page_limit = (sz + PAGE_SIZE - 1) >> PAGE_SHIFT; + break; + } + + region = io_mmap_get_region(ctx, vma->vm_pgoff); + return io_region_mmap(ctx, region, vma, page_limit); +} + +unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + struct io_ring_ctx *ctx = filp->private_data; + void *ptr; + + /* + * Do not allow to map to user-provided address to avoid breaking the + * aliasing rules. Userspace is not able to guess the offset address of + * kernel kmalloc()ed memory area. + */ + if (addr) + return -EINVAL; + + guard(mutex)(&ctx->mmap_lock); + + ptr = io_uring_validate_mmap_request(filp, pgoff, len); + if (IS_ERR(ptr)) + return -ENOMEM; + + /* + * Some architectures have strong cache aliasing requirements. + * For such architectures we need a coherent mapping which aliases + * kernel memory *and* userspace memory. To achieve that: + * - use a NULL file pointer to reference physical memory, and + * - use the kernel virtual address of the shared io_uring context + * (instead of the userspace-provided address, which has to be 0UL + * anyway). + * - use the same pgoff which the get_unmapped_area() uses to + * calculate the page colouring. + * For architectures without such aliasing requirements, the + * architecture will return any suitable mapping because addr is 0. 
+ */ + filp = NULL; + flags |= MAP_SHARED; + pgoff = 0; /* has been translated to ptr above */ +#ifdef SHM_COLOUR + addr = (uintptr_t) ptr; + pgoff = addr >> PAGE_SHIFT; +#else + addr = 0UL; +#endif + return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags); +} + +#else /* !CONFIG_MMU */ + +int io_uring_mmap(struct file *file, struct vm_area_struct *vma) +{ + return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -EINVAL; +} + +unsigned int io_uring_nommu_mmap_capabilities(struct file *file) +{ + return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE; +} + +unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + struct io_ring_ctx *ctx = file->private_data; + void *ptr; + + guard(mutex)(&ctx->mmap_lock); + + ptr = io_uring_validate_mmap_request(file, pgoff, len); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + + return (unsigned long) ptr; +} + +#endif /* !CONFIG_MMU */ diff --git a/io_uring/memmap.h b/io_uring/memmap.h new file mode 100644 index 000000000000..c898dcba2b4e --- /dev/null +++ b/io_uring/memmap.h @@ -0,0 +1,36 @@ +#ifndef IO_URING_MEMMAP_H +#define IO_URING_MEMMAP_H + +#define IORING_MAP_OFF_PARAM_REGION 0x20000000ULL + +struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); + +#ifndef CONFIG_MMU +unsigned int io_uring_nommu_mmap_capabilities(struct file *file); +#endif +unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags); +int io_uring_mmap(struct file *file, struct vm_area_struct *vma); + +void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr); +int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, + struct io_uring_region_desc *reg, + unsigned long mmap_offset); + +int io_create_region_mmap_safe(struct io_ring_ctx *ctx, + struct io_mapped_region *mr, + struct io_uring_region_desc *reg, + 
unsigned long mmap_offset); + +static inline void *io_region_get_ptr(struct io_mapped_region *mr) +{ + return mr->ptr; +} + +static inline bool io_region_is_set(struct io_mapped_region *mr) +{ + return !!mr->nr_pages; +} + +#endif diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index cd6dcf634ba3..7e6f68e911f1 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -11,9 +11,9 @@ #include "io_uring.h" #include "rsrc.h" #include "filetable.h" +#include "alloc_cache.h" #include "msg_ring.h" - /* All valid masks for MSG_RING */ #define IORING_MSG_RING_MASK (IORING_MSG_RING_CQE_SKIP | \ IORING_MSG_RING_FLAGS_PASS) @@ -68,65 +68,74 @@ void io_msg_ring_cleanup(struct io_kiocb *req) static inline bool io_msg_need_remote(struct io_ring_ctx *target_ctx) { - if (!target_ctx->task_complete) - return false; - return current != target_ctx->submitter_task; + return target_ctx->task_complete; } -static int io_msg_exec_remote(struct io_kiocb *req, task_work_func_t func) +static void io_msg_tw_complete(struct io_kiocb *req, struct io_tw_state *ts) { - struct io_ring_ctx *ctx = req->file->private_data; - struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); - struct task_struct *task = READ_ONCE(ctx->submitter_task); + struct io_ring_ctx *ctx = req->ctx; - if (unlikely(!task)) - return -EOWNERDEAD; + io_add_aux_cqe(ctx, req->cqe.user_data, req->cqe.res, req->cqe.flags); + if (spin_trylock(&ctx->msg_lock)) { + if (io_alloc_cache_put(&ctx->msg_cache, req)) + req = NULL; + spin_unlock(&ctx->msg_lock); + } + if (req) + kmem_cache_free(req_cachep, req); + percpu_ref_put(&ctx->refs); +} - init_task_work(&msg->tw, func); - if (task_work_add(ctx->submitter_task, &msg->tw, TWA_SIGNAL)) +static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req, + int res, u32 cflags, u64 user_data) +{ + if (!READ_ONCE(ctx->submitter_task)) { + kmem_cache_free(req_cachep, req); return -EOWNERDEAD; - - return IOU_ISSUE_SKIP_COMPLETE; + } + req->cqe.user_data = user_data; + 
io_req_set_res(req, res, cflags); + percpu_ref_get(&ctx->refs); + req->ctx = ctx; + req->tctx = NULL; + req->io_task_work.func = io_msg_tw_complete; + io_req_task_work_add_remote(req, ctx, IOU_F_TWQ_LAZY_WAKE); + return 0; } -static void io_msg_tw_complete(struct callback_head *head) +static struct io_kiocb *io_msg_get_kiocb(struct io_ring_ctx *ctx) { - struct io_msg *msg = container_of(head, struct io_msg, tw); - struct io_kiocb *req = cmd_to_io_kiocb(msg); - struct io_ring_ctx *target_ctx = req->file->private_data; - int ret = 0; - - if (current->flags & PF_EXITING) { - ret = -EOWNERDEAD; - } else { - u32 flags = 0; - - if (msg->flags & IORING_MSG_RING_FLAGS_PASS) - flags = msg->cqe_flags; - - /* - * If the target ring is using IOPOLL mode, then we need to be - * holding the uring_lock for posting completions. Other ring - * types rely on the regular completion locking, which is - * handled while posting. - */ - if (target_ctx->flags & IORING_SETUP_IOPOLL) - mutex_lock(&target_ctx->uring_lock); - if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags)) - ret = -EOVERFLOW; - if (target_ctx->flags & IORING_SETUP_IOPOLL) - mutex_unlock(&target_ctx->uring_lock); + struct io_kiocb *req = NULL; + + if (spin_trylock(&ctx->msg_lock)) { + req = io_alloc_cache_get(&ctx->msg_cache); + spin_unlock(&ctx->msg_lock); + if (req) + return req; } + return kmem_cache_alloc(req_cachep, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO); +} - if (ret < 0) - req_set_fail(req); - io_req_queue_tw_complete(req, ret); +static int io_msg_data_remote(struct io_ring_ctx *target_ctx, + struct io_msg *msg) +{ + struct io_kiocb *target; + u32 flags = 0; + + target = io_msg_get_kiocb(target_ctx); + if (unlikely(!target)) + return -ENOMEM; + + if (msg->flags & IORING_MSG_RING_FLAGS_PASS) + flags = msg->cqe_flags; + + return io_msg_remote_post(target_ctx, target, msg->len, flags, + msg->user_data); } -static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags) +static int 
__io_msg_ring_data(struct io_ring_ctx *target_ctx, + struct io_msg *msg, unsigned int issue_flags) { - struct io_ring_ctx *target_ctx = req->file->private_data; - struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); u32 flags = 0; int ret; @@ -138,7 +147,7 @@ static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags) return -EBADFD; if (io_msg_need_remote(target_ctx)) - return io_msg_exec_remote(req, io_msg_tw_complete); + return io_msg_data_remote(target_ctx, msg); if (msg->flags & IORING_MSG_RING_FLAGS_PASS) flags = msg->cqe_flags; @@ -147,32 +156,40 @@ static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags) if (target_ctx->flags & IORING_SETUP_IOPOLL) { if (unlikely(io_double_lock_ctx(target_ctx, issue_flags))) return -EAGAIN; - if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags)) - ret = 0; - io_double_unlock_ctx(target_ctx); - } else { - if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags)) - ret = 0; } + if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags)) + ret = 0; + if (target_ctx->flags & IORING_SETUP_IOPOLL) + io_double_unlock_ctx(target_ctx); return ret; } -static struct file *io_msg_grab_file(struct io_kiocb *req, unsigned int issue_flags) +static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_ring_ctx *target_ctx = req->file->private_data; + struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); + + return __io_msg_ring_data(target_ctx, msg, issue_flags); +} + +static int io_msg_grab_file(struct io_kiocb *req, unsigned int issue_flags) { struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); struct io_ring_ctx *ctx = req->ctx; - struct file *file = NULL; - int idx = msg->src_fd; + struct io_rsrc_node *node; + int ret = -EBADF; io_ring_submit_lock(ctx, issue_flags); - if (likely(idx < ctx->nr_user_files)) { - idx = array_index_nospec(idx, ctx->nr_user_files); - file = io_file_from_index(&ctx->file_table, idx); - if (file) - 
get_file(file); + node = io_rsrc_node_lookup(&ctx->file_table.data, msg->src_fd); + if (node) { + msg->src_file = io_slot_file(node); + if (msg->src_file) + get_file(msg->src_file); + req->flags |= REQ_F_NEED_CLEANUP; + ret = 0; } io_ring_submit_unlock(ctx, issue_flags); - return file; + return ret; } static int io_msg_install_complete(struct io_kiocb *req, unsigned int issue_flags) @@ -220,12 +237,27 @@ static void io_msg_tw_fd_complete(struct callback_head *head) io_req_queue_tw_complete(req, ret); } +static int io_msg_fd_remote(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->file->private_data; + struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); + struct task_struct *task = READ_ONCE(ctx->submitter_task); + + if (unlikely(!task)) + return -EOWNERDEAD; + + init_task_work(&msg->tw, io_msg_tw_fd_complete); + if (task_work_add(task, &msg->tw, TWA_SIGNAL)) + return -EOWNERDEAD; + + return IOU_ISSUE_SKIP_COMPLETE; +} + static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *target_ctx = req->file->private_data; struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); struct io_ring_ctx *ctx = req->ctx; - struct file *src_file = msg->src_file; if (msg->len) return -EINVAL; @@ -233,23 +265,19 @@ static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags) return -EINVAL; if (target_ctx->flags & IORING_SETUP_R_DISABLED) return -EBADFD; - if (!src_file) { - src_file = io_msg_grab_file(req, issue_flags); - if (!src_file) - return -EBADF; - msg->src_file = src_file; - req->flags |= REQ_F_NEED_CLEANUP; + if (!msg->src_file) { + int ret = io_msg_grab_file(req, issue_flags); + if (unlikely(ret)) + return ret; } if (io_msg_need_remote(target_ctx)) - return io_msg_exec_remote(req, io_msg_tw_fd_complete); + return io_msg_fd_remote(req); return io_msg_install_complete(req, issue_flags); } -int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int __io_msg_ring_prep(struct io_msg 
*msg, const struct io_uring_sqe *sqe) { - struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); - if (unlikely(sqe->buf_index || sqe->personality)) return -EINVAL; @@ -266,6 +294,11 @@ int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } +int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return __io_msg_ring_prep(io_kiocb_to_cmd(req, struct io_msg), sqe); +} + int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) { struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); @@ -296,3 +329,28 @@ done: io_req_set_res(req, ret, 0); return IOU_OK; } + +int io_uring_sync_msg_ring(struct io_uring_sqe *sqe) +{ + struct io_msg io_msg = { }; + int ret; + + ret = __io_msg_ring_prep(&io_msg, sqe); + if (unlikely(ret)) + return ret; + + /* + * Only data sending supported, not IORING_MSG_SEND_FD as that one + * doesn't make sense without a source ring to send files from. + */ + if (io_msg.cmd != IORING_MSG_DATA) + return -EINVAL; + + CLASS(fd, f)(sqe->fd); + if (fd_empty(f)) + return -EBADF; + if (!io_is_uring_fops(fd_file(f))) + return -EBADFD; + return __io_msg_ring_data(fd_file(f)->private_data, + &io_msg, IO_URING_F_UNLOCKED); +} diff --git a/io_uring/msg_ring.h b/io_uring/msg_ring.h index 3987ee6c0e5f..32236d2fb778 100644 --- a/io_uring/msg_ring.h +++ b/io_uring/msg_ring.h @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +int io_uring_sync_msg_ring(struct io_uring_sqe *sqe); int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags); void io_msg_ring_cleanup(struct io_kiocb *req); diff --git a/io_uring/napi.c b/io_uring/napi.c new file mode 100644 index 000000000000..b1ade3fda30f --- /dev/null +++ b/io_uring/napi.c @@ -0,0 +1,396 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "io_uring.h" +#include "napi.h" + +#ifdef CONFIG_NET_RX_BUSY_POLL + +/* Timeout for cleanout of stale entries. 
*/ +#define NAPI_TIMEOUT (60 * SEC_CONVERSION) + +struct io_napi_entry { + unsigned int napi_id; + struct list_head list; + + unsigned long timeout; + struct hlist_node node; + + struct rcu_head rcu; +}; + +static struct io_napi_entry *io_napi_hash_find(struct hlist_head *hash_list, + unsigned int napi_id) +{ + struct io_napi_entry *e; + + hlist_for_each_entry_rcu(e, hash_list, node) { + if (e->napi_id != napi_id) + continue; + return e; + } + + return NULL; +} + +static inline ktime_t net_to_ktime(unsigned long t) +{ + /* napi approximating usecs, reverse busy_loop_current_time */ + return ns_to_ktime(t << 10); +} + +int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id) +{ + struct hlist_head *hash_list; + struct io_napi_entry *e; + + /* Non-NAPI IDs can be rejected. */ + if (napi_id < MIN_NAPI_ID) + return -EINVAL; + + hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))]; + + scoped_guard(rcu) { + e = io_napi_hash_find(hash_list, napi_id); + if (e) { + WRITE_ONCE(e->timeout, jiffies + NAPI_TIMEOUT); + return -EEXIST; + } + } + + e = kmalloc(sizeof(*e), GFP_NOWAIT); + if (!e) + return -ENOMEM; + + e->napi_id = napi_id; + e->timeout = jiffies + NAPI_TIMEOUT; + + /* + * guard(spinlock) is not used to manually unlock it before calling + * kfree() + */ + spin_lock(&ctx->napi_lock); + if (unlikely(io_napi_hash_find(hash_list, napi_id))) { + spin_unlock(&ctx->napi_lock); + kfree(e); + return -EEXIST; + } + + hlist_add_tail_rcu(&e->node, hash_list); + list_add_tail_rcu(&e->list, &ctx->napi_list); + spin_unlock(&ctx->napi_lock); + return 0; +} + +static int __io_napi_del_id(struct io_ring_ctx *ctx, unsigned int napi_id) +{ + struct hlist_head *hash_list; + struct io_napi_entry *e; + + /* Non-NAPI IDs can be rejected. 
*/ + if (napi_id < MIN_NAPI_ID) + return -EINVAL; + + hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))]; + guard(spinlock)(&ctx->napi_lock); + e = io_napi_hash_find(hash_list, napi_id); + if (!e) + return -ENOENT; + + list_del_rcu(&e->list); + hash_del_rcu(&e->node); + kfree_rcu(e, rcu); + return 0; +} + +static void __io_napi_remove_stale(struct io_ring_ctx *ctx) +{ + struct io_napi_entry *e; + + guard(spinlock)(&ctx->napi_lock); + /* + * list_for_each_entry_safe() is not required as long as: + * 1. list_del_rcu() does not reset the deleted node next pointer + * 2. kfree_rcu() delays the memory freeing until the next quiescent + * state + */ + list_for_each_entry(e, &ctx->napi_list, list) { + if (time_after(jiffies, READ_ONCE(e->timeout))) { + list_del_rcu(&e->list); + hash_del_rcu(&e->node); + kfree_rcu(e, rcu); + } + } +} + +static inline void io_napi_remove_stale(struct io_ring_ctx *ctx, bool is_stale) +{ + if (is_stale) + __io_napi_remove_stale(ctx); +} + +static inline bool io_napi_busy_loop_timeout(ktime_t start_time, + ktime_t bp) +{ + if (bp) { + ktime_t end_time = ktime_add(start_time, bp); + ktime_t now = net_to_ktime(busy_loop_current_time()); + + return ktime_after(now, end_time); + } + + return true; +} + +static bool io_napi_busy_loop_should_end(void *data, + unsigned long start_time) +{ + struct io_wait_queue *iowq = data; + + if (signal_pending(current)) + return true; + if (io_should_wake(iowq) || io_has_work(iowq->ctx)) + return true; + if (io_napi_busy_loop_timeout(net_to_ktime(start_time), + iowq->napi_busy_poll_dt)) + return true; + + return false; +} + +/* + * never report stale entries + */ +static bool static_tracking_do_busy_loop(struct io_ring_ctx *ctx, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg) +{ + struct io_napi_entry *e; + + list_for_each_entry_rcu(e, &ctx->napi_list, list) + napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg, + ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET); + return false; 
+} + +static bool +dynamic_tracking_do_busy_loop(struct io_ring_ctx *ctx, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg) +{ + struct io_napi_entry *e; + bool is_stale = false; + + list_for_each_entry_rcu(e, &ctx->napi_list, list) { + napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg, + ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET); + + if (time_after(jiffies, READ_ONCE(e->timeout))) + is_stale = true; + } + + return is_stale; +} + +static inline bool +__io_napi_do_busy_loop(struct io_ring_ctx *ctx, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg) +{ + if (READ_ONCE(ctx->napi_track_mode) == IO_URING_NAPI_TRACKING_STATIC) + return static_tracking_do_busy_loop(ctx, loop_end, loop_end_arg); + return dynamic_tracking_do_busy_loop(ctx, loop_end, loop_end_arg); +} + +static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx, + struct io_wait_queue *iowq) +{ + unsigned long start_time = busy_loop_current_time(); + bool (*loop_end)(void *, unsigned long) = NULL; + void *loop_end_arg = NULL; + bool is_stale = false; + + /* Singular lists use a different napi loop end check function and are + * only executed once. + */ + if (list_is_singular(&ctx->napi_list)) { + loop_end = io_napi_busy_loop_should_end; + loop_end_arg = iowq; + } + + scoped_guard(rcu) { + do { + is_stale = __io_napi_do_busy_loop(ctx, loop_end, + loop_end_arg); + } while (!io_napi_busy_loop_should_end(iowq, start_time) && + !loop_end_arg); + } + + io_napi_remove_stale(ctx, is_stale); +} + +/* + * io_napi_init() - Init napi settings + * @ctx: pointer to io-uring context structure + * + * Init napi settings in the io-uring context. 
+ */ +void io_napi_init(struct io_ring_ctx *ctx) +{ + u64 sys_dt = READ_ONCE(sysctl_net_busy_poll) * NSEC_PER_USEC; + + INIT_LIST_HEAD(&ctx->napi_list); + spin_lock_init(&ctx->napi_lock); + ctx->napi_prefer_busy_poll = false; + ctx->napi_busy_poll_dt = ns_to_ktime(sys_dt); + ctx->napi_track_mode = IO_URING_NAPI_TRACKING_INACTIVE; +} + +/* + * io_napi_free() - Deallocate napi + * @ctx: pointer to io-uring context structure + * + * Free the napi list and the hash table in the io-uring context. + */ +void io_napi_free(struct io_ring_ctx *ctx) +{ + struct io_napi_entry *e; + + guard(spinlock)(&ctx->napi_lock); + list_for_each_entry(e, &ctx->napi_list, list) { + hash_del_rcu(&e->node); + kfree_rcu(e, rcu); + } + INIT_LIST_HEAD_RCU(&ctx->napi_list); +} + +static int io_napi_register_napi(struct io_ring_ctx *ctx, + struct io_uring_napi *napi) +{ + switch (napi->op_param) { + case IO_URING_NAPI_TRACKING_DYNAMIC: + case IO_URING_NAPI_TRACKING_STATIC: + break; + default: + return -EINVAL; + } + /* clean the napi list for new settings */ + io_napi_free(ctx); + WRITE_ONCE(ctx->napi_track_mode, napi->op_param); + WRITE_ONCE(ctx->napi_busy_poll_dt, napi->busy_poll_to * NSEC_PER_USEC); + WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi->prefer_busy_poll); + return 0; +} + +/* + * io_register_napi() - Register napi with io-uring + * @ctx: pointer to io-uring context structure + * @arg: pointer to io_uring_napi structure + * + * Register napi in the io-uring context. 
+ */ +int io_register_napi(struct io_ring_ctx *ctx, void __user *arg) +{ + const struct io_uring_napi curr = { + .busy_poll_to = ktime_to_us(ctx->napi_busy_poll_dt), + .prefer_busy_poll = ctx->napi_prefer_busy_poll, + .op_param = ctx->napi_track_mode + }; + struct io_uring_napi napi; + + if (ctx->flags & IORING_SETUP_IOPOLL) + return -EINVAL; + if (copy_from_user(&napi, arg, sizeof(napi))) + return -EFAULT; + if (napi.pad[0] || napi.pad[1] || napi.resv) + return -EINVAL; + + if (copy_to_user(arg, &curr, sizeof(curr))) + return -EFAULT; + + switch (napi.opcode) { + case IO_URING_NAPI_REGISTER_OP: + return io_napi_register_napi(ctx, &napi); + case IO_URING_NAPI_STATIC_ADD_ID: + if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC) + return -EINVAL; + return __io_napi_add_id(ctx, napi.op_param); + case IO_URING_NAPI_STATIC_DEL_ID: + if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC) + return -EINVAL; + return __io_napi_del_id(ctx, napi.op_param); + default: + return -EINVAL; + } +} + +/* + * io_unregister_napi() - Unregister napi with io-uring + * @ctx: pointer to io-uring context structure + * @arg: pointer to io_uring_napi structure + * + * Unregister napi. If arg has been specified copy the busy poll timeout and + * prefer busy poll setting to the passed in structure. + */ +int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg) +{ + const struct io_uring_napi curr = { + .busy_poll_to = ktime_to_us(ctx->napi_busy_poll_dt), + .prefer_busy_poll = ctx->napi_prefer_busy_poll + }; + + if (arg && copy_to_user(arg, &curr, sizeof(curr))) + return -EFAULT; + + WRITE_ONCE(ctx->napi_busy_poll_dt, 0); + WRITE_ONCE(ctx->napi_prefer_busy_poll, false); + WRITE_ONCE(ctx->napi_track_mode, IO_URING_NAPI_TRACKING_INACTIVE); + return 0; +} + +/* + * __io_napi_busy_loop() - execute busy poll loop + * @ctx: pointer to io-uring context structure + * @iowq: pointer to io wait queue + * + * Execute the busy poll loop and merge the spliced off list. 
+ */ +void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) +{ + if (ctx->flags & IORING_SETUP_SQPOLL) + return; + + iowq->napi_busy_poll_dt = READ_ONCE(ctx->napi_busy_poll_dt); + if (iowq->timeout != KTIME_MAX) { + ktime_t dt = ktime_sub(iowq->timeout, io_get_time(ctx)); + + iowq->napi_busy_poll_dt = min_t(u64, iowq->napi_busy_poll_dt, dt); + } + + iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll); + io_napi_blocking_busy_loop(ctx, iowq); +} + +/* + * io_napi_sqpoll_busy_poll() - busy poll loop for sqpoll + * @ctx: pointer to io-uring context structure + * + * Execute the napi busy poll loop over the napi list and prune stale entries. + */ +int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx) +{ + bool is_stale = false; + + if (!READ_ONCE(ctx->napi_busy_poll_dt)) + return 0; + if (list_empty_careful(&ctx->napi_list)) + return 0; + + scoped_guard(rcu) { + is_stale = __io_napi_do_busy_loop(ctx, NULL, NULL); + } + + io_napi_remove_stale(ctx, is_stale); + return 1; +} + +#endif diff --git a/io_uring/napi.h b/io_uring/napi.h new file mode 100644 index 000000000000..fa742f42e09b --- /dev/null +++ b/io_uring/napi.h @@ -0,0 +1,88 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef IOU_NAPI_H +#define IOU_NAPI_H + +#include <linux/kernel.h> +#include <linux/io_uring.h> +#include <net/busy_poll.h> + +#ifdef CONFIG_NET_RX_BUSY_POLL + +void io_napi_init(struct io_ring_ctx *ctx); +void io_napi_free(struct io_ring_ctx *ctx); + +int io_register_napi(struct io_ring_ctx *ctx, void __user *arg); +int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg); + +int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id); + +void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq); +int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx); + +static inline bool io_napi(struct io_ring_ctx *ctx) +{ + return !list_empty(&ctx->napi_list); +} + +static inline void io_napi_busy_loop(struct io_ring_ctx *ctx, + struct io_wait_queue 
*iowq) +{ + if (!io_napi(ctx)) + return; + __io_napi_busy_loop(ctx, iowq); +} + +/* + * io_napi_add() - Add napi id to the busy poll list + * @req: pointer to io_kiocb request + * + * Add the napi id of the socket to the napi busy poll list and hash table. + */ +static inline void io_napi_add(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + struct socket *sock; + + if (READ_ONCE(ctx->napi_track_mode) != IO_URING_NAPI_TRACKING_DYNAMIC) + return; + + sock = sock_from_file(req->file); + if (sock && sock->sk) + __io_napi_add_id(ctx, READ_ONCE(sock->sk->sk_napi_id)); +} + +#else + +static inline void io_napi_init(struct io_ring_ctx *ctx) +{ +} +static inline void io_napi_free(struct io_ring_ctx *ctx) +{ +} +static inline int io_register_napi(struct io_ring_ctx *ctx, void __user *arg) +{ + return -EOPNOTSUPP; +} +static inline int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg) +{ + return -EOPNOTSUPP; +} +static inline bool io_napi(struct io_ring_ctx *ctx) +{ + return false; +} +static inline void io_napi_add(struct io_kiocb *req) +{ +} +static inline void io_napi_busy_loop(struct io_ring_ctx *ctx, + struct io_wait_queue *iowq) +{ +} +static inline int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx) +{ + return 0; +} +#endif /* CONFIG_NET_RX_BUSY_POLL */ + +#endif diff --git a/io_uring/net.c b/io_uring/net.c index 161622029147..50e8a3ccc9de 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -28,6 +28,7 @@ struct io_accept { struct sockaddr __user *addr; int __user *addr_len; int flags; + int iou_flags; u32 file_slot; unsigned long nofile; }; @@ -50,6 +51,16 @@ struct io_connect { bool seen_econnaborted; }; +struct io_bind { + struct file *file; + int addr_len; +}; + +struct io_listen { + struct file *file; + int backlog; +}; + struct io_sr_msg { struct file *file; union { @@ -57,15 +68,14 @@ struct io_sr_msg { struct user_msghdr __user *umsg; void __user *buf; }; - unsigned len; + int len; unsigned done_io; unsigned msg_flags; unsigned 
nr_multishot_loops; u16 flags; /* initialised and used only by !msg send variants */ - u16 addr_len; u16 buf_group; - void __user *addr; + u16 buf_index; void __user *msg_control; /* used only for send zerocopy */ struct io_kiocb *notif; @@ -78,19 +88,6 @@ struct io_sr_msg { */ #define MULTISHOT_MAX_RETRY 32 -static inline bool io_check_multishot(struct io_kiocb *req, - unsigned int issue_flags) -{ - /* - * When ->locked_cq is set we only allow to post CQEs from the original - * task context. Usual request completions will be handled in other - * generic paths but multipoll may decide to post extra cqes. - */ - return !(issue_flags & IO_URING_F_IOWQ) || - !(issue_flags & IO_URING_F_MULTISHOT) || - !req->ctx->task_complete; -} - int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown); @@ -128,181 +125,402 @@ static bool io_net_retry(struct socket *sock, int flags) return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET; } +static void io_netmsg_iovec_free(struct io_async_msghdr *kmsg) +{ + if (kmsg->free_iov) { + kfree(kmsg->free_iov); + kmsg->free_iov_nr = 0; + kmsg->free_iov = NULL; + } +} + static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags) { struct io_async_msghdr *hdr = req->async_data; - if (!req_has_async_data(req) || issue_flags & IO_URING_F_UNLOCKED) + /* can't recycle, ensure we free the iovec if we have one */ + if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) { + io_netmsg_iovec_free(hdr); return; + } /* Let normal cleanup path reap it if we fail adding to the cache */ - if (io_alloc_cache_put(&req->ctx->netmsg_cache, &hdr->cache)) { + io_alloc_cache_kasan(&hdr->free_iov, &hdr->free_iov_nr); + if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr)) { req->async_data = NULL; - req->flags &= ~REQ_F_ASYNC_DATA; + req->flags &= ~(REQ_F_ASYNC_DATA|REQ_F_NEED_CLEANUP); } } -static struct io_async_msghdr *io_msg_alloc_async(struct 
io_kiocb *req, - unsigned int issue_flags) +static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - struct io_cache_entry *entry; struct io_async_msghdr *hdr; - if (!(issue_flags & IO_URING_F_UNLOCKED)) { - entry = io_alloc_cache_get(&ctx->netmsg_cache); - if (entry) { - hdr = container_of(entry, struct io_async_msghdr, cache); - hdr->free_iov = NULL; - req->flags |= REQ_F_ASYNC_DATA; - req->async_data = hdr; - return hdr; - } - } + hdr = io_uring_alloc_async_data(&ctx->netmsg_cache, req); + if (!hdr) + return NULL; - if (!io_alloc_async_data(req)) { - hdr = req->async_data; - hdr->free_iov = NULL; - return hdr; + /* If the async data was cached, we might have an iov cached inside. */ + if (hdr->free_iov) + req->flags |= REQ_F_NEED_CLEANUP; + return hdr; +} + +/* assign new iovec to kmsg, if we need to */ +static void io_net_vec_assign(struct io_kiocb *req, struct io_async_msghdr *kmsg, + struct iovec *iov) +{ + if (iov) { + req->flags |= REQ_F_NEED_CLEANUP; + kmsg->free_iov_nr = kmsg->msg.msg_iter.nr_segs; + if (kmsg->free_iov) + kfree(kmsg->free_iov); + kmsg->free_iov = iov; } - return NULL; } -static inline struct io_async_msghdr *io_msg_alloc_async_prep(struct io_kiocb *req) +static inline void io_mshot_prep_retry(struct io_kiocb *req, + struct io_async_msghdr *kmsg) { - /* ->prep_async is always called from the submission context */ - return io_msg_alloc_async(req, 0); + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + + req->flags &= ~REQ_F_BL_EMPTY; + sr->done_io = 0; + sr->len = 0; /* get from the provided buffer */ + req->buf_index = sr->buf_group; } -static int io_setup_async_msg(struct io_kiocb *req, - struct io_async_msghdr *kmsg, - unsigned int issue_flags) +#ifdef CONFIG_COMPAT +static int io_compat_msg_copy_hdr(struct io_kiocb *req, + struct io_async_msghdr *iomsg, + struct compat_msghdr *msg, int ddir) { - struct io_async_msghdr *async_msg; + struct io_sr_msg *sr = 
io_kiocb_to_cmd(req, struct io_sr_msg); + struct compat_iovec __user *uiov; + struct iovec *iov; + int ret, nr_segs; - if (req_has_async_data(req)) - return -EAGAIN; - async_msg = io_msg_alloc_async(req, issue_flags); - if (!async_msg) { - kfree(kmsg->free_iov); - return -ENOMEM; + if (iomsg->free_iov) { + nr_segs = iomsg->free_iov_nr; + iov = iomsg->free_iov; + } else { + iov = &iomsg->fast_iov; + nr_segs = 1; } - req->flags |= REQ_F_NEED_CLEANUP; - memcpy(async_msg, kmsg, sizeof(*kmsg)); - if (async_msg->msg.msg_name) - async_msg->msg.msg_name = &async_msg->addr; - if ((req->flags & REQ_F_BUFFER_SELECT) && !async_msg->msg.msg_iter.nr_segs) - return -EAGAIN; + if (copy_from_user(msg, sr->umsg_compat, sizeof(*msg))) + return -EFAULT; + + uiov = compat_ptr(msg->msg_iov); + if (req->flags & REQ_F_BUFFER_SELECT) { + compat_ssize_t clen; + + if (msg->msg_iovlen == 0) { + sr->len = iov->iov_len = 0; + iov->iov_base = NULL; + } else if (msg->msg_iovlen > 1) { + return -EINVAL; + } else { + if (!access_ok(uiov, sizeof(*uiov))) + return -EFAULT; + if (__get_user(clen, &uiov->iov_len)) + return -EFAULT; + if (clen < 0) + return -EINVAL; + sr->len = clen; + } - /* if were using fast_iov, set it to the new one */ - if (iter_is_iovec(&kmsg->msg.msg_iter) && !kmsg->free_iov) { - size_t fast_idx = iter_iov(&kmsg->msg.msg_iter) - kmsg->fast_iov; - async_msg->msg.msg_iter.__iov = &async_msg->fast_iov[fast_idx]; + return 0; } - return -EAGAIN; + ret = __import_iovec(ddir, (struct iovec __user *)uiov, msg->msg_iovlen, + nr_segs, &iov, &iomsg->msg.msg_iter, true); + if (unlikely(ret < 0)) + return ret; + + io_net_vec_assign(req, iomsg, iov); + return 0; +} +#endif + +static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, + struct user_msghdr *msg, int ddir) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + struct user_msghdr __user *umsg = sr->umsg; + struct iovec *iov; + int ret, nr_segs; + + if (iomsg->free_iov) { + nr_segs = 
iomsg->free_iov_nr; + iov = iomsg->free_iov; + } else { + iov = &iomsg->fast_iov; + nr_segs = 1; + } + + if (!user_access_begin(umsg, sizeof(*umsg))) + return -EFAULT; + + ret = -EFAULT; + unsafe_get_user(msg->msg_name, &umsg->msg_name, ua_end); + unsafe_get_user(msg->msg_namelen, &umsg->msg_namelen, ua_end); + unsafe_get_user(msg->msg_iov, &umsg->msg_iov, ua_end); + unsafe_get_user(msg->msg_iovlen, &umsg->msg_iovlen, ua_end); + unsafe_get_user(msg->msg_control, &umsg->msg_control, ua_end); + unsafe_get_user(msg->msg_controllen, &umsg->msg_controllen, ua_end); + msg->msg_flags = 0; + + if (req->flags & REQ_F_BUFFER_SELECT) { + if (msg->msg_iovlen == 0) { + sr->len = iov->iov_len = 0; + iov->iov_base = NULL; + } else if (msg->msg_iovlen > 1) { + ret = -EINVAL; + goto ua_end; + } else { + struct iovec __user *uiov = msg->msg_iov; + + /* we only need the length for provided buffers */ + if (!access_ok(&uiov->iov_len, sizeof(uiov->iov_len))) + goto ua_end; + unsafe_get_user(iov->iov_len, &uiov->iov_len, ua_end); + sr->len = iov->iov_len; + } + ret = 0; +ua_end: + user_access_end(); + return ret; + } + + user_access_end(); + ret = __import_iovec(ddir, msg->msg_iov, msg->msg_iovlen, nr_segs, + &iov, &iomsg->msg.msg_iter, false); + if (unlikely(ret < 0)) + return ret; + + io_net_vec_assign(req, iomsg, iov); + return 0; } static int io_sendmsg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + struct user_msghdr msg; int ret; iomsg->msg.msg_name = &iomsg->addr; - iomsg->free_iov = iomsg->fast_iov; - ret = sendmsg_copy_msghdr(&iomsg->msg, sr->umsg, sr->msg_flags, - &iomsg->free_iov); + iomsg->msg.msg_iter.nr_segs = 0; + +#ifdef CONFIG_COMPAT + if (unlikely(req->ctx->compat)) { + struct compat_msghdr cmsg; + + ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_SOURCE); + if (unlikely(ret)) + return ret; + + ret = __get_compat_msghdr(&iomsg->msg, &cmsg, NULL); + sr->msg_control = 
iomsg->msg.msg_control_user; + return ret; + } +#endif + + ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_SOURCE); + if (unlikely(ret)) + return ret; + + ret = __copy_msghdr(&iomsg->msg, &msg, NULL); + /* save msg_control as sys_sendmsg() overwrites it */ sr->msg_control = iomsg->msg.msg_control_user; return ret; } -int io_send_prep_async(struct io_kiocb *req) +void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req) { - struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr *io; - int ret; + struct io_async_msghdr *io = req->async_data; - if (!zc->addr || req_has_async_data(req)) - return 0; - io = io_msg_alloc_async_prep(req); - if (!io) - return -ENOMEM; - ret = move_addr_to_kernel(zc->addr, zc->addr_len, &io->addr); - return ret; + io_netmsg_iovec_free(io); } -static int io_setup_async_addr(struct io_kiocb *req, - struct sockaddr_storage *addr_storage, - unsigned int issue_flags) +static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr *io; + struct io_async_msghdr *kmsg = req->async_data; + void __user *addr; + u16 addr_len; + int ret; - if (!sr->addr || req_has_async_data(req)) - return -EAGAIN; - io = io_msg_alloc_async(req, issue_flags); - if (!io) - return -ENOMEM; - memcpy(&io->addr, addr_storage, sizeof(io->addr)); - return -EAGAIN; + sr->buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); + + if (READ_ONCE(sqe->__pad3[0])) + return -EINVAL; + + kmsg->msg.msg_name = NULL; + kmsg->msg.msg_namelen = 0; + kmsg->msg.msg_control = NULL; + kmsg->msg.msg_controllen = 0; + kmsg->msg.msg_ubuf = NULL; + + addr = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + addr_len = READ_ONCE(sqe->addr_len); + if (addr) { + ret = move_addr_to_kernel(addr, addr_len, &kmsg->addr); + if (unlikely(ret < 0)) + return ret; + kmsg->msg.msg_name = &kmsg->addr; + kmsg->msg.msg_namelen = addr_len; + } + if (!io_do_buffer_select(req)) { + ret = 
import_ubuf(ITER_SOURCE, sr->buf, sr->len, + &kmsg->msg.msg_iter); + if (unlikely(ret < 0)) + return ret; + } + return 0; } -int io_sendmsg_prep_async(struct io_kiocb *req) +static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + struct io_async_msghdr *kmsg = req->async_data; int ret; - if (!io_msg_alloc_async_prep(req)) - return -ENOMEM; - ret = io_sendmsg_copy_hdr(req, req->async_data); + sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + + ret = io_sendmsg_copy_hdr(req, kmsg); if (!ret) req->flags |= REQ_F_NEED_CLEANUP; return ret; } -void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req) -{ - struct io_async_msghdr *io = req->async_data; - - kfree(io->free_iov); -} +#define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE) int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - if (req->opcode == IORING_OP_SEND) { - if (READ_ONCE(sqe->__pad3[0])) + sr->done_io = 0; + + if (req->opcode != IORING_OP_SEND) { + if (sqe->addr2 || sqe->file_index) return -EINVAL; - sr->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - sr->addr_len = READ_ONCE(sqe->addr_len); - } else if (sqe->addr2 || sqe->file_index) { - return -EINVAL; } - sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); sr->flags = READ_ONCE(sqe->ioprio); - if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) + if (sr->flags & ~SENDMSG_FLAGS) return -EINVAL; sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; if (sr->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; + if (sr->flags & IORING_RECVSEND_BUNDLE) { + if (req->opcode == IORING_OP_SENDMSG) + return -EINVAL; + if (!(req->flags & REQ_F_BUFFER_SELECT)) + return -EINVAL; + sr->msg_flags |= MSG_WAITALL; + sr->buf_group = req->buf_index; + req->buf_list = NULL; + } #ifdef CONFIG_COMPAT if (req->ctx->compat) sr->msg_flags |= 
MSG_CMSG_COMPAT; #endif - sr->done_io = 0; - return 0; + if (unlikely(!io_msg_alloc_async(req))) + return -ENOMEM; + if (req->opcode != IORING_OP_SENDMSG) + return io_send_setup(req, sqe); + return io_sendmsg_setup(req, sqe); +} + +static void io_req_msg_cleanup(struct io_kiocb *req, + unsigned int issue_flags) +{ + io_netmsg_recycle(req, issue_flags); +} + +/* + * For bundle completions, we need to figure out how many segments we consumed. + * A bundle could be using a single ITER_UBUF if that's all we mapped, or it + * could be using an ITER_IOVEC. If the latter, then if we consumed all of + * the segments, then it's a trivial question to answer. If we have residual + * data in the iter, then loop the segments to figure out how much we + * transferred. + */ +static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret) +{ + struct iovec *iov; + int nbufs; + + /* no data is always zero segments, and a ubuf is always 1 segment */ + if (ret <= 0) + return 0; + if (iter_is_ubuf(&kmsg->msg.msg_iter)) + return 1; + + iov = kmsg->free_iov; + if (!iov) + iov = &kmsg->fast_iov; + + /* if all data was transferred, it's basic pointer math */ + if (!iov_iter_count(&kmsg->msg.msg_iter)) + return iter_iov(&kmsg->msg.msg_iter) - iov; + + /* short transfer, count segments */ + nbufs = 0; + do { + int this_len = min_t(int, iov[nbufs].iov_len, ret); + + nbufs++; + ret -= this_len; + } while (ret); + + return nbufs; +} + +static inline bool io_send_finish(struct io_kiocb *req, int *ret, + struct io_async_msghdr *kmsg, + unsigned issue_flags) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + bool bundle_finished = *ret <= 0; + unsigned int cflags; + + if (!(sr->flags & IORING_RECVSEND_BUNDLE)) { + cflags = io_put_kbuf(req, *ret, issue_flags); + goto finish; + } + + cflags = io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret), issue_flags); + + if (bundle_finished || req->flags & REQ_F_BL_EMPTY) + goto finish; + + /* + * Fill CQE for this receive and see if we 
should keep trying to + * receive from this socket. + */ + if (io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) { + io_mshot_prep_retry(req, kmsg); + return false; + } + + /* Otherwise stop bundle and use the current result. */ +finish: + io_req_set_res(req, *ret, cflags); + *ret = IOU_OK; + return true; } int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr iomsg, *kmsg; + struct io_async_msghdr *kmsg = req->async_data; struct socket *sock; unsigned flags; int min_ret = 0; @@ -312,19 +530,9 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(!sock)) return -ENOTSOCK; - if (req_has_async_data(req)) { - kmsg = req->async_data; - kmsg->msg.msg_control_user = sr->msg_control; - } else { - ret = io_sendmsg_copy_hdr(req, &iomsg); - if (ret) - return ret; - kmsg = &iomsg; - } - if (!(req->flags & REQ_F_POLLED) && (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return io_setup_async_msg(req, kmsg, issue_flags); + return -EAGAIN; flags = sr->msg_flags; if (issue_flags & IO_URING_F_NONBLOCK) @@ -332,27 +540,25 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) if (flags & MSG_WAITALL) min_ret = iov_iter_count(&kmsg->msg.msg_iter); + kmsg->msg.msg_control_user = sr->msg_control; + ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); if (ret < min_ret) { if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) - return io_setup_async_msg(req, kmsg, issue_flags); + return -EAGAIN; if (ret > 0 && io_net_retry(sock, flags)) { kmsg->msg.msg_controllen = 0; kmsg->msg.msg_control = NULL; sr->done_io += ret; - req->flags |= REQ_F_PARTIAL_IO; - return io_setup_async_msg(req, kmsg, issue_flags); + req->flags |= REQ_F_BL_NO_RECYCLE; + return -EAGAIN; } if (ret == -ERESTARTSYS) ret = -EINTR; req_set_fail(req); } - /* fast path, check for non-NULL to avoid function call */ - if (kmsg->free_iov) - kfree(kmsg->free_iov); - req->flags &= 
~REQ_F_NEED_CLEANUP; - io_netmsg_recycle(req, issue_flags); + io_req_msg_cleanup(req, issue_flags); if (ret >= 0) ret += sr->done_io; else if (sr->done_io) @@ -361,67 +567,103 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } -int io_send(struct io_kiocb *req, unsigned int issue_flags) +static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags, + struct io_async_msghdr *kmsg) { - struct sockaddr_storage __address; struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct msghdr msg; - struct socket *sock; - unsigned flags; - int min_ret = 0; + int ret; + struct buf_sel_arg arg = { + .iovs = &kmsg->fast_iov, + .max_len = min_not_zero(sr->len, INT_MAX), + .nr_iovs = 1, + }; - msg.msg_name = NULL; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_namelen = 0; - msg.msg_ubuf = NULL; + if (kmsg->free_iov) { + arg.nr_iovs = kmsg->free_iov_nr; + arg.iovs = kmsg->free_iov; + arg.mode = KBUF_MODE_FREE; + } - if (sr->addr) { - if (req_has_async_data(req)) { - struct io_async_msghdr *io = req->async_data; + if (!(sr->flags & IORING_RECVSEND_BUNDLE)) + arg.nr_iovs = 1; + else + arg.mode |= KBUF_MODE_EXPAND; - msg.msg_name = &io->addr; - } else { - ret = move_addr_to_kernel(sr->addr, sr->addr_len, &__address); - if (unlikely(ret < 0)) - return ret; - msg.msg_name = (struct sockaddr *)&__address; - } - msg.msg_namelen = sr->addr_len; + ret = io_buffers_select(req, &arg, issue_flags); + if (unlikely(ret < 0)) + return ret; + + if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) { + kmsg->free_iov_nr = ret; + kmsg->free_iov = arg.iovs; + req->flags |= REQ_F_NEED_CLEANUP; } + sr->len = arg.out_len; - if (!(req->flags & REQ_F_POLLED) && - (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return io_setup_async_addr(req, &__address, issue_flags); + if (ret == 1) { + sr->buf = arg.iovs[0].iov_base; + ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, + &kmsg->msg.msg_iter); + if (unlikely(ret)) + return 
ret; + } else { + iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE, + arg.iovs, ret, arg.out_len); + } + + return 0; +} + +int io_send(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + struct io_async_msghdr *kmsg = req->async_data; + struct socket *sock; + unsigned flags; + int min_ret = 0; + int ret; sock = sock_from_file(req->file); if (unlikely(!sock)) return -ENOTSOCK; - ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, &msg.msg_iter); - if (unlikely(ret)) - return ret; + if (!(req->flags & REQ_F_POLLED) && + (sr->flags & IORING_RECVSEND_POLL_FIRST)) + return -EAGAIN; flags = sr->msg_flags; if (issue_flags & IO_URING_F_NONBLOCK) flags |= MSG_DONTWAIT; - if (flags & MSG_WAITALL) - min_ret = iov_iter_count(&msg.msg_iter); + +retry_bundle: + if (io_do_buffer_select(req)) { + ret = io_send_select_buffer(req, issue_flags, kmsg); + if (ret) + return ret; + } + + /* + * If MSG_WAITALL is set, or this is a bundle send, then we need + * the full amount. If just bundle is set, if we do a short send + * then we complete the bundle sequence rather than continue on. 
+ */ + if (flags & MSG_WAITALL || sr->flags & IORING_RECVSEND_BUNDLE) + min_ret = iov_iter_count(&kmsg->msg.msg_iter); flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; - msg.msg_flags = flags; - ret = sock_sendmsg(sock, &msg); + kmsg->msg.msg_flags = flags; + ret = sock_sendmsg(sock, &kmsg->msg); if (ret < min_ret) { if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) - return io_setup_async_addr(req, &__address, issue_flags); + return -EAGAIN; if (ret > 0 && io_net_retry(sock, flags)) { sr->len -= ret; sr->buf += ret; sr->done_io += ret; - req->flags |= REQ_F_PARTIAL_IO; - return io_setup_async_addr(req, &__address, issue_flags); + req->flags |= REQ_F_BL_NO_RECYCLE; + return -EAGAIN; } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -431,178 +673,134 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) ret += sr->done_io; else if (sr->done_io) ret = sr->done_io; - io_req_set_res(req, ret, 0); - return IOU_OK; -} -static bool io_recvmsg_multishot_overflow(struct io_async_msghdr *iomsg) -{ - int hdr; + if (!io_send_finish(req, &ret, kmsg, issue_flags)) + goto retry_bundle; - if (iomsg->namelen < 0) - return true; - if (check_add_overflow((int)sizeof(struct io_uring_recvmsg_out), - iomsg->namelen, &hdr)) - return true; - if (check_add_overflow(hdr, (int)iomsg->controllen, &hdr)) - return true; - - return false; + io_req_msg_cleanup(req, issue_flags); + return ret; } -static int __io_recvmsg_copy_hdr(struct io_kiocb *req, - struct io_async_msghdr *iomsg) +static int io_recvmsg_mshot_prep(struct io_kiocb *req, + struct io_async_msghdr *iomsg, + int namelen, size_t controllen) { - struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct user_msghdr msg; - int ret; - - if (copy_from_user(&msg, sr->umsg, sizeof(*sr->umsg))) - return -EFAULT; - - ret = __copy_msghdr(&iomsg->msg, &msg, &iomsg->uaddr); - if (ret) - return ret; - - if (req->flags & REQ_F_BUFFER_SELECT) { - if (msg.msg_iovlen == 0) { - sr->len = iomsg->fast_iov[0].iov_len = 0; - 
iomsg->fast_iov[0].iov_base = NULL; - iomsg->free_iov = NULL; - } else if (msg.msg_iovlen > 1) { - return -EINVAL; - } else { - if (copy_from_user(iomsg->fast_iov, msg.msg_iov, sizeof(*msg.msg_iov))) - return -EFAULT; - sr->len = iomsg->fast_iov[0].iov_len; - iomsg->free_iov = NULL; - } - - if (req->flags & REQ_F_APOLL_MULTISHOT) { - iomsg->namelen = msg.msg_namelen; - iomsg->controllen = msg.msg_controllen; - if (io_recvmsg_multishot_overflow(iomsg)) - return -EOVERFLOW; - } - } else { - iomsg->free_iov = iomsg->fast_iov; - ret = __import_iovec(ITER_DEST, msg.msg_iov, msg.msg_iovlen, UIO_FASTIOV, - &iomsg->free_iov, &iomsg->msg.msg_iter, - false); - if (ret > 0) - ret = 0; + if ((req->flags & (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) == + (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) { + int hdr; + + if (unlikely(namelen < 0)) + return -EOVERFLOW; + if (check_add_overflow(sizeof(struct io_uring_recvmsg_out), + namelen, &hdr)) + return -EOVERFLOW; + if (check_add_overflow(hdr, controllen, &hdr)) + return -EOVERFLOW; + + iomsg->namelen = namelen; + iomsg->controllen = controllen; + return 0; } - return ret; + return 0; } -#ifdef CONFIG_COMPAT -static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, - struct io_async_msghdr *iomsg) +static int io_recvmsg_copy_hdr(struct io_kiocb *req, + struct io_async_msghdr *iomsg) { - struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct compat_msghdr msg; - struct compat_iovec __user *uiov; + struct user_msghdr msg; int ret; - if (copy_from_user(&msg, sr->umsg_compat, sizeof(msg))) - return -EFAULT; - - ret = __get_compat_msghdr(&iomsg->msg, &msg, &iomsg->uaddr); - if (ret) - return ret; + iomsg->msg.msg_name = &iomsg->addr; + iomsg->msg.msg_iter.nr_segs = 0; - uiov = compat_ptr(msg.msg_iov); - if (req->flags & REQ_F_BUFFER_SELECT) { - compat_ssize_t clen; +#ifdef CONFIG_COMPAT + if (unlikely(req->ctx->compat)) { + struct compat_msghdr cmsg; - iomsg->free_iov = NULL; - if (msg.msg_iovlen == 0) { - 
sr->len = 0; - } else if (msg.msg_iovlen > 1) { - return -EINVAL; - } else { - if (!access_ok(uiov, sizeof(*uiov))) - return -EFAULT; - if (__get_user(clen, &uiov->iov_len)) - return -EFAULT; - if (clen < 0) - return -EINVAL; - sr->len = clen; - } + ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_DEST); + if (unlikely(ret)) + return ret; - if (req->flags & REQ_F_APOLL_MULTISHOT) { - iomsg->namelen = msg.msg_namelen; - iomsg->controllen = msg.msg_controllen; - if (io_recvmsg_multishot_overflow(iomsg)) - return -EOVERFLOW; - } - } else { - iomsg->free_iov = iomsg->fast_iov; - ret = __import_iovec(ITER_DEST, (struct iovec __user *)uiov, msg.msg_iovlen, - UIO_FASTIOV, &iomsg->free_iov, - &iomsg->msg.msg_iter, true); - if (ret < 0) + ret = __get_compat_msghdr(&iomsg->msg, &cmsg, &iomsg->uaddr); + if (unlikely(ret)) return ret; - } - return 0; -} + return io_recvmsg_mshot_prep(req, iomsg, cmsg.msg_namelen, + cmsg.msg_controllen); + } #endif -static int io_recvmsg_copy_hdr(struct io_kiocb *req, - struct io_async_msghdr *iomsg) -{ - iomsg->msg.msg_name = &iomsg->addr; - iomsg->msg.msg_iter.nr_segs = 0; + ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST); + if (unlikely(ret)) + return ret; -#ifdef CONFIG_COMPAT - if (req->ctx->compat) - return __io_compat_recvmsg_copy_hdr(req, iomsg); -#endif + ret = __copy_msghdr(&iomsg->msg, &msg, &iomsg->uaddr); + if (unlikely(ret)) + return ret; - return __io_recvmsg_copy_hdr(req, iomsg); + return io_recvmsg_mshot_prep(req, iomsg, msg.msg_namelen, + msg.msg_controllen); } -int io_recvmsg_prep_async(struct io_kiocb *req) +static int io_recvmsg_prep_setup(struct io_kiocb *req) { + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + struct io_async_msghdr *kmsg; int ret; - if (!io_msg_alloc_async_prep(req)) + kmsg = io_msg_alloc_async(req); + if (unlikely(!kmsg)) return -ENOMEM; - ret = io_recvmsg_copy_hdr(req, req->async_data); + + if (req->opcode == IORING_OP_RECV) { + kmsg->msg.msg_name = NULL; + kmsg->msg.msg_namelen 
= 0; + kmsg->msg.msg_inq = 0; + kmsg->msg.msg_control = NULL; + kmsg->msg.msg_get_inq = 1; + kmsg->msg.msg_controllen = 0; + kmsg->msg.msg_iocb = NULL; + kmsg->msg.msg_ubuf = NULL; + + if (!io_do_buffer_select(req)) { + ret = import_ubuf(ITER_DEST, sr->buf, sr->len, + &kmsg->msg.msg_iter); + if (unlikely(ret)) + return ret; + } + return 0; + } + + ret = io_recvmsg_copy_hdr(req, kmsg); if (!ret) req->flags |= REQ_F_NEED_CLEANUP; return ret; } -#define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT) +#define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT | \ + IORING_RECVSEND_BUNDLE) int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + sr->done_io = 0; + if (unlikely(sqe->file_index || sqe->addr2)) return -EINVAL; sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); sr->flags = READ_ONCE(sqe->ioprio); - if (sr->flags & ~(RECVMSG_FLAGS)) + if (sr->flags & ~RECVMSG_FLAGS) return -EINVAL; sr->msg_flags = READ_ONCE(sqe->msg_flags); if (sr->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; if (sr->msg_flags & MSG_ERRQUEUE) req->flags |= REQ_F_CLEAR_POLLIN; - if (sr->flags & IORING_RECV_MULTISHOT) { - if (!(req->flags & REQ_F_BUFFER_SELECT)) - return -EINVAL; - if (sr->msg_flags & MSG_WAITALL) - return -EINVAL; - if (req->opcode == IORING_OP_RECV && sr->len) - return -EINVAL; - req->flags |= REQ_F_APOLL_MULTISHOT; + if (req->flags & REQ_F_BUFFER_SELECT) { /* * Store the buffer group for this multishot receive separately, * as if we end up doing an io-wq based issue that selects a @@ -612,24 +810,28 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) * restore it. 
*/ sr->buf_group = req->buf_index; + req->buf_list = NULL; + } + if (sr->flags & IORING_RECV_MULTISHOT) { + if (!(req->flags & REQ_F_BUFFER_SELECT)) + return -EINVAL; + if (sr->msg_flags & MSG_WAITALL) + return -EINVAL; + if (req->opcode == IORING_OP_RECV && sr->len) + return -EINVAL; + req->flags |= REQ_F_APOLL_MULTISHOT; + } + if (sr->flags & IORING_RECVSEND_BUNDLE) { + if (req->opcode == IORING_OP_RECVMSG) + return -EINVAL; } #ifdef CONFIG_COMPAT if (req->ctx->compat) sr->msg_flags |= MSG_CMSG_COMPAT; #endif - sr->done_io = 0; sr->nr_multishot_loops = 0; - return 0; -} - -static inline void io_recv_prep_retry(struct io_kiocb *req) -{ - struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - - sr->done_io = 0; - sr->len = 0; /* get from the provided buffer */ - req->buf_index = sr->buf_group; + return io_recvmsg_prep_setup(req); } /* @@ -639,36 +841,36 @@ static inline void io_recv_prep_retry(struct io_kiocb *req) * again (for multishot). */ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, - struct msghdr *msg, bool mshot_finished, - unsigned issue_flags) + struct io_async_msghdr *kmsg, + bool mshot_finished, unsigned issue_flags) { - unsigned int cflags; + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + unsigned int cflags = 0; - cflags = io_put_kbuf(req, issue_flags); - if (msg->msg_inq && msg->msg_inq != -1) + if (kmsg->msg.msg_inq > 0) cflags |= IORING_CQE_F_SOCK_NONEMPTY; - if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { - io_req_set_res(req, *ret, cflags); - *ret = IOU_OK; - return true; + if (sr->flags & IORING_RECVSEND_BUNDLE) { + cflags |= io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret), + issue_flags); + /* bundle with no more immediate buffers, we're done */ + if (req->flags & REQ_F_BL_EMPTY) + goto finish; + } else { + cflags |= io_put_kbuf(req, *ret, issue_flags); } - if (mshot_finished) - goto finish; - /* * Fill CQE for this receive and see if we should keep trying to * receive from this socket. 
*/ - if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER, - *ret, cflags | IORING_CQE_F_MORE)) { - struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + if ((req->flags & REQ_F_APOLL_MULTISHOT) && !mshot_finished && + io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) { int mshot_retry_ret = IOU_ISSUE_SKIP_COMPLETE; - io_recv_prep_retry(req); + io_mshot_prep_retry(req, kmsg); /* Known not-empty or unknown state, retry */ - if (cflags & IORING_CQE_F_SOCK_NONEMPTY || msg->msg_inq == -1) { + if (cflags & IORING_CQE_F_SOCK_NONEMPTY || kmsg->msg.msg_inq < 0) { if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY) return false; /* mshot retries exceeded, force a requeue */ @@ -681,7 +883,8 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, *ret = -EAGAIN; return true; } - /* Otherwise stop multishot but use the current result. */ + + /* Finish the request / stop multishot. */ finish: io_req_set_res(req, *ret, cflags); @@ -689,6 +892,7 @@ finish: *ret = IOU_STOP_MULTISHOT; else *ret = IOU_OK; + io_req_msg_cleanup(req, issue_flags); return true; } @@ -779,7 +983,7 @@ static int io_recvmsg_multishot(struct socket *sock, struct io_sr_msg *io, int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr iomsg, *kmsg; + struct io_async_msghdr *kmsg = req->async_data; struct socket *sock; unsigned flags; int ret, min_ret = 0; @@ -790,21 +994,13 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(!sock)) return -ENOTSOCK; - if (req_has_async_data(req)) { - kmsg = req->async_data; - } else { - ret = io_recvmsg_copy_hdr(req, &iomsg); - if (ret) - return ret; - kmsg = &iomsg; - } - if (!(req->flags & REQ_F_POLLED) && (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return io_setup_async_msg(req, kmsg, issue_flags); + return -EAGAIN; - if (!io_check_multishot(req, issue_flags)) - return io_setup_async_msg(req, kmsg, 
issue_flags); + flags = sr->msg_flags; + if (force_nonblock) + flags |= MSG_DONTWAIT; retry_multishot: if (io_do_buffer_select(req)) { @@ -826,10 +1022,6 @@ retry_multishot: iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_DEST, buf, len); } - flags = sr->msg_flags; - if (force_nonblock) - flags |= MSG_DONTWAIT; - kmsg->msg.msg_get_inq = 1; kmsg->msg.msg_inq = -1; if (req->flags & REQ_F_APOLL_MULTISHOT) { @@ -846,17 +1038,16 @@ retry_multishot: if (ret < min_ret) { if (ret == -EAGAIN && force_nonblock) { - ret = io_setup_async_msg(req, kmsg, issue_flags); - if (ret == -EAGAIN && (issue_flags & IO_URING_F_MULTISHOT)) { + if (issue_flags & IO_URING_F_MULTISHOT) { io_kbuf_recycle(req, issue_flags); return IOU_ISSUE_SKIP_COMPLETE; } - return ret; + return -EAGAIN; } if (ret > 0 && io_net_retry(sock, flags)) { sr->done_io += ret; - req->flags |= REQ_F_PARTIAL_IO; - return io_setup_async_msg(req, kmsg, issue_flags); + req->flags |= REQ_F_BL_NO_RECYCLE; + return -EAGAIN; } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -872,74 +1063,116 @@ retry_multishot: else io_kbuf_recycle(req, issue_flags); - if (!io_recv_finish(req, &ret, &kmsg->msg, mshot_finished, issue_flags)) + if (!io_recv_finish(req, &ret, kmsg, mshot_finished, issue_flags)) goto retry_multishot; - if (mshot_finished) { - /* fast path, check for non-NULL to avoid function call */ - if (kmsg->free_iov) - kfree(kmsg->free_iov); - io_netmsg_recycle(req, issue_flags); - req->flags &= ~REQ_F_NEED_CLEANUP; + return ret; +} + +static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg, + size_t *len, unsigned int issue_flags) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + int ret; + + /* + * If the ring isn't locked, then don't use the peek interface + * to grab multiple buffers as we will lock/unlock between + * this selection and posting the buffers. 
+ */ + if (!(issue_flags & IO_URING_F_UNLOCKED) && + sr->flags & IORING_RECVSEND_BUNDLE) { + struct buf_sel_arg arg = { + .iovs = &kmsg->fast_iov, + .nr_iovs = 1, + .mode = KBUF_MODE_EXPAND, + }; + + if (kmsg->free_iov) { + arg.nr_iovs = kmsg->free_iov_nr; + arg.iovs = kmsg->free_iov; + arg.mode |= KBUF_MODE_FREE; + } + + if (kmsg->msg.msg_inq > 0) + arg.max_len = min_not_zero(sr->len, kmsg->msg.msg_inq); + + ret = io_buffers_peek(req, &arg); + if (unlikely(ret < 0)) + return ret; + + /* special case 1 vec, can be a fast path */ + if (ret == 1) { + sr->buf = arg.iovs[0].iov_base; + sr->len = arg.iovs[0].iov_len; + goto map_ubuf; + } + iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret, + arg.out_len); + if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) { + kmsg->free_iov_nr = ret; + kmsg->free_iov = arg.iovs; + req->flags |= REQ_F_NEED_CLEANUP; + } + } else { + void __user *buf; + + *len = sr->len; + buf = io_buffer_select(req, len, issue_flags); + if (!buf) + return -ENOBUFS; + sr->buf = buf; + sr->len = *len; +map_ubuf: + ret = import_ubuf(ITER_DEST, sr->buf, sr->len, + &kmsg->msg.msg_iter); + if (unlikely(ret)) + return ret; } - return ret; + return 0; } int io_recv(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct msghdr msg; + struct io_async_msghdr *kmsg = req->async_data; struct socket *sock; unsigned flags; int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; size_t len = sr->len; + bool mshot_finished; if (!(req->flags & REQ_F_POLLED) && (sr->flags & IORING_RECVSEND_POLL_FIRST)) return -EAGAIN; - if (!io_check_multishot(req, issue_flags)) - return -EAGAIN; - sock = sock_from_file(req->file); if (unlikely(!sock)) return -ENOTSOCK; - msg.msg_name = NULL; - msg.msg_namelen = 0; - msg.msg_control = NULL; - msg.msg_get_inq = 1; - msg.msg_controllen = 0; - msg.msg_iocb = NULL; - msg.msg_ubuf = NULL; + flags = sr->msg_flags; + if 
(force_nonblock) + flags |= MSG_DONTWAIT; retry_multishot: if (io_do_buffer_select(req)) { - void __user *buf; - - buf = io_buffer_select(req, &len, issue_flags); - if (!buf) - return -ENOBUFS; - sr->buf = buf; - sr->len = len; + ret = io_recv_buf_select(req, kmsg, &len, issue_flags); + if (unlikely(ret)) { + kmsg->msg.msg_inq = -1; + goto out_free; + } + sr->buf = NULL; } - ret = import_ubuf(ITER_DEST, sr->buf, len, &msg.msg_iter); - if (unlikely(ret)) - goto out_free; - - msg.msg_inq = -1; - msg.msg_flags = 0; + kmsg->msg.msg_flags = 0; + kmsg->msg.msg_inq = -1; - flags = sr->msg_flags; - if (force_nonblock) - flags |= MSG_DONTWAIT; if (flags & MSG_WAITALL) - min_ret = iov_iter_count(&msg.msg_iter); + min_ret = iov_iter_count(&kmsg->msg.msg_iter); - ret = sock_recvmsg(sock, &msg, flags); + ret = sock_recvmsg(sock, &kmsg->msg, flags); if (ret < min_ret) { if (ret == -EAGAIN && force_nonblock) { if (issue_flags & IO_URING_F_MULTISHOT) { @@ -953,17 +1186,18 @@ retry_multishot: sr->len -= ret; sr->buf += ret; sr->done_io += ret; - req->flags |= REQ_F_PARTIAL_IO; + req->flags |= REQ_F_BL_NO_RECYCLE; return -EAGAIN; } if (ret == -ERESTARTSYS) ret = -EINTR; req_set_fail(req); - } else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { + } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { out_free: req_set_fail(req); } + mshot_finished = ret <= 0; if (ret > 0) ret += sr->done_io; else if (sr->done_io) @@ -971,7 +1205,7 @@ out_free: else io_kbuf_recycle(req, issue_flags); - if (!io_recv_finish(req, &ret, &msg, ret <= 0, issue_flags)) + if (!io_recv_finish(req, &ret, kmsg, mshot_finished, issue_flags)) goto retry_multishot; return ret; @@ -980,14 +1214,10 @@ out_free: void io_send_zc_cleanup(struct io_kiocb *req) { struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr *io; + struct io_async_msghdr *io = req->async_data; - if (req_has_async_data(req)) { - io = req->async_data; - /* 
might be ->fast_iov if *msg_copy_hdr failed */ - if (io->free_iov != io->fast_iov) - kfree(io->free_iov); - } + if (req_has_async_data(req)) + io_netmsg_iovec_free(io); if (zc->notif) { io_notif_flush(zc->notif); zc->notif = NULL; @@ -1003,6 +1233,9 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *notif; + zc->done_io = 0; + req->flags |= REQ_F_POLL_NO_LAZY; + if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3))) return -EINVAL; /* we don't support IOSQE_CQE_SKIP_SUCCESS just yet */ @@ -1022,56 +1255,46 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (zc->flags & ~IO_ZC_FLAGS_VALID) return -EINVAL; if (zc->flags & IORING_SEND_ZC_REPORT_USAGE) { - io_notif_set_extended(notif); - io_notif_to_data(notif)->zc_report = true; - } - } - - if (zc->flags & IORING_RECVSEND_FIXED_BUF) { - unsigned idx = READ_ONCE(sqe->buf_index); + struct io_notif_data *nd = io_notif_to_data(notif); - if (unlikely(idx >= ctx->nr_user_bufs)) - return -EFAULT; - idx = array_index_nospec(idx, ctx->nr_user_bufs); - req->imu = READ_ONCE(ctx->user_bufs[idx]); - io_req_set_rsrc_node(notif, ctx, 0); + nd->zc_report = true; + nd->zc_used = false; + nd->zc_copied = false; + } } - if (req->opcode == IORING_OP_SEND_ZC) { - if (READ_ONCE(sqe->__pad3[0])) - return -EINVAL; - zc->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - zc->addr_len = READ_ONCE(sqe->addr_len); - } else { + if (req->opcode != IORING_OP_SEND_ZC) { if (unlikely(sqe->addr2 || sqe->file_index)) return -EINVAL; if (unlikely(zc->flags & IORING_RECVSEND_FIXED_BUF)) return -EINVAL; } - zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); zc->len = READ_ONCE(sqe->len); - zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; + zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL | MSG_ZEROCOPY; + zc->buf_index = READ_ONCE(sqe->buf_index); if (zc->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; - zc->done_io = 
0; - #ifdef CONFIG_COMPAT if (req->ctx->compat) zc->msg_flags |= MSG_CMSG_COMPAT; #endif - return 0; + if (unlikely(!io_msg_alloc_async(req))) + return -ENOMEM; + if (req->opcode != IORING_OP_SENDMSG_ZC) + return io_send_setup(req, sqe); + return io_sendmsg_setup(req, sqe); } -static int io_sg_from_iter_iovec(struct sock *sk, struct sk_buff *skb, +static int io_sg_from_iter_iovec(struct sk_buff *skb, struct iov_iter *from, size_t length) { skb_zcopy_downgrade_managed(skb); - return __zerocopy_sg_from_iter(NULL, sk, skb, from, length); + return zerocopy_fill_skb_from_iter(skb, from, length); } -static int io_sg_from_iter(struct sock *sk, struct sk_buff *skb, +static int io_sg_from_iter(struct sk_buff *skb, struct iov_iter *from, size_t length) { struct skb_shared_info *shinfo = skb_shinfo(skb); @@ -1084,7 +1307,7 @@ static int io_sg_from_iter(struct sock *sk, struct sk_buff *skb, if (!frag) shinfo->flags |= SKBFL_MANAGED_FRAG_REFS; else if (unlikely(!skb_zcopy_managed(skb))) - return __zerocopy_sg_from_iter(NULL, sk, skb, from, length); + return zerocopy_fill_skb_from_iter(skb, from, length); bi.bi_size = min(from->count, length); bi.bi_bvec_done = from->iov_offset; @@ -1111,22 +1334,54 @@ static int io_sg_from_iter(struct sock *sk, struct sk_buff *skb, skb->data_len += copied; skb->len += copied; skb->truesize += truesize; + return ret; +} - if (sk && sk->sk_type == SOCK_STREAM) { - sk_wmem_queued_add(sk, truesize); - if (!skb_zcopy_pure(skb)) - sk_mem_charge(sk, truesize); +static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + struct io_async_msghdr *kmsg = req->async_data; + int ret; + + if (sr->flags & IORING_RECVSEND_FIXED_BUF) { + struct io_ring_ctx *ctx = req->ctx; + struct io_rsrc_node *node; + + ret = -EFAULT; + io_ring_submit_lock(ctx, issue_flags); + node = io_rsrc_node_lookup(&ctx->buf_table, sr->buf_index); + if (node) { + io_req_assign_buf_node(sr->notif, 
node); + ret = 0; + } + io_ring_submit_unlock(ctx, issue_flags); + + if (unlikely(ret)) + return ret; + + ret = io_import_fixed(ITER_SOURCE, &kmsg->msg.msg_iter, + node->buf, (u64)(uintptr_t)sr->buf, + sr->len); + if (unlikely(ret)) + return ret; + kmsg->msg.sg_from_iter = io_sg_from_iter; } else { - refcount_add(truesize, &skb->sk->sk_wmem_alloc); + ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter); + if (unlikely(ret)) + return ret; + ret = io_notif_account_mem(sr->notif, sr->len); + if (unlikely(ret)) + return ret; + kmsg->msg.sg_from_iter = io_sg_from_iter_iovec; } + return ret; } int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) { - struct sockaddr_storage __address; struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); - struct msghdr msg; + struct io_async_msghdr *kmsg = req->async_data; struct socket *sock; unsigned msg_flags; int ret, min_ret = 0; @@ -1137,67 +1392,37 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags)) return -EOPNOTSUPP; - msg.msg_name = NULL; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_namelen = 0; - - if (zc->addr) { - if (req_has_async_data(req)) { - struct io_async_msghdr *io = req->async_data; - - msg.msg_name = &io->addr; - } else { - ret = move_addr_to_kernel(zc->addr, zc->addr_len, &__address); - if (unlikely(ret < 0)) - return ret; - msg.msg_name = (struct sockaddr *)&__address; - } - msg.msg_namelen = zc->addr_len; - } - if (!(req->flags & REQ_F_POLLED) && (zc->flags & IORING_RECVSEND_POLL_FIRST)) - return io_setup_async_addr(req, &__address, issue_flags); + return -EAGAIN; - if (zc->flags & IORING_RECVSEND_FIXED_BUF) { - ret = io_import_fixed(ITER_SOURCE, &msg.msg_iter, req->imu, - (u64)(uintptr_t)zc->buf, zc->len); - if (unlikely(ret)) - return ret; - msg.sg_from_iter = io_sg_from_iter; - } else { - io_notif_set_extended(zc->notif); - ret = import_ubuf(ITER_SOURCE, zc->buf, zc->len, &msg.msg_iter); + if 
(!zc->done_io) { + ret = io_send_zc_import(req, issue_flags); if (unlikely(ret)) return ret; - ret = io_notif_account_mem(zc->notif, zc->len); - if (unlikely(ret)) - return ret; - msg.sg_from_iter = io_sg_from_iter_iovec; } - msg_flags = zc->msg_flags | MSG_ZEROCOPY; + msg_flags = zc->msg_flags; if (issue_flags & IO_URING_F_NONBLOCK) msg_flags |= MSG_DONTWAIT; if (msg_flags & MSG_WAITALL) - min_ret = iov_iter_count(&msg.msg_iter); + min_ret = iov_iter_count(&kmsg->msg.msg_iter); msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; - msg.msg_flags = msg_flags; - msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg; - ret = sock_sendmsg(sock, &msg); + kmsg->msg.msg_flags = msg_flags; + kmsg->msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg; + ret = sock_sendmsg(sock, &kmsg->msg); if (unlikely(ret < min_ret)) { if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) - return io_setup_async_addr(req, &__address, issue_flags); + return -EAGAIN; - if (ret > 0 && io_net_retry(sock, msg.msg_flags)) { + if (ret > 0 && io_net_retry(sock, kmsg->msg.msg_flags)) { zc->len -= ret; zc->buf += ret; zc->done_io += ret; - req->flags |= REQ_F_PARTIAL_IO; - return io_setup_async_addr(req, &__address, issue_flags); + req->flags |= REQ_F_BL_NO_RECYCLE; + return -EAGAIN; } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -1215,7 +1440,8 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) */ if (!(issue_flags & IO_URING_F_UNLOCKED)) { io_notif_flush(zc->notif); - req->flags &= ~REQ_F_NEED_CLEANUP; + zc->notif = NULL; + io_req_msg_cleanup(req, 0); } io_req_set_res(req, ret, IORING_CQE_F_MORE); return IOU_OK; @@ -1224,62 +1450,46 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr iomsg, *kmsg; + struct io_async_msghdr *kmsg = req->async_data; struct socket *sock; unsigned flags; int ret, min_ret = 0; - 
io_notif_set_extended(sr->notif); - sock = sock_from_file(req->file); if (unlikely(!sock)) return -ENOTSOCK; if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags)) return -EOPNOTSUPP; - if (req_has_async_data(req)) { - kmsg = req->async_data; - } else { - ret = io_sendmsg_copy_hdr(req, &iomsg); - if (ret) - return ret; - kmsg = &iomsg; - } - if (!(req->flags & REQ_F_POLLED) && (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return io_setup_async_msg(req, kmsg, issue_flags); + return -EAGAIN; - flags = sr->msg_flags | MSG_ZEROCOPY; + flags = sr->msg_flags; if (issue_flags & IO_URING_F_NONBLOCK) flags |= MSG_DONTWAIT; if (flags & MSG_WAITALL) min_ret = iov_iter_count(&kmsg->msg.msg_iter); + kmsg->msg.msg_control_user = sr->msg_control; kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg; kmsg->msg.sg_from_iter = io_sg_from_iter_iovec; ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); if (unlikely(ret < min_ret)) { if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) - return io_setup_async_msg(req, kmsg, issue_flags); + return -EAGAIN; if (ret > 0 && io_net_retry(sock, flags)) { sr->done_io += ret; - req->flags |= REQ_F_PARTIAL_IO; - return io_setup_async_msg(req, kmsg, issue_flags); + req->flags |= REQ_F_BL_NO_RECYCLE; + return -EAGAIN; } if (ret == -ERESTARTSYS) ret = -EINTR; req_set_fail(req); } - /* fast path, check for non-NULL to avoid function call */ - if (kmsg->free_iov) { - kfree(kmsg->free_iov); - kmsg->free_iov = NULL; - } - io_netmsg_recycle(req, issue_flags); if (ret >= 0) ret += sr->done_io; else if (sr->done_io) @@ -1291,7 +1501,8 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) */ if (!(issue_flags & IO_URING_F_UNLOCKED)) { io_notif_flush(sr->notif); - req->flags &= ~REQ_F_NEED_CLEANUP; + sr->notif = NULL; + io_req_msg_cleanup(req, 0); } io_req_set_res(req, ret, IORING_CQE_F_MORE); return IOU_OK; @@ -1301,7 +1512,7 @@ void io_sendrecv_fail(struct io_kiocb *req) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - if 
(req->flags & REQ_F_PARTIAL_IO) + if (sr->done_io) req->cqe.res = sr->done_io; if ((req->flags & REQ_F_NEED_CLEANUP) && @@ -1309,10 +1520,12 @@ void io_sendrecv_fail(struct io_kiocb *req) req->cqe.flags |= IORING_CQE_F_MORE; } +#define ACCEPT_FLAGS (IORING_ACCEPT_MULTISHOT | IORING_ACCEPT_DONTWAIT | \ + IORING_ACCEPT_POLL_FIRST) + int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept); - unsigned flags; if (sqe->len || sqe->buf_index) return -EINVAL; @@ -1321,15 +1534,15 @@ int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); accept->flags = READ_ONCE(sqe->accept_flags); accept->nofile = rlimit(RLIMIT_NOFILE); - flags = READ_ONCE(sqe->ioprio); - if (flags & ~IORING_ACCEPT_MULTISHOT) + accept->iou_flags = READ_ONCE(sqe->ioprio); + if (accept->iou_flags & ~ACCEPT_FLAGS) return -EINVAL; accept->file_slot = READ_ONCE(sqe->file_index); if (accept->file_slot) { if (accept->flags & SOCK_CLOEXEC) return -EINVAL; - if (flags & IORING_ACCEPT_MULTISHOT && + if (accept->iou_flags & IORING_ACCEPT_MULTISHOT && accept->file_slot != IORING_FILE_INDEX_ALLOC) return -EINVAL; } @@ -1337,8 +1550,10 @@ int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK)) accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK; - if (flags & IORING_ACCEPT_MULTISHOT) + if (accept->iou_flags & IORING_ACCEPT_MULTISHOT) req->flags |= REQ_F_APOLL_MULTISHOT; + if (accept->iou_flags & IORING_ACCEPT_DONTWAIT) + req->flags |= REQ_F_NOWAIT; return 0; } @@ -1346,26 +1561,34 @@ int io_accept(struct io_kiocb *req, unsigned int issue_flags) { struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept); bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - unsigned int file_flags = force_nonblock ? 
O_NONBLOCK : 0; bool fixed = !!accept->file_slot; + struct proto_accept_arg arg = { + .flags = force_nonblock ? O_NONBLOCK : 0, + }; struct file *file; + unsigned cflags; int ret, fd; - if (!io_check_multishot(req, issue_flags)) + if (!(req->flags & REQ_F_POLLED) && + accept->iou_flags & IORING_ACCEPT_POLL_FIRST) return -EAGAIN; + retry: if (!fixed) { fd = __get_unused_fd_flags(accept->flags, accept->nofile); if (unlikely(fd < 0)) return fd; } - file = do_accept(req->file, file_flags, accept->addr, accept->addr_len, + arg.err = 0; + arg.is_empty = -1; + file = do_accept(req->file, &arg, accept->addr, accept->addr_len, accept->flags); if (IS_ERR(file)) { if (!fixed) put_unused_fd(fd); ret = PTR_ERR(file); - if (ret == -EAGAIN && force_nonblock) { + if (ret == -EAGAIN && force_nonblock && + !(accept->iou_flags & IORING_ACCEPT_DONTWAIT)) { /* * if it's multishot and polled, we don't need to * return EAGAIN to arm the poll infra since it @@ -1386,18 +1609,26 @@ retry: accept->file_slot); } + cflags = 0; + if (!arg.is_empty) + cflags |= IORING_CQE_F_SOCK_NONEMPTY; + if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { - io_req_set_res(req, ret, 0); + io_req_set_res(req, ret, cflags); return IOU_OK; } if (ret < 0) return ret; - if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER, - ret, IORING_CQE_F_MORE)) - goto retry; + if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) { + if (cflags & IORING_CQE_F_SOCK_NONEMPTY || arg.is_empty == -1) + goto retry; + if (issue_flags & IO_URING_F_MULTISHOT) + return IOU_ISSUE_SKIP_COMPLETE; + return -EAGAIN; + } - io_req_set_res(req, ret, 0); + io_req_set_res(req, ret, cflags); return IOU_STOP_MULTISHOT; } @@ -1455,17 +1686,10 @@ int io_socket(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } -int io_connect_prep_async(struct io_kiocb *req) -{ - struct io_async_connect *io = req->async_data; - struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect); - - return move_addr_to_kernel(conn->addr, 
conn->addr_len, &io->address); -} - int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect); + struct io_async_msghdr *io; if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; @@ -1473,32 +1697,31 @@ int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); conn->addr_len = READ_ONCE(sqe->addr2); conn->in_progress = conn->seen_econnaborted = false; - return 0; + + io = io_msg_alloc_async(req); + if (unlikely(!io)) + return -ENOMEM; + + return move_addr_to_kernel(conn->addr, conn->addr_len, &io->addr); } int io_connect(struct io_kiocb *req, unsigned int issue_flags) { struct io_connect *connect = io_kiocb_to_cmd(req, struct io_connect); - struct io_async_connect __io, *io; + struct io_async_msghdr *io = req->async_data; unsigned file_flags; int ret; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - if (req_has_async_data(req)) { - io = req->async_data; - } else { - ret = move_addr_to_kernel(connect->addr, - connect->addr_len, - &__io.address); - if (ret) - goto out; - io = &__io; + if (unlikely(req->flags & REQ_F_FAIL)) { + ret = -ECONNRESET; + goto out; } file_flags = force_nonblock ? 
O_NONBLOCK : 0; - ret = __sys_connect_file(req->file, &io->address, - connect->addr_len, file_flags); + ret = __sys_connect_file(req->file, &io->addr, connect->addr_len, + file_flags); if ((ret == -EAGAIN || ret == -EINPROGRESS || ret == -ECONNABORTED) && force_nonblock) { if (ret == -EINPROGRESS) { @@ -1508,13 +1731,6 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags) goto out; connect->seen_econnaborted = true; } - if (req_has_async_data(req)) - return -EAGAIN; - if (io_alloc_async_data(req)) { - ret = -ENOMEM; - goto out; - } - memcpy(req->async_data, &__io, sizeof(__io)); return -EAGAIN; } if (connect->in_progress) { @@ -1532,12 +1748,81 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags) out: if (ret < 0) req_set_fail(req); + io_req_msg_cleanup(req, issue_flags); io_req_set_res(req, ret, 0); return IOU_OK; } -void io_netmsg_cache_free(struct io_cache_entry *entry) +int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind); + struct sockaddr __user *uaddr; + struct io_async_msghdr *io; + + if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) + return -EINVAL; + + uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr)); + bind->addr_len = READ_ONCE(sqe->addr2); + + io = io_msg_alloc_async(req); + if (unlikely(!io)) + return -ENOMEM; + return move_addr_to_kernel(uaddr, bind->addr_len, &io->addr); +} + +int io_bind(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind); + struct io_async_msghdr *io = req->async_data; + struct socket *sock; + int ret; + + sock = sock_from_file(req->file); + if (unlikely(!sock)) + return -ENOTSOCK; + + ret = __sys_bind_socket(sock, &io->addr, bind->addr_len); + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return 0; +} + +int io_listen_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_listen *listen = 
io_kiocb_to_cmd(req, struct io_listen); + + if (sqe->addr || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in || sqe->addr2) + return -EINVAL; + + listen->backlog = READ_ONCE(sqe->len); + return 0; +} + +int io_listen(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen); + struct socket *sock; + int ret; + + sock = sock_from_file(req->file); + if (unlikely(!sock)) + return -ENOTSOCK; + + ret = __sys_listen_socket(sock, listen->backlog); + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return 0; +} + +void io_netmsg_cache_free(const void *entry) { - kfree(container_of(entry, struct io_async_msghdr, cache)); + struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry; + + if (kmsg->free_iov) + io_netmsg_iovec_free(kmsg); + kfree(kmsg); } #endif diff --git a/io_uring/net.h b/io_uring/net.h index 191009979bcb..b804c2b36e60 100644 --- a/io_uring/net.h +++ b/io_uring/net.h @@ -3,46 +3,36 @@ #include <linux/net.h> #include <linux/uio.h> -#include "alloc_cache.h" - struct io_async_msghdr { #if defined(CONFIG_NET) - union { - struct iovec fast_iov[UIO_FASTIOV]; - struct { - struct iovec fast_iov_one; - __kernel_size_t controllen; - int namelen; - __kernel_size_t payloadlen; - }; - struct io_cache_entry cache; - }; - /* points to an allocated iov, if NULL we use fast_iov instead */ struct iovec *free_iov; - struct sockaddr __user *uaddr; - struct msghdr msg; - struct sockaddr_storage addr; + /* points to an allocated iov, if NULL we use fast_iov instead */ + int free_iov_nr; + struct_group(clear, + int namelen; + struct iovec fast_iov; + __kernel_size_t controllen; + __kernel_size_t payloadlen; + struct sockaddr __user *uaddr; + struct msghdr msg; + struct sockaddr_storage addr; + ); +#else + struct_group(clear); #endif }; #if defined(CONFIG_NET) -struct io_async_connect { - struct sockaddr_storage address; -}; - int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe 
*sqe); int io_shutdown(struct io_kiocb *req, unsigned int issue_flags); -int io_sendmsg_prep_async(struct io_kiocb *req); void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req); int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags); int io_send(struct io_kiocb *req, unsigned int issue_flags); -int io_send_prep_async(struct io_kiocb *req); -int io_recvmsg_prep_async(struct io_kiocb *req); int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags); int io_recv(struct io_kiocb *req, unsigned int issue_flags); @@ -55,7 +45,6 @@ int io_accept(struct io_kiocb *req, unsigned int issue_flags); int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_socket(struct io_kiocb *req, unsigned int issue_flags); -int io_connect_prep_async(struct io_kiocb *req); int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_connect(struct io_kiocb *req, unsigned int issue_flags); @@ -64,9 +53,15 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags); int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); void io_send_zc_cleanup(struct io_kiocb *req); -void io_netmsg_cache_free(struct io_cache_entry *entry); +int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_bind(struct io_kiocb *req, unsigned int issue_flags); + +int io_listen_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_listen(struct io_kiocb *req, unsigned int issue_flags); + +void io_netmsg_cache_free(const void *entry); #else -static inline void io_netmsg_cache_free(struct io_cache_entry *entry) +static inline void io_netmsg_cache_free(const void *entry) { } #endif diff --git a/io_uring/nop.c b/io_uring/nop.c index d956599a3c1b..5e5196df650a 100644 --- a/io_uring/nop.c +++ b/io_uring/nop.c @@ -8,18 +8,77 @@ #include <uapi/linux/io_uring.h> 
#include "io_uring.h" +#include "rsrc.h" #include "nop.h" +struct io_nop { + /* NOTE: kiocb has the file as the first member, so don't do it here */ + struct file *file; + int result; + int fd; + int buffer; + unsigned int flags; +}; + +#define NOP_FLAGS (IORING_NOP_INJECT_RESULT | IORING_NOP_FIXED_FILE | \ + IORING_NOP_FIXED_BUFFER | IORING_NOP_FILE) + int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_nop *nop = io_kiocb_to_cmd(req, struct io_nop); + + nop->flags = READ_ONCE(sqe->nop_flags); + if (nop->flags & ~NOP_FLAGS) + return -EINVAL; + + if (nop->flags & IORING_NOP_INJECT_RESULT) + nop->result = READ_ONCE(sqe->len); + else + nop->result = 0; + if (nop->flags & IORING_NOP_FILE) + nop->fd = READ_ONCE(sqe->fd); + else + nop->fd = -1; + if (nop->flags & IORING_NOP_FIXED_BUFFER) + nop->buffer = READ_ONCE(sqe->buf_index); + else + nop->buffer = -1; return 0; } -/* - * IORING_OP_NOP just posts a completion event, nothing else. - */ int io_nop(struct io_kiocb *req, unsigned int issue_flags) { - io_req_set_res(req, 0, 0); + struct io_nop *nop = io_kiocb_to_cmd(req, struct io_nop); + int ret = nop->result; + + if (nop->flags & IORING_NOP_FILE) { + if (nop->flags & IORING_NOP_FIXED_FILE) { + req->file = io_file_get_fixed(req, nop->fd, issue_flags); + req->flags |= REQ_F_FIXED_FILE; + } else { + req->file = io_file_get_normal(req, nop->fd); + } + if (!req->file) { + ret = -EBADF; + goto done; + } + } + if (nop->flags & IORING_NOP_FIXED_BUFFER) { + struct io_ring_ctx *ctx = req->ctx; + struct io_rsrc_node *node; + + ret = -EFAULT; + io_ring_submit_lock(ctx, issue_flags); + node = io_rsrc_node_lookup(&ctx->buf_table, nop->buffer); + if (node) { + io_req_assign_buf_node(req, node); + ret = 0; + } + io_ring_submit_unlock(ctx, issue_flags); + } +done: + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, nop->result, 0); return IOU_OK; } diff --git a/io_uring/notif.c b/io_uring/notif.c index d3e703c37aba..ee3a33510b3c 100644 --- 
a/io_uring/notif.c +++ b/io_uring/notif.c @@ -9,35 +9,36 @@ #include "notif.h" #include "rsrc.h" -static void io_notif_complete_tw_ext(struct io_kiocb *notif, struct io_tw_state *ts) +static const struct ubuf_info_ops io_ubuf_ops; + +static void io_notif_tw_complete(struct io_kiocb *notif, struct io_tw_state *ts) { struct io_notif_data *nd = io_notif_to_data(notif); - struct io_ring_ctx *ctx = notif->ctx; - if (nd->zc_report && (nd->zc_copied || !nd->zc_used)) - notif->cqe.res |= IORING_NOTIF_USAGE_ZC_COPIED; + do { + notif = cmd_to_io_kiocb(nd); - if (nd->account_pages && ctx->user) { - __io_unaccount_mem(ctx->user, nd->account_pages); - nd->account_pages = 0; - } - io_req_task_complete(notif, ts); -} + lockdep_assert(refcount_read(&nd->uarg.refcnt) == 0); -static void io_tx_ubuf_callback(struct sk_buff *skb, struct ubuf_info *uarg, - bool success) -{ - struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg); - struct io_kiocb *notif = cmd_to_io_kiocb(nd); + if (unlikely(nd->zc_report) && (nd->zc_copied || !nd->zc_used)) + notif->cqe.res |= IORING_NOTIF_USAGE_ZC_COPIED; + + if (nd->account_pages && notif->ctx->user) { + __io_unaccount_mem(notif->ctx->user, nd->account_pages); + nd->account_pages = 0; + } - if (refcount_dec_and_test(&uarg->refcnt)) - __io_req_task_work_add(notif, IOU_F_TWQ_LAZY_WAKE); + nd = nd->next; + io_req_task_complete(notif, ts); + } while (nd); } -static void io_tx_ubuf_callback_ext(struct sk_buff *skb, struct ubuf_info *uarg, - bool success) +void io_tx_ubuf_complete(struct sk_buff *skb, struct ubuf_info *uarg, + bool success) { struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg); + struct io_kiocb *notif = cmd_to_io_kiocb(nd); + unsigned tw_flags; if (nd->zc_report) { if (success && !nd->zc_used && skb) @@ -45,23 +46,64 @@ static void io_tx_ubuf_callback_ext(struct sk_buff *skb, struct ubuf_info *uarg, else if (!success && !nd->zc_copied) WRITE_ONCE(nd->zc_copied, true); } - io_tx_ubuf_callback(skb, 
uarg, success); + + if (!refcount_dec_and_test(&uarg->refcnt)) + return; + + if (nd->head != nd) { + io_tx_ubuf_complete(skb, &nd->head->uarg, success); + return; + } + + tw_flags = nd->next ? 0 : IOU_F_TWQ_LAZY_WAKE; + notif->io_task_work.func = io_notif_tw_complete; + __io_req_task_work_add(notif, tw_flags); } -void io_notif_set_extended(struct io_kiocb *notif) +static int io_link_skb(struct sk_buff *skb, struct ubuf_info *uarg) { - struct io_notif_data *nd = io_notif_to_data(notif); + struct io_notif_data *nd, *prev_nd; + struct io_kiocb *prev_notif, *notif; + struct ubuf_info *prev_uarg = skb_zcopy(skb); - if (nd->uarg.callback != io_tx_ubuf_callback_ext) { - nd->account_pages = 0; - nd->zc_report = false; - nd->zc_used = false; - nd->zc_copied = false; - nd->uarg.callback = io_tx_ubuf_callback_ext; - notif->io_task_work.func = io_notif_complete_tw_ext; + nd = container_of(uarg, struct io_notif_data, uarg); + notif = cmd_to_io_kiocb(nd); + + if (!prev_uarg) { + net_zcopy_get(&nd->uarg); + skb_zcopy_init(skb, &nd->uarg); + return 0; } + /* handle it separately as we can't link a notif to itself */ + if (unlikely(prev_uarg == &nd->uarg)) + return 0; + /* we can't join two links together, just request a fresh skb */ + if (unlikely(nd->head != nd || nd->next)) + return -EEXIST; + /* don't mix zc providers */ + if (unlikely(prev_uarg->ops != &io_ubuf_ops)) + return -EEXIST; + + prev_nd = container_of(prev_uarg, struct io_notif_data, uarg); + prev_notif = cmd_to_io_kiocb(nd); + + /* make sure all noifications can be finished in the same task_work */ + if (unlikely(notif->ctx != prev_notif->ctx || + notif->tctx != prev_notif->tctx)) + return -EEXIST; + + nd->head = prev_nd->head; + nd->next = prev_nd->next; + prev_nd->next = nd; + net_zcopy_get(&nd->head->uarg); + return 0; } +static const struct ubuf_info_ops io_ubuf_ops = { + .complete = io_tx_ubuf_complete, + .link_skb = io_link_skb, +}; + struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) 
__must_hold(&ctx->uring_lock) { @@ -73,14 +115,19 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) notif->opcode = IORING_OP_NOP; notif->flags = 0; notif->file = NULL; - notif->task = current; + notif->tctx = current->io_uring; io_get_task_refs(1); - notif->rsrc_node = NULL; - notif->io_task_work.func = io_req_task_complete; + notif->file_node = NULL; + notif->buf_node = NULL; nd = io_notif_to_data(notif); + nd->zc_report = false; + nd->account_pages = 0; + nd->next = NULL; + nd->head = nd; + nd->uarg.flags = IO_NOTIF_UBUF_FLAGS; - nd->uarg.callback = io_tx_ubuf_callback; + nd->uarg.ops = &io_ubuf_ops; refcount_set(&nd->uarg.refcnt, 1); return notif; } diff --git a/io_uring/notif.h b/io_uring/notif.h index 86d32bd9f856..f3589cfef4a9 100644 --- a/io_uring/notif.h +++ b/io_uring/notif.h @@ -13,14 +13,19 @@ struct io_notif_data { struct file *file; struct ubuf_info uarg; - unsigned long account_pages; + + struct io_notif_data *next; + struct io_notif_data *head; + + unsigned account_pages; bool zc_report; bool zc_used; bool zc_copied; }; struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx); -void io_notif_set_extended(struct io_kiocb *notif); +void io_tx_ubuf_complete(struct sk_buff *skb, struct ubuf_info *uarg, + bool success); static inline struct io_notif_data *io_notif_to_data(struct io_kiocb *notif) { @@ -32,9 +37,7 @@ static inline void io_notif_flush(struct io_kiocb *notif) { struct io_notif_data *nd = io_notif_to_data(notif); - /* drop slot's master ref */ - if (refcount_dec_and_test(&nd->uarg.refcnt)) - __io_req_task_work_add(notif, IOU_F_TWQ_LAZY_WAKE); + io_tx_ubuf_complete(NULL, &nd->uarg, true); } static inline int io_notif_account_mem(struct io_kiocb *notif, unsigned len) diff --git a/io_uring/opdef.c b/io_uring/opdef.c index b1ee3a9c3807..e8baef4e5146 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -7,6 +7,7 @@ #include <linux/fs.h> #include <linux/file.h> #include <linux/io_uring.h> +#include <linux/io_uring/cmd.h> #include 
"io_uring.h" #include "opdef.h" @@ -35,6 +36,7 @@ #include "rw.h" #include "waitid.h" #include "futex.h" +#include "truncate.h" static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags) { @@ -66,7 +68,8 @@ const struct io_issue_def io_issue_defs[] = { .iopoll = 1, .iopoll_queue = 1, .vectored = 1, - .prep = io_prep_rwv, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_readv, .issue = io_read, }, [IORING_OP_WRITEV] = { @@ -80,7 +83,8 @@ const struct io_issue_def io_issue_defs[] = { .iopoll = 1, .iopoll_queue = 1, .vectored = 1, - .prep = io_prep_rwv, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_writev, .issue = io_write, }, [IORING_OP_FSYNC] = { @@ -98,7 +102,8 @@ const struct io_issue_def io_issue_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .prep = io_prep_rw_fixed, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_read_fixed, .issue = io_read, }, [IORING_OP_WRITE_FIXED] = { @@ -111,7 +116,8 @@ const struct io_issue_def io_issue_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .prep = io_prep_rw_fixed, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_write_fixed, .issue = io_write, }, [IORING_OP_POLL_ADD] = { @@ -137,8 +143,8 @@ const struct io_issue_def io_issue_defs[] = { .unbound_nonreg_file = 1, .pollout = 1, .ioprio = 1, - .manual_alloc = 1, #if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), .prep = io_sendmsg_prep, .issue = io_sendmsg, #else @@ -151,8 +157,8 @@ const struct io_issue_def io_issue_defs[] = { .pollin = 1, .buffer_select = 1, .ioprio = 1, - .manual_alloc = 1, #if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), .prep = io_recvmsg_prep, .issue = io_recvmsg, #else @@ -161,6 +167,7 @@ const struct io_issue_def io_issue_defs[] = { }, [IORING_OP_TIMEOUT] = { .audit_skip = 1, + .async_size = sizeof(struct io_timeout_data), .prep = io_timeout_prep, .issue = io_timeout, }, @@ -190,6 +197,7 @@ const struct io_issue_def 
io_issue_defs[] = { }, [IORING_OP_LINK_TIMEOUT] = { .audit_skip = 1, + .async_size = sizeof(struct io_timeout_data), .prep = io_link_timeout_prep, .issue = io_no_issue, }, @@ -198,6 +206,7 @@ const struct io_issue_def io_issue_defs[] = { .unbound_nonreg_file = 1, .pollout = 1, #if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), .prep = io_connect_prep, .issue = io_connect, #else @@ -238,7 +247,8 @@ const struct io_issue_def io_issue_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .prep = io_prep_rw, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_read, .issue = io_read, }, [IORING_OP_WRITE] = { @@ -251,7 +261,8 @@ const struct io_issue_def io_issue_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .prep = io_prep_rw, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_write, .issue = io_write, }, [IORING_OP_FADVISE] = { @@ -271,8 +282,9 @@ const struct io_issue_def io_issue_defs[] = { .pollout = 1, .audit_skip = 1, .ioprio = 1, - .manual_alloc = 1, + .buffer_select = 1, #if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), .prep = io_sendmsg_prep, .issue = io_send, #else @@ -287,6 +299,7 @@ const struct io_issue_def io_issue_defs[] = { .audit_skip = 1, .ioprio = 1, #if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), .prep = io_recvmsg_prep, .issue = io_recv, #else @@ -402,6 +415,7 @@ const struct io_issue_def io_issue_defs[] = { .plug = 1, .iopoll = 1, .iopoll_queue = 1, + .async_size = sizeof(struct io_uring_cmd_data), .prep = io_uring_cmd_prep, .issue = io_uring_cmd, }, @@ -411,8 +425,8 @@ const struct io_issue_def io_issue_defs[] = { .pollout = 1, .audit_skip = 1, .ioprio = 1, - .manual_alloc = 1, #if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), .prep = io_send_zc_prep, .issue = io_send_zc, #else @@ -424,8 +438,8 @@ const struct io_issue_def io_issue_defs[] = { .unbound_nonreg_file = 1, .pollout = 1, .ioprio = 1, - .manual_alloc = 1, #if 
defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), .prep = io_send_zc_prep, .issue = io_sendmsg_zc, #else @@ -438,10 +452,12 @@ const struct io_issue_def io_issue_defs[] = { .pollin = 1, .buffer_select = 1, .audit_skip = 1, + .async_size = sizeof(struct io_async_rw), .prep = io_read_mshot_prep, .issue = io_read_mshot, }, [IORING_OP_WAITID] = { + .async_size = sizeof(struct io_waitid_async), .prep = io_waitid_prep, .issue = io_waitid, }, @@ -474,6 +490,32 @@ const struct io_issue_def io_issue_defs[] = { .prep = io_install_fixed_fd_prep, .issue = io_install_fixed_fd, }, + [IORING_OP_FTRUNCATE] = { + .needs_file = 1, + .hash_reg_file = 1, + .prep = io_ftruncate_prep, + .issue = io_ftruncate, + }, + [IORING_OP_BIND] = { +#if defined(CONFIG_NET) + .needs_file = 1, + .prep = io_bind_prep, + .issue = io_bind, + .async_size = sizeof(struct io_async_msghdr), +#else + .prep = io_eopnotsupp_prep, +#endif + }, + [IORING_OP_LISTEN] = { +#if defined(CONFIG_NET) + .needs_file = 1, + .prep = io_listen_prep, + .issue = io_listen, + .async_size = sizeof(struct io_async_msghdr), +#else + .prep = io_eopnotsupp_prep, +#endif + }, }; const struct io_cold_def io_cold_defs[] = { @@ -481,16 +523,12 @@ const struct io_cold_def io_cold_defs[] = { .name = "NOP", }, [IORING_OP_READV] = { - .async_size = sizeof(struct io_async_rw), .name = "READV", - .prep_async = io_readv_prep_async, .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, [IORING_OP_WRITEV] = { - .async_size = sizeof(struct io_async_rw), .name = "WRITEV", - .prep_async = io_writev_prep_async, .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, @@ -498,13 +536,13 @@ const struct io_cold_def io_cold_defs[] = { .name = "FSYNC", }, [IORING_OP_READ_FIXED] = { - .async_size = sizeof(struct io_async_rw), .name = "READ_FIXED", + .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, [IORING_OP_WRITE_FIXED] = { - .async_size = sizeof(struct io_async_rw), .name = "WRITE_FIXED", + .cleanup = 
io_readv_writev_cleanup, .fail = io_rw_fail, }, [IORING_OP_POLL_ADD] = { @@ -519,8 +557,6 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_SENDMSG] = { .name = "SENDMSG", #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), - .prep_async = io_sendmsg_prep_async, .cleanup = io_sendmsg_recvmsg_cleanup, .fail = io_sendrecv_fail, #endif @@ -528,14 +564,11 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_RECVMSG] = { .name = "RECVMSG", #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), - .prep_async = io_recvmsg_prep_async, .cleanup = io_sendmsg_recvmsg_cleanup, .fail = io_sendrecv_fail, #endif }, [IORING_OP_TIMEOUT] = { - .async_size = sizeof(struct io_timeout_data), .name = "TIMEOUT", }, [IORING_OP_TIMEOUT_REMOVE] = { @@ -548,15 +581,10 @@ const struct io_cold_def io_cold_defs[] = { .name = "ASYNC_CANCEL", }, [IORING_OP_LINK_TIMEOUT] = { - .async_size = sizeof(struct io_timeout_data), .name = "LINK_TIMEOUT", }, [IORING_OP_CONNECT] = { .name = "CONNECT", -#if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_connect), - .prep_async = io_connect_prep_async, -#endif }, [IORING_OP_FALLOCATE] = { .name = "FALLOCATE", @@ -576,13 +604,13 @@ const struct io_cold_def io_cold_defs[] = { .cleanup = io_statx_cleanup, }, [IORING_OP_READ] = { - .async_size = sizeof(struct io_async_rw), .name = "READ", + .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, [IORING_OP_WRITE] = { - .async_size = sizeof(struct io_async_rw), .name = "WRITE", + .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, [IORING_OP_FADVISE] = { @@ -594,14 +622,14 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_SEND] = { .name = "SEND", #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), + .cleanup = io_sendmsg_recvmsg_cleanup, .fail = io_sendrecv_fail, - .prep_async = io_send_prep_async, #endif }, [IORING_OP_RECV] = { .name = "RECV", #if defined(CONFIG_NET) + .cleanup = io_sendmsg_recvmsg_cleanup, .fail = 
io_sendrecv_fail, #endif }, @@ -614,6 +642,7 @@ const struct io_cold_def io_cold_defs[] = { }, [IORING_OP_SPLICE] = { .name = "SPLICE", + .cleanup = io_splice_cleanup, }, [IORING_OP_PROVIDE_BUFFERS] = { .name = "PROVIDE_BUFFERS", @@ -623,6 +652,7 @@ const struct io_cold_def io_cold_defs[] = { }, [IORING_OP_TEE] = { .name = "TEE", + .cleanup = io_splice_cleanup, }, [IORING_OP_SHUTDOWN] = { .name = "SHUTDOWN", @@ -672,14 +702,10 @@ const struct io_cold_def io_cold_defs[] = { }, [IORING_OP_URING_CMD] = { .name = "URING_CMD", - .async_size = 2 * sizeof(struct io_uring_sqe), - .prep_async = io_uring_cmd_prep_async, }, [IORING_OP_SEND_ZC] = { .name = "SEND_ZC", #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), - .prep_async = io_send_prep_async, .cleanup = io_send_zc_cleanup, .fail = io_sendrecv_fail, #endif @@ -687,18 +713,16 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_SENDMSG_ZC] = { .name = "SENDMSG_ZC", #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), - .prep_async = io_sendmsg_prep_async, .cleanup = io_send_zc_cleanup, .fail = io_sendrecv_fail, #endif }, [IORING_OP_READ_MULTISHOT] = { .name = "READ_MULTISHOT", + .cleanup = io_readv_writev_cleanup, }, [IORING_OP_WAITID] = { .name = "WAITID", - .async_size = sizeof(struct io_waitid_async), }, [IORING_OP_FUTEX_WAIT] = { .name = "FUTEX_WAIT", @@ -712,6 +736,15 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_FIXED_FD_INSTALL] = { .name = "FIXED_FD_INSTALL", }, + [IORING_OP_FTRUNCATE] = { + .name = "FTRUNCATE", + }, + [IORING_OP_BIND] = { + .name = "BIND", + }, + [IORING_OP_LISTEN] = { + .name = "LISTEN", + }, }; const char *io_uring_get_opcode(u8 opcode) @@ -721,6 +754,14 @@ const char *io_uring_get_opcode(u8 opcode) return "INVALID"; } +bool io_uring_op_supported(u8 opcode) +{ + if (opcode < IORING_OP_LAST && + io_issue_defs[opcode].prep != io_eopnotsupp_prep) + return true; + return false; +} + void __init io_uring_optable_init(void) { int i; diff 
--git a/io_uring/opdef.h b/io_uring/opdef.h index 9e5435ec27d0..14456436ff74 100644 --- a/io_uring/opdef.h +++ b/io_uring/opdef.h @@ -17,8 +17,6 @@ struct io_issue_def { unsigned poll_exclusive : 1; /* op supports buffer selection */ unsigned buffer_select : 1; - /* opcode is not supported by this kernel */ - unsigned not_supported : 1; /* skip auditing */ unsigned audit_skip : 1; /* supports ioprio */ @@ -27,22 +25,19 @@ struct io_issue_def { unsigned iopoll : 1; /* have to be put into the iopoll list */ unsigned iopoll_queue : 1; - /* opcode specific path will handle ->async_data allocation if needed */ - unsigned manual_alloc : 1; /* vectored opcode, set if 1) vectored, and 2) handler needs to know */ unsigned vectored : 1; + /* size of async data needed, if any */ + unsigned short async_size; + int (*issue)(struct io_kiocb *, unsigned int); int (*prep)(struct io_kiocb *, const struct io_uring_sqe *); }; struct io_cold_def { - /* size of async data needed, if any */ - unsigned short async_size; - const char *name; - int (*prep_async)(struct io_kiocb *); void (*cleanup)(struct io_kiocb *); void (*fail)(struct io_kiocb *); }; @@ -50,5 +45,7 @@ struct io_cold_def { extern const struct io_issue_def io_issue_defs[]; extern const struct io_cold_def io_cold_defs[]; +bool io_uring_op_supported(u8 opcode); + void io_uring_optable_init(void); #endif diff --git a/io_uring/poll.c b/io_uring/poll.c index 7513afc7b702..bb1c0cd4f809 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -14,7 +14,9 @@ #include <uapi/linux/io_uring.h> #include "io_uring.h" +#include "alloc_cache.h" #include "refs.h" +#include "napi.h" #include "opdef.h" #include "kbuf.h" #include "poll.h" @@ -120,53 +122,12 @@ static void io_poll_req_insert(struct io_kiocb *req) { struct io_hash_table *table = &req->ctx->cancel_table; u32 index = hash_long(req->cqe.user_data, table->hash_bits); - struct io_hash_bucket *hb = &table->hbs[index]; - - spin_lock(&hb->lock); - hlist_add_head(&req->hash_node, 
&hb->list); - spin_unlock(&hb->lock); -} - -static void io_poll_req_delete(struct io_kiocb *req, struct io_ring_ctx *ctx) -{ - struct io_hash_table *table = &req->ctx->cancel_table; - u32 index = hash_long(req->cqe.user_data, table->hash_bits); - spinlock_t *lock = &table->hbs[index].lock; - - spin_lock(lock); - hash_del(&req->hash_node); - spin_unlock(lock); -} - -static void io_poll_req_insert_locked(struct io_kiocb *req) -{ - struct io_hash_table *table = &req->ctx->cancel_table_locked; - u32 index = hash_long(req->cqe.user_data, table->hash_bits); lockdep_assert_held(&req->ctx->uring_lock); hlist_add_head(&req->hash_node, &table->hbs[index].list); } -static void io_poll_tw_hash_eject(struct io_kiocb *req, struct io_tw_state *ts) -{ - struct io_ring_ctx *ctx = req->ctx; - - if (req->flags & REQ_F_HASH_LOCKED) { - /* - * ->cancel_table_locked is protected by ->uring_lock in - * contrast to per bucket spinlocks. Likely, tctx_task_work() - * already grabbed the mutex for us, but there is a chance it - * failed. 
- */ - io_tw_lock(ctx, ts); - hash_del(&req->hash_node); - req->flags &= ~REQ_F_HASH_LOCKED; - } else { - io_poll_req_delete(req, ctx); - } -} - static void io_init_poll_iocb(struct io_poll *poll, __poll_t events) { poll->head = NULL; @@ -263,8 +224,7 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) { int v; - /* req->task == current here, checking PF_EXITING is safe */ - if (unlikely(req->task->flags & PF_EXITING)) + if (unlikely(io_should_terminate_tw())) return -ECANCELED; do { @@ -313,6 +273,8 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) return IOU_POLL_REISSUE; } } + if (unlikely(req->cqe.res & EPOLLERR)) + req_set_fail(req); if (req->apoll_events & EPOLLONESHOT) return IOU_POLL_DONE; @@ -321,8 +283,7 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) __poll_t mask = mangle_poll(req->cqe.res & req->apoll_events); - if (!io_fill_cqe_req_aux(req, ts->locked, mask, - IORING_CQE_F_MORE)) { + if (!io_req_post_cqe(req, mask, IORING_CQE_F_MORE)) { io_req_set_res(req, mask, 0); return IOU_POLL_REMOVE_POLL_USE_RES; } @@ -343,9 +304,10 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) * Release all references, retry if someone tried to restart * task_work while we were executing it. 
*/ - } while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs) & - IO_POLL_REF_MASK); + v &= IO_POLL_REF_MASK; + } while (atomic_sub_return(v, &req->poll_refs) & IO_POLL_REF_MASK); + io_napi_add(req); return IOU_POLL_NO_ACTION; } @@ -355,13 +317,16 @@ void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts) ret = io_poll_check_events(req, ts); if (ret == IOU_POLL_NO_ACTION) { + io_kbuf_recycle(req, 0); return; } else if (ret == IOU_POLL_REQUEUE) { + io_kbuf_recycle(req, 0); __io_poll_execute(req, 0); return; } io_poll_remove_entries(req); - io_poll_tw_hash_eject(req, ts); + /* task_work always has ->uring_lock held */ + hash_del(&req->hash_node); if (req->opcode == IORING_OP_POLL_ADD) { if (ret == IOU_POLL_DONE) { @@ -539,14 +504,6 @@ static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt, poll->wait.private = (void *) wqe_private; if (poll->events & EPOLLEXCLUSIVE) { - /* - * Exclusive waits may only wake a limited amount of entries - * rather than all of them, this may interfere with lazy - * wake if someone does wait(events > 1). Ensure we don't do - * lazy wake for those, as we need to process each one as they - * come in. 
- */ - req->flags |= REQ_F_POLL_NO_LAZY; add_wait_queue_exclusive(head, &poll->wait); } else { add_wait_queue(head, &poll->wait); @@ -569,12 +526,13 @@ static bool io_poll_can_finish_inline(struct io_kiocb *req, return pt->owning || io_poll_get_ownership(req); } -static void io_poll_add_hash(struct io_kiocb *req) +static void io_poll_add_hash(struct io_kiocb *req, unsigned int issue_flags) { - if (req->flags & REQ_F_HASH_LOCKED) - io_poll_req_insert_locked(req); - else - io_poll_req_insert(req); + struct io_ring_ctx *ctx = req->ctx; + + io_ring_submit_lock(ctx, issue_flags); + io_poll_req_insert(req); + io_ring_submit_unlock(ctx, issue_flags); } /* @@ -588,10 +546,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req, struct io_poll_table *ipt, __poll_t mask, unsigned issue_flags) { - struct io_ring_ctx *ctx = req->ctx; - INIT_HLIST_NODE(&req->hash_node); - req->work.cancel_seq = atomic_read(&ctx->cancel_seq); io_init_poll_iocb(poll, mask); poll->file = req->file; req->apoll_events = poll->events; @@ -614,9 +569,15 @@ static int __io_arm_poll_handler(struct io_kiocb *req, ipt->owning = issue_flags & IO_URING_F_UNLOCKED; atomic_set(&req->poll_refs, (int)ipt->owning); - /* io-wq doesn't hold uring_lock */ - if (issue_flags & IO_URING_F_UNLOCKED) - req->flags &= ~REQ_F_HASH_LOCKED; + /* + * Exclusive waits may only wake a limited amount of entries + * rather than all of them, this may interfere with lazy + * wake if someone does wait(events > 1). Ensure we don't do + * lazy wake for those, as we need to process each one as they + * come in. 
+ */ + if (poll->events & EPOLLEXCLUSIVE) + req->flags |= REQ_F_POLL_NO_LAZY; mask = vfs_poll(req->file, &ipt->pt) & poll->events; @@ -636,7 +597,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req, if (mask && ((poll->events & (EPOLLET|EPOLLONESHOT)) == (EPOLLET|EPOLLONESHOT))) { if (!io_poll_can_finish_inline(req, ipt)) { - io_poll_add_hash(req); + io_poll_add_hash(req, issue_flags); return 0; } io_poll_remove_entries(req); @@ -645,13 +606,14 @@ static int __io_arm_poll_handler(struct io_kiocb *req, return 1; } - io_poll_add_hash(req); + io_poll_add_hash(req, issue_flags); if (mask && (poll->events & EPOLLET) && io_poll_can_finish_inline(req, ipt)) { __io_poll_execute(req, mask); return 0; } + io_napi_add(req); if (ipt->owning) { /* @@ -685,22 +647,17 @@ static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; - struct io_cache_entry *entry; struct async_poll *apoll; if (req->flags & REQ_F_POLLED) { apoll = req->apoll; kfree(apoll->double_poll); - } else if (!(issue_flags & IO_URING_F_UNLOCKED)) { - entry = io_alloc_cache_get(&ctx->apoll_cache); - if (entry == NULL) - goto alloc_apoll; - apoll = container_of(entry, struct async_poll, cache); - apoll->poll.retries = APOLL_MAX_RETRY; } else { -alloc_apoll: - apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); - if (unlikely(!apoll)) + if (!(issue_flags & IO_URING_F_UNLOCKED)) + apoll = io_cache_alloc(&ctx->apoll_cache, GFP_ATOMIC); + else + apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); + if (!apoll) return NULL; apoll->poll.retries = APOLL_MAX_RETRY; } @@ -719,15 +676,9 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) __poll_t mask = POLLPRI | POLLERR | EPOLLET; int ret; - /* - * apoll requests already grab the mutex to complete in the tw handler, - * so removal from the mutex-backed hash is free, use it by default. 
- */ - req->flags |= REQ_F_HASH_LOCKED; - if (!def->pollin && !def->pollout) return IO_APOLL_ABORTED; - if (!file_can_poll(req->file)) + if (!io_file_can_poll(req)) return IO_APOLL_ABORTED; if (!(req->flags & REQ_F_APOLL_MULTISHOT)) mask |= EPOLLONESHOT; @@ -760,97 +711,69 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) return IO_APOLL_OK; } -static __cold bool io_poll_remove_all_table(struct task_struct *tsk, - struct io_hash_table *table, - bool cancel_all) +/* + * Returns true if we found and killed one or more poll requests + */ +__cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, + bool cancel_all) { - unsigned nr_buckets = 1U << table->hash_bits; + unsigned nr_buckets = 1U << ctx->cancel_table.hash_bits; struct hlist_node *tmp; struct io_kiocb *req; bool found = false; int i; + lockdep_assert_held(&ctx->uring_lock); + for (i = 0; i < nr_buckets; i++) { - struct io_hash_bucket *hb = &table->hbs[i]; + struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i]; - spin_lock(&hb->lock); hlist_for_each_entry_safe(req, tmp, &hb->list, hash_node) { - if (io_match_task_safe(req, tsk, cancel_all)) { + if (io_match_task_safe(req, tctx, cancel_all)) { hlist_del_init(&req->hash_node); io_poll_cancel_req(req); found = true; } } - spin_unlock(&hb->lock); } return found; } -/* - * Returns true if we found and killed one or more poll requests - */ -__cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, - bool cancel_all) - __must_hold(&ctx->uring_lock) -{ - bool ret; - - ret = io_poll_remove_all_table(tsk, &ctx->cancel_table, cancel_all); - ret |= io_poll_remove_all_table(tsk, &ctx->cancel_table_locked, cancel_all); - return ret; -} - static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, - struct io_cancel_data *cd, - struct io_hash_table *table, - struct io_hash_bucket **out_bucket) + struct io_cancel_data *cd) { struct io_kiocb *req; - u32 index = hash_long(cd->data, 
table->hash_bits); - struct io_hash_bucket *hb = &table->hbs[index]; - - *out_bucket = NULL; + u32 index = hash_long(cd->data, ctx->cancel_table.hash_bits); + struct io_hash_bucket *hb = &ctx->cancel_table.hbs[index]; - spin_lock(&hb->lock); hlist_for_each_entry(req, &hb->list, hash_node) { if (cd->data != req->cqe.user_data) continue; if (poll_only && req->opcode != IORING_OP_POLL_ADD) continue; if (cd->flags & IORING_ASYNC_CANCEL_ALL) { - if (cd->seq == req->work.cancel_seq) + if (io_cancel_match_sequence(req, cd->seq)) continue; - req->work.cancel_seq = cd->seq; } - *out_bucket = hb; return req; } - spin_unlock(&hb->lock); return NULL; } static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, - struct io_cancel_data *cd, - struct io_hash_table *table, - struct io_hash_bucket **out_bucket) + struct io_cancel_data *cd) { - unsigned nr_buckets = 1U << table->hash_bits; + unsigned nr_buckets = 1U << ctx->cancel_table.hash_bits; struct io_kiocb *req; int i; - *out_bucket = NULL; - for (i = 0; i < nr_buckets; i++) { - struct io_hash_bucket *hb = &table->hbs[i]; + struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i]; - spin_lock(&hb->lock); hlist_for_each_entry(req, &hb->list, hash_node) { - if (io_cancel_req_match(req, cd)) { - *out_bucket = hb; + if (io_cancel_req_match(req, cd)) return req; - } } - spin_unlock(&hb->lock); } return NULL; } @@ -866,23 +789,21 @@ static int io_poll_disarm(struct io_kiocb *req) return 0; } -static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, - struct io_hash_table *table) +static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) { - struct io_hash_bucket *bucket; struct io_kiocb *req; if (cd->flags & (IORING_ASYNC_CANCEL_FD | IORING_ASYNC_CANCEL_OP | IORING_ASYNC_CANCEL_ANY)) - req = io_poll_file_find(ctx, cd, table, &bucket); + req = io_poll_file_find(ctx, cd); else - req = io_poll_find(ctx, false, cd, table, &bucket); + req = io_poll_find(ctx, false, cd); - if (req) + if 
(req) { io_poll_cancel_req(req); - if (bucket) - spin_unlock(&bucket->lock); - return req ? 0 : -ENOENT; + return 0; + } + return -ENOENT; } int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, @@ -890,12 +811,8 @@ int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, { int ret; - ret = __io_poll_cancel(ctx, cd, &ctx->cancel_table); - if (ret != -ENOENT) - return ret; - io_ring_submit_lock(ctx, issue_flags); - ret = __io_poll_cancel(ctx, cd, &ctx->cancel_table_locked); + ret = __io_poll_cancel(ctx, cd); io_ring_submit_unlock(ctx, issue_flags); return ret; } @@ -972,13 +889,6 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) ipt.pt._qproc = io_poll_queue_proc; - /* - * If sqpoll or single issuer, there is no contention for ->uring_lock - * and we'll end up holding it in tw handlers anyway. - */ - if (req->ctx->flags & (IORING_SETUP_SQPOLL|IORING_SETUP_SINGLE_ISSUER)) - req->flags |= REQ_F_HASH_LOCKED; - ret = __io_arm_poll_handler(req, poll, &ipt, poll->events, issue_flags); if (ret > 0) { io_req_set_res(req, ipt.result_mask, 0); @@ -992,33 +902,16 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) struct io_poll_update *poll_update = io_kiocb_to_cmd(req, struct io_poll_update); struct io_ring_ctx *ctx = req->ctx; struct io_cancel_data cd = { .ctx = ctx, .data = poll_update->old_user_data, }; - struct io_hash_bucket *bucket; struct io_kiocb *preq; int ret2, ret = 0; - struct io_tw_state ts = { .locked = true }; io_ring_submit_lock(ctx, issue_flags); - preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table, &bucket); + preq = io_poll_find(ctx, true, &cd); ret2 = io_poll_disarm(preq); - if (bucket) - spin_unlock(&bucket->lock); - if (!ret2) - goto found; - if (ret2 != -ENOENT) { - ret = ret2; - goto out; - } - - preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table_locked, &bucket); - ret2 = io_poll_disarm(preq); - if (bucket) - spin_unlock(&bucket->lock); if (ret2) { ret = ret2; goto out; } - 
-found: if (WARN_ON_ONCE(preq->opcode != IORING_OP_POLL_ADD)) { ret = -EFAULT; goto out; @@ -1044,7 +937,8 @@ found: req_set_fail(preq); io_req_set_res(preq, -ECANCELED, 0); - io_req_task_complete(preq, &ts); + preq->io_task_work.func = io_req_task_complete; + io_req_task_work_add(preq); out: io_ring_submit_unlock(ctx, issue_flags); if (ret < 0) { @@ -1055,8 +949,3 @@ out: io_req_set_res(req, ret, 0); return IOU_OK; } - -void io_apoll_cache_free(struct io_cache_entry *entry) -{ - kfree(container_of(entry, struct async_poll, cache)); -} diff --git a/io_uring/poll.h b/io_uring/poll.h index 1dacae9e816c..04ede93113dc 100644 --- a/io_uring/poll.h +++ b/io_uring/poll.h @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 -#include "alloc_cache.h" +#define IO_POLL_ALLOC_CACHE_MAX 32 enum { IO_APOLL_OK, @@ -17,10 +17,7 @@ struct io_poll { }; struct async_poll { - union { - struct io_poll poll; - struct io_cache_entry cache; - }; + struct io_poll poll; struct io_poll *double_poll; }; @@ -43,9 +40,7 @@ struct io_cancel_data; int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, unsigned issue_flags); int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags); -bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, +bool io_poll_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all); -void io_apoll_cache_free(struct io_cache_entry *entry); - void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts); diff --git a/io_uring/refs.h b/io_uring/refs.h index 1336de3f2a30..63982ead9f7d 100644 --- a/io_uring/refs.h +++ b/io_uring/refs.h @@ -33,6 +33,13 @@ static inline void req_ref_get(struct io_kiocb *req) atomic_inc(&req->refs); } +static inline void req_ref_put(struct io_kiocb *req) +{ + WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); + WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); + atomic_dec(&req->refs); +} + static inline void __io_req_set_refcount(struct io_kiocb *req, int nr) { if 
(!(req->flags & REQ_F_REFCOUNT)) { diff --git a/io_uring/register.c b/io_uring/register.c index 5e62c1208996..9a4d2fbce4ae 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -26,65 +26,14 @@ #include "register.h" #include "cancel.h" #include "kbuf.h" +#include "napi.h" +#include "eventfd.h" +#include "msg_ring.h" +#include "memmap.h" #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) -static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, - unsigned int eventfd_async) -{ - struct io_ev_fd *ev_fd; - __s32 __user *fds = arg; - int fd; - - ev_fd = rcu_dereference_protected(ctx->io_ev_fd, - lockdep_is_held(&ctx->uring_lock)); - if (ev_fd) - return -EBUSY; - - if (copy_from_user(&fd, fds, sizeof(*fds))) - return -EFAULT; - - ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL); - if (!ev_fd) - return -ENOMEM; - - ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd); - if (IS_ERR(ev_fd->cq_ev_fd)) { - int ret = PTR_ERR(ev_fd->cq_ev_fd); - kfree(ev_fd); - return ret; - } - - spin_lock(&ctx->completion_lock); - ctx->evfd_last_cq_tail = ctx->cached_cq_tail; - spin_unlock(&ctx->completion_lock); - - ev_fd->eventfd_async = eventfd_async; - ctx->has_evfd = true; - rcu_assign_pointer(ctx->io_ev_fd, ev_fd); - atomic_set(&ev_fd->refs, 1); - atomic_set(&ev_fd->ops, 0); - return 0; -} - -int io_eventfd_unregister(struct io_ring_ctx *ctx) -{ - struct io_ev_fd *ev_fd; - - ev_fd = rcu_dereference_protected(ctx->io_ev_fd, - lockdep_is_held(&ctx->uring_lock)); - if (ev_fd) { - ctx->has_evfd = false; - rcu_assign_pointer(ctx->io_ev_fd, NULL); - if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops)) - call_rcu(&ev_fd->rcu, io_eventfd_ops); - return 0; - } - - return -ENXIO; -} - static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) { @@ -92,9 +41,10 @@ static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg, size_t size; int i, ret; + if (nr_args > IORING_OP_LAST) + nr_args = 
IORING_OP_LAST; + size = struct_size(p, ops, nr_args); - if (size == SIZE_MAX) - return -EOVERFLOW; p = kzalloc(size, GFP_KERNEL); if (!p) return -ENOMEM; @@ -107,12 +57,10 @@ static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg, goto out; p->last_op = IORING_OP_LAST - 1; - if (nr_args > IORING_OP_LAST) - nr_args = IORING_OP_LAST; for (i = 0; i < nr_args; i++) { p->ops[i].op = i; - if (!io_issue_defs[i].not_supported) + if (io_uring_op_supported(i)) p->ops[i].flags = IO_URING_OP_SUPPORTED; } p->ops_len = i; @@ -156,21 +104,13 @@ static int io_register_personality(struct io_ring_ctx *ctx) return id; } -static __cold int io_register_restrictions(struct io_ring_ctx *ctx, - void __user *arg, unsigned int nr_args) +static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args, + struct io_restriction *restrictions) { struct io_uring_restriction *res; size_t size; int i, ret; - /* Restrictions allowed only if rings started disabled */ - if (!(ctx->flags & IORING_SETUP_R_DISABLED)) - return -EBADFD; - - /* We allow only a single restrictions registration */ - if (ctx->restrictions.registered) - return -EBUSY; - if (!arg || nr_args > IORING_MAX_RESTRICTIONS) return -EINVAL; @@ -182,47 +122,57 @@ static __cold int io_register_restrictions(struct io_ring_ctx *ctx, if (IS_ERR(res)) return PTR_ERR(res); - ret = 0; + ret = -EINVAL; for (i = 0; i < nr_args; i++) { switch (res[i].opcode) { case IORING_RESTRICTION_REGISTER_OP: - if (res[i].register_op >= IORING_REGISTER_LAST) { - ret = -EINVAL; - goto out; - } - - __set_bit(res[i].register_op, - ctx->restrictions.register_op); + if (res[i].register_op >= IORING_REGISTER_LAST) + goto err; + __set_bit(res[i].register_op, restrictions->register_op); break; case IORING_RESTRICTION_SQE_OP: - if (res[i].sqe_op >= IORING_OP_LAST) { - ret = -EINVAL; - goto out; - } - - __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op); + if (res[i].sqe_op >= IORING_OP_LAST) + goto err; + __set_bit(res[i].sqe_op, 
restrictions->sqe_op); break; case IORING_RESTRICTION_SQE_FLAGS_ALLOWED: - ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags; + restrictions->sqe_flags_allowed = res[i].sqe_flags; break; case IORING_RESTRICTION_SQE_FLAGS_REQUIRED: - ctx->restrictions.sqe_flags_required = res[i].sqe_flags; + restrictions->sqe_flags_required = res[i].sqe_flags; break; default: - ret = -EINVAL; - goto out; + goto err; } } -out: + ret = 0; + +err: + kfree(res); + return ret; +} + +static __cold int io_register_restrictions(struct io_ring_ctx *ctx, + void __user *arg, unsigned int nr_args) +{ + int ret; + + /* Restrictions allowed only if rings started disabled */ + if (!(ctx->flags & IORING_SETUP_R_DISABLED)) + return -EBADFD; + + /* We allow only a single restrictions registration */ + if (ctx->restrictions.registered) + return -EBUSY; + + ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions); /* Reset all restrictions if an error happened */ if (ret != 0) memset(&ctx->restrictions, 0, sizeof(ctx->restrictions)); else ctx->restrictions.registered = true; - - kfree(res); return ret; } @@ -354,8 +304,10 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, } if (sqd) { + mutex_unlock(&ctx->uring_lock); mutex_unlock(&sqd->lock); io_put_sq_data(sqd); + mutex_lock(&ctx->uring_lock); } if (copy_to_user(arg, new_count, sizeof(new_count))) @@ -367,8 +319,7 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, /* now propagate the restriction to all registered users */ list_for_each_entry(node, &ctx->tctx_list, ctx_node) { - struct io_uring_task *tctx = node->task->io_uring; - + tctx = node->task->io_uring; if (WARN_ON_ONCE(!tctx->io_wq)) continue; @@ -380,12 +331,300 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, return 0; err: if (sqd) { + mutex_unlock(&ctx->uring_lock); mutex_unlock(&sqd->lock); io_put_sq_data(sqd); + mutex_lock(&ctx->uring_lock); + } + return ret; +} + +static int io_register_clock(struct 
io_ring_ctx *ctx, + struct io_uring_clock_register __user *arg) +{ + struct io_uring_clock_register reg; + + if (copy_from_user(&reg, arg, sizeof(reg))) + return -EFAULT; + if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv))) + return -EINVAL; + + switch (reg.clockid) { + case CLOCK_MONOTONIC: + ctx->clock_offset = 0; + break; + case CLOCK_BOOTTIME: + ctx->clock_offset = TK_OFFS_BOOT; + break; + default: + return -EINVAL; + } + + ctx->clockid = reg.clockid; + return 0; +} + +/* + * State to maintain until we can swap. Both new and old state, used for + * either mapping or freeing. + */ +struct io_ring_ctx_rings { + struct io_rings *rings; + struct io_uring_sqe *sq_sqes; + + struct io_mapped_region sq_region; + struct io_mapped_region ring_region; +}; + +static void io_register_free_rings(struct io_ring_ctx *ctx, + struct io_uring_params *p, + struct io_ring_ctx_rings *r) +{ + io_free_region(ctx, &r->sq_region); + io_free_region(ctx, &r->ring_region); +} + +#define swap_old(ctx, o, n, field) \ + do { \ + (o).field = (ctx)->field; \ + (ctx)->field = (n).field; \ + } while (0) + +#define RESIZE_FLAGS (IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP) +#define COPY_FLAGS (IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \ + IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP) + +static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) +{ + struct io_uring_region_desc rd; + struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL; + size_t size, sq_array_offset; + unsigned i, tail, old_head; + struct io_uring_params p; + int ret; + + /* for single issuer, must be owner resizing */ + if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && + current != ctx->submitter_task) + return -EEXIST; + /* limited to DEFER_TASKRUN for now */ + if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) + return -EINVAL; + if (copy_from_user(&p, arg, sizeof(p))) + return -EFAULT; + if (p.flags & ~RESIZE_FLAGS) + return -EINVAL; + + /* properties that are always inherited */ + p.flags |= (ctx->flags & 
COPY_FLAGS); + + ret = io_uring_fill_params(p.sq_entries, &p); + if (unlikely(ret)) + return ret; + + /* nothing to do, but copy params back */ + if (p.sq_entries == ctx->sq_entries && p.cq_entries == ctx->cq_entries) { + if (copy_to_user(arg, &p, sizeof(p))) + return -EFAULT; + return 0; + } + + size = rings_size(p.flags, p.sq_entries, p.cq_entries, + &sq_array_offset); + if (size == SIZE_MAX) + return -EOVERFLOW; + + memset(&rd, 0, sizeof(rd)); + rd.size = PAGE_ALIGN(size); + if (p.flags & IORING_SETUP_NO_MMAP) { + rd.user_addr = p.cq_off.user_addr; + rd.flags |= IORING_MEM_REGION_TYPE_USER; + } + ret = io_create_region_mmap_safe(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING); + if (ret) { + io_register_free_rings(ctx, &p, &n); + return ret; + } + n.rings = io_region_get_ptr(&n.ring_region); + + /* + * At this point n.rings is shared with userspace, just like o.rings + * is as well. While we don't expect userspace to modify it while + * a resize is in progress, and it's most likely that userspace will + * shoot itself in the foot if it does, we can't always assume good + * intent... Use read/write once helpers from here on to indicate the + * shared nature of it. 
+ */ + WRITE_ONCE(n.rings->sq_ring_mask, p.sq_entries - 1); + WRITE_ONCE(n.rings->cq_ring_mask, p.cq_entries - 1); + WRITE_ONCE(n.rings->sq_ring_entries, p.sq_entries); + WRITE_ONCE(n.rings->cq_ring_entries, p.cq_entries); + + if (copy_to_user(arg, &p, sizeof(p))) { + io_register_free_rings(ctx, &p, &n); + return -EFAULT; + } + + if (p.flags & IORING_SETUP_SQE128) + size = array_size(2 * sizeof(struct io_uring_sqe), p.sq_entries); + else + size = array_size(sizeof(struct io_uring_sqe), p.sq_entries); + if (size == SIZE_MAX) { + io_register_free_rings(ctx, &p, &n); + return -EOVERFLOW; + } + + memset(&rd, 0, sizeof(rd)); + rd.size = PAGE_ALIGN(size); + if (p.flags & IORING_SETUP_NO_MMAP) { + rd.user_addr = p.sq_off.user_addr; + rd.flags |= IORING_MEM_REGION_TYPE_USER; + } + ret = io_create_region_mmap_safe(ctx, &n.sq_region, &rd, IORING_OFF_SQES); + if (ret) { + io_register_free_rings(ctx, &p, &n); + return ret; + } + n.sq_sqes = io_region_get_ptr(&n.sq_region); + + /* + * If using SQPOLL, park the thread + */ + if (ctx->sq_data) { + mutex_unlock(&ctx->uring_lock); + io_sq_thread_park(ctx->sq_data); + mutex_lock(&ctx->uring_lock); } + + /* + * We'll do the swap. Grab the ctx->mmap_lock, which will exclude + * any new mmap's on the ring fd. Clear out existing mappings to prevent + * mmap from seeing them, as we'll unmap them. Any attempt to mmap + * existing rings beyond this point will fail. Not that it could proceed + * at this point anyway, as the io_uring mmap side needs go grab the + * ctx->mmap_lock as well. Likewise, hold the completion lock over the + * duration of the actual swap. + */ + mutex_lock(&ctx->mmap_lock); + spin_lock(&ctx->completion_lock); + o.rings = ctx->rings; + ctx->rings = NULL; + o.sq_sqes = ctx->sq_sqes; + ctx->sq_sqes = NULL; + + /* + * Now copy SQ and CQ entries, if any. If either of the destination + * rings can't hold what is already there, then fail the operation. 
+ */ + tail = READ_ONCE(o.rings->sq.tail); + old_head = READ_ONCE(o.rings->sq.head); + if (tail - old_head > p.sq_entries) + goto overflow; + for (i = old_head; i < tail; i++) { + unsigned src_head = i & (ctx->sq_entries - 1); + unsigned dst_head = i & (p.sq_entries - 1); + + n.sq_sqes[dst_head] = o.sq_sqes[src_head]; + } + WRITE_ONCE(n.rings->sq.head, old_head); + WRITE_ONCE(n.rings->sq.tail, tail); + + tail = READ_ONCE(o.rings->cq.tail); + old_head = READ_ONCE(o.rings->cq.head); + if (tail - old_head > p.cq_entries) { +overflow: + /* restore old rings, and return -EOVERFLOW via cleanup path */ + ctx->rings = o.rings; + ctx->sq_sqes = o.sq_sqes; + to_free = &n; + ret = -EOVERFLOW; + goto out; + } + for (i = old_head; i < tail; i++) { + unsigned src_head = i & (ctx->cq_entries - 1); + unsigned dst_head = i & (p.cq_entries - 1); + + n.rings->cqes[dst_head] = o.rings->cqes[src_head]; + } + WRITE_ONCE(n.rings->cq.head, old_head); + WRITE_ONCE(n.rings->cq.tail, tail); + /* invalidate cached cqe refill */ + ctx->cqe_cached = ctx->cqe_sentinel = NULL; + + WRITE_ONCE(n.rings->sq_dropped, READ_ONCE(o.rings->sq_dropped)); + atomic_set(&n.rings->sq_flags, atomic_read(&o.rings->sq_flags)); + WRITE_ONCE(n.rings->cq_flags, READ_ONCE(o.rings->cq_flags)); + WRITE_ONCE(n.rings->cq_overflow, READ_ONCE(o.rings->cq_overflow)); + + /* all done, store old pointers and assign new ones */ + if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) + ctx->sq_array = (u32 *)((char *)n.rings + sq_array_offset); + + ctx->sq_entries = p.sq_entries; + ctx->cq_entries = p.cq_entries; + + ctx->rings = n.rings; + ctx->sq_sqes = n.sq_sqes; + swap_old(ctx, o, n, ring_region); + swap_old(ctx, o, n, sq_region); + to_free = &o; + ret = 0; +out: + spin_unlock(&ctx->completion_lock); + mutex_unlock(&ctx->mmap_lock); + io_register_free_rings(ctx, &p, to_free); + + if (ctx->sq_data) + io_sq_thread_unpark(ctx->sq_data); + return ret; } +static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg) +{ + 
struct io_uring_mem_region_reg __user *reg_uptr = uarg; + struct io_uring_mem_region_reg reg; + struct io_uring_region_desc __user *rd_uptr; + struct io_uring_region_desc rd; + int ret; + + if (io_region_is_set(&ctx->param_region)) + return -EBUSY; + if (copy_from_user(&reg, reg_uptr, sizeof(reg))) + return -EFAULT; + rd_uptr = u64_to_user_ptr(reg.region_uptr); + if (copy_from_user(&rd, rd_uptr, sizeof(rd))) + return -EFAULT; + if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv))) + return -EINVAL; + if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG) + return -EINVAL; + + /* + * This ensures there are no waiters. Waiters are unlocked and it's + * hard to synchronise with them, especially if we need to initialise + * the region. + */ + if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) && + !(ctx->flags & IORING_SETUP_R_DISABLED)) + return -EINVAL; + + ret = io_create_region_mmap_safe(ctx, &ctx->param_region, &rd, + IORING_MAP_OFF_PARAM_REGION); + if (ret) + return ret; + if (copy_to_user(rd_uptr, &rd, sizeof(rd))) { + io_free_region(ctx, &ctx->param_region); + return -EFAULT; + } + + if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) { + ctx->cq_wait_arg = io_region_get_ptr(&ctx->param_region); + ctx->cq_wait_size = rd.size; + } + return 0; +} + static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) __releases(ctx->uring_lock) @@ -550,6 +789,42 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_register_pbuf_status(ctx, arg); break; + case IORING_REGISTER_NAPI: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_register_napi(ctx, arg); + break; + case IORING_UNREGISTER_NAPI: + ret = -EINVAL; + if (nr_args != 1) + break; + ret = io_unregister_napi(ctx, arg); + break; + case IORING_REGISTER_CLOCK: + ret = -EINVAL; + if (!arg || nr_args) + break; + ret = io_register_clock(ctx, arg); + break; + case IORING_REGISTER_CLONE_BUFFERS: + ret = -EINVAL; + if (!arg || nr_args != 1) + 
break; + ret = io_register_clone_buffers(ctx, arg); + break; + case IORING_REGISTER_RESIZE_RINGS: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_register_resize_rings(ctx, arg); + break; + case IORING_REGISTER_MEM_REGION: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_register_mem_region(ctx, arg); + break; default: ret = -EINVAL; break; @@ -558,21 +833,16 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, return ret; } -SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, - void __user *, arg, unsigned int, nr_args) +/* + * Given an 'fd' value, return the ctx associated with it. If 'registered' is + * true, then the registered index is used. Otherwise, the normal fd table. + * Caller must call fput() on the returned file, unless it's an ERR_PTR. + */ +struct file *io_uring_register_get_file(unsigned int fd, bool registered) { - struct io_ring_ctx *ctx; - long ret = -EBADF; struct file *file; - bool use_registered_ring; - use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING); - opcode &= ~IORING_REGISTER_USE_REGISTERED_RING; - - if (opcode >= IORING_REGISTER_LAST) - return -EINVAL; - - if (use_registered_ring) { + if (registered) { /* * Ring fd has been registered via IORING_REGISTER_RING_FDS, we * need only dereference our task private array to find it. 
@@ -580,28 +850,78 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, struct io_uring_task *tctx = current->io_uring; if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX)) - return -EINVAL; + return ERR_PTR(-EINVAL); fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); file = tctx->registered_rings[fd]; - if (unlikely(!file)) - return -EBADF; + if (file) + get_file(file); } else { file = fget(fd); - if (unlikely(!file)) - return -EBADF; - ret = -EOPNOTSUPP; - if (!io_is_uring_fops(file)) - goto out_fput; } + if (unlikely(!file)) + return ERR_PTR(-EBADF); + if (io_is_uring_fops(file)) + return file; + fput(file); + return ERR_PTR(-EOPNOTSUPP); +} + +/* + * "blind" registration opcodes are ones where there's no ring given, and + * hence the source fd must be -1. + */ +static int io_uring_register_blind(unsigned int opcode, void __user *arg, + unsigned int nr_args) +{ + switch (opcode) { + case IORING_REGISTER_SEND_MSG_RING: { + struct io_uring_sqe sqe; + + if (!arg || nr_args != 1) + return -EINVAL; + if (copy_from_user(&sqe, arg, sizeof(sqe))) + return -EFAULT; + /* no flags supported */ + if (sqe.flags) + return -EINVAL; + if (sqe.opcode == IORING_OP_MSG_RING) + return io_uring_sync_msg_ring(&sqe); + } + } + + return -EINVAL; +} + +SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, + void __user *, arg, unsigned int, nr_args) +{ + struct io_ring_ctx *ctx; + long ret = -EBADF; + struct file *file; + bool use_registered_ring; + + use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING); + opcode &= ~IORING_REGISTER_USE_REGISTERED_RING; + + if (opcode >= IORING_REGISTER_LAST) + return -EINVAL; + + if (fd == -1) + return io_uring_register_blind(opcode, arg, nr_args); + + file = io_uring_register_get_file(fd, use_registered_ring); + if (IS_ERR(file)) + return PTR_ERR(file); ctx = file->private_data; mutex_lock(&ctx->uring_lock); ret = __io_uring_register(ctx, opcode, arg, nr_args); + + trace_io_uring_register(ctx, 
opcode, ctx->file_table.data.nr, + ctx->buf_table.nr, ret); mutex_unlock(&ctx->uring_lock); - trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret); -out_fput: - if (!use_registered_ring) - fput(file); + + fput(file); return ret; } diff --git a/io_uring/register.h b/io_uring/register.h index c9da997d503c..a5f39d5ef9e0 100644 --- a/io_uring/register.h +++ b/io_uring/register.h @@ -4,5 +4,6 @@ int io_eventfd_unregister(struct io_ring_ctx *ctx); int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id); +struct file *io_uring_register_get_file(unsigned int fd, bool registered); #endif diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 4818b79231dd..af39b69eb4fd 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -15,6 +15,8 @@ #include "io_uring.h" #include "openclose.h" #include "rsrc.h" +#include "memmap.h" +#include "register.h" struct io_rsrc_update { struct file *file; @@ -23,21 +25,13 @@ struct io_rsrc_update { u32 offset; }; -static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); -static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, - struct io_mapped_ubuf **pimu, - struct page **last_hpage); +static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, + struct iovec *iov, struct page **last_hpage); /* only define max */ #define IORING_MAX_FIXED_FILES (1U << 20) #define IORING_MAX_REG_BUFFERS (1U << 14) -static const struct io_mapped_ubuf dummy_ubuf = { - /* set invalid range, so io_import_fixed() fails meeting it */ - .ubuf = -1UL, - .ubuf_end = 0, -}; - int __io_account_mem(struct user_struct *user, unsigned long nr_pages) { unsigned long page_limit, cur_pages, new_pages; @@ -83,31 +77,6 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) return 0; } -static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst, - void __user *arg, unsigned index) -{ - struct iovec __user *src; - -#ifdef CONFIG_COMPAT - if (ctx->compat) { 
- struct compat_iovec __user *ciovs; - struct compat_iovec ciov; - - ciovs = (struct compat_iovec __user *) arg; - if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov))) - return -EFAULT; - - dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base); - dst->iov_len = ciov.iov_len; - return 0; - } -#endif - src = (struct iovec __user *) arg; - if (copy_from_user(dst, &src[index], sizeof(*dst))) - return -EFAULT; - return 0; -} - static int io_buffer_validate(struct iovec *iov) { unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1); @@ -132,223 +101,57 @@ static int io_buffer_validate(struct iovec *iov) return 0; } -static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot) +static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { - struct io_mapped_ubuf *imu = *slot; unsigned int i; - if (imu != &dummy_ubuf) { + if (node->buf) { + struct io_mapped_ubuf *imu = node->buf; + + if (!refcount_dec_and_test(&imu->refs)) + return; for (i = 0; i < imu->nr_bvecs; i++) unpin_user_page(imu->bvec[i].bv_page); if (imu->acct_pages) io_unaccount_mem(ctx, imu->acct_pages); kvfree(imu); } - *slot = NULL; -} - -static void io_rsrc_put_work(struct io_rsrc_node *node) -{ - struct io_rsrc_put *prsrc = &node->item; - - if (prsrc->tag) - io_post_aux_cqe(node->ctx, prsrc->tag, 0, 0); - - switch (node->type) { - case IORING_RSRC_FILE: - fput(prsrc->file); - break; - case IORING_RSRC_BUFFER: - io_rsrc_buf_put(node->ctx, prsrc); - break; - default: - WARN_ON_ONCE(1); - break; - } } -void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node) +struct io_rsrc_node *io_rsrc_node_alloc(int type) { - if (!io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache)) - kfree(node); -} - -void io_rsrc_node_ref_zero(struct io_rsrc_node *node) - __must_hold(&node->ctx->uring_lock) -{ - struct io_ring_ctx *ctx = node->ctx; - - while (!list_empty(&ctx->rsrc_ref_list)) { - node = list_first_entry(&ctx->rsrc_ref_list, - struct io_rsrc_node, 
node); - /* recycle ref nodes in order */ - if (node->refs) - break; - list_del(&node->node); + struct io_rsrc_node *node; - if (likely(!node->empty)) - io_rsrc_put_work(node); - io_rsrc_node_destroy(ctx, node); + node = kzalloc(sizeof(*node), GFP_KERNEL); + if (node) { + node->type = type; + node->refs = 1; } - if (list_empty(&ctx->rsrc_ref_list) && unlikely(ctx->rsrc_quiesce)) - wake_up_all(&ctx->rsrc_quiesce_wq); + return node; } -struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) +__cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data) { - struct io_rsrc_node *ref_node; - struct io_cache_entry *entry; - - entry = io_alloc_cache_get(&ctx->rsrc_node_cache); - if (entry) { - ref_node = container_of(entry, struct io_rsrc_node, cache); - } else { - ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); - if (!ref_node) - return NULL; + if (!data->nr) + return; + while (data->nr--) { + if (data->nodes[data->nr]) + io_put_rsrc_node(ctx, data->nodes[data->nr]); } - - ref_node->ctx = ctx; - ref_node->empty = 0; - ref_node->refs = 1; - return ref_node; + kvfree(data->nodes); + data->nodes = NULL; + data->nr = 0; } -__cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, - struct io_ring_ctx *ctx) +__cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr) { - struct io_rsrc_node *backup; - DEFINE_WAIT(we); - int ret; - - /* As We may drop ->uring_lock, other task may have started quiesce */ - if (data->quiesce) - return -ENXIO; - - backup = io_rsrc_node_alloc(ctx); - if (!backup) - return -ENOMEM; - ctx->rsrc_node->empty = true; - ctx->rsrc_node->type = -1; - list_add_tail(&ctx->rsrc_node->node, &ctx->rsrc_ref_list); - io_put_rsrc_node(ctx, ctx->rsrc_node); - ctx->rsrc_node = backup; - - if (list_empty(&ctx->rsrc_ref_list)) + data->nodes = kvmalloc_array(nr, sizeof(struct io_rsrc_node *), + GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (data->nodes) { + data->nr = nr; return 0; - - if (ctx->flags & 
IORING_SETUP_DEFER_TASKRUN) { - atomic_set(&ctx->cq_wait_nr, 1); - smp_mb(); - } - - ctx->rsrc_quiesce++; - data->quiesce = true; - do { - prepare_to_wait(&ctx->rsrc_quiesce_wq, &we, TASK_INTERRUPTIBLE); - mutex_unlock(&ctx->uring_lock); - - ret = io_run_task_work_sig(ctx); - if (ret < 0) { - mutex_lock(&ctx->uring_lock); - if (list_empty(&ctx->rsrc_ref_list)) - ret = 0; - break; - } - - schedule(); - __set_current_state(TASK_RUNNING); - mutex_lock(&ctx->uring_lock); - ret = 0; - } while (!list_empty(&ctx->rsrc_ref_list)); - - finish_wait(&ctx->rsrc_quiesce_wq, &we); - data->quiesce = false; - ctx->rsrc_quiesce--; - - if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { - atomic_set(&ctx->cq_wait_nr, 0); - smp_mb(); - } - return ret; -} - -static void io_free_page_table(void **table, size_t size) -{ - unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); - - for (i = 0; i < nr_tables; i++) - kfree(table[i]); - kfree(table); -} - -static void io_rsrc_data_free(struct io_rsrc_data *data) -{ - size_t size = data->nr * sizeof(data->tags[0][0]); - - if (data->tags) - io_free_page_table((void **)data->tags, size); - kfree(data); -} - -static __cold void **io_alloc_page_table(size_t size) -{ - unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); - size_t init_size = size; - void **table; - - table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT); - if (!table) - return NULL; - - for (i = 0; i < nr_tables; i++) { - unsigned int this_size = min_t(size_t, size, PAGE_SIZE); - - table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT); - if (!table[i]) { - io_free_page_table(table, init_size); - return NULL; - } - size -= this_size; } - return table; -} - -__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, int type, - u64 __user *utags, - unsigned nr, struct io_rsrc_data **pdata) -{ - struct io_rsrc_data *data; - int ret = 0; - unsigned i; - - data = kzalloc(sizeof(*data), GFP_KERNEL); - if (!data) - return -ENOMEM; - data->tags = (u64 **)io_alloc_page_table(nr * 
sizeof(data->tags[0][0])); - if (!data->tags) { - kfree(data); - return -ENOMEM; - } - - data->nr = nr; - data->ctx = ctx; - data->rsrc_type = type; - if (utags) { - ret = -EFAULT; - for (i = 0; i < nr; i++) { - u64 *tag_slot = io_get_tag_slot(data, i); - - if (copy_from_user(tag_slot, &utags[i], - sizeof(*tag_slot))) - goto fail; - } - } - *pdata = data; - return 0; -fail: - io_rsrc_data_free(data); - return ret; + return -ENOMEM; } static int __io_sqe_files_update(struct io_ring_ctx *ctx, @@ -357,14 +160,12 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, { u64 __user *tags = u64_to_user_ptr(up->tags); __s32 __user *fds = u64_to_user_ptr(up->data); - struct io_rsrc_data *data = ctx->file_data; - struct io_fixed_file *file_slot; int fd, i, err = 0; unsigned int done; - if (!ctx->file_data) + if (!ctx->file_table.data.nr) return -ENXIO; - if (up->offset + nr_args > ctx->nr_user_files) + if (up->offset + nr_args > ctx->file_table.data.nr) return -EINVAL; for (done = 0; done < nr_args; done++) { @@ -382,19 +183,13 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, if (fd == IORING_REGISTER_FILES_SKIP) continue; - i = array_index_nospec(up->offset + done, ctx->nr_user_files); - file_slot = io_fixed_file_slot(&ctx->file_table, i); - - if (file_slot->file_ptr) { - err = io_queue_rsrc_removal(data, i, - io_slot_file(file_slot)); - if (err) - break; - file_slot->file_ptr = 0; + i = up->offset + done; + if (io_reset_rsrc_node(ctx, &ctx->file_table.data, i)) io_file_bitmap_clear(&ctx->file_table, i); - } + if (fd != -1) { struct file *file = fget(fd); + struct io_rsrc_node *node; if (!file) { err = -EBADF; @@ -408,8 +203,16 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, err = -EBADF; break; } - *io_get_tag_slot(data, i) = tag; - io_fixed_file_set(file_slot, file); + node = io_rsrc_node_alloc(IORING_RSRC_FILE); + if (!node) { + err = -ENOMEM; + fput(file); + break; + } + ctx->file_table.data.nodes[i] = node; + if (tag) + node->tag = tag; + 
io_fixed_file_set(node, file); io_file_bitmap_set(&ctx->file_table, i); } } @@ -421,51 +224,54 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, unsigned int nr_args) { u64 __user *tags = u64_to_user_ptr(up->tags); - struct iovec iov, __user *iovs = u64_to_user_ptr(up->data); + struct iovec fast_iov, *iov; struct page *last_hpage = NULL; + struct iovec __user *uvec; + u64 user_data = up->data; __u32 done; int i, err; - if (!ctx->buf_data) + if (!ctx->buf_table.nr) return -ENXIO; - if (up->offset + nr_args > ctx->nr_user_bufs) + if (up->offset + nr_args > ctx->buf_table.nr) return -EINVAL; for (done = 0; done < nr_args; done++) { - struct io_mapped_ubuf *imu; + struct io_rsrc_node *node; u64 tag = 0; - err = io_copy_iov(ctx, &iov, iovs, done); - if (err) + uvec = u64_to_user_ptr(user_data); + iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat); + if (IS_ERR(iov)) { + err = PTR_ERR(iov); break; + } if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) { err = -EFAULT; break; } - err = io_buffer_validate(&iov); + err = io_buffer_validate(iov); if (err) break; - if (!iov.iov_base && tag) { - err = -EINVAL; + node = io_sqe_buffer_register(ctx, iov, &last_hpage); + if (IS_ERR(node)) { + err = PTR_ERR(node); break; } - err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage); - if (err) - break; - - i = array_index_nospec(up->offset + done, ctx->nr_user_bufs); - if (ctx->user_bufs[i] != &dummy_ubuf) { - err = io_queue_rsrc_removal(ctx->buf_data, i, - ctx->user_bufs[i]); - if (unlikely(err)) { - io_buffer_unmap(ctx, &imu); + if (tag) { + if (!node) { + err = -EINVAL; break; } - ctx->user_bufs[i] = (struct io_mapped_ubuf *)&dummy_ubuf; + node->tag = tag; } - - ctx->user_bufs[i] = imu; - *io_get_tag_slot(ctx->buf_data, i) = tag; + i = array_index_nospec(up->offset + done, ctx->buf_table.nr); + io_reset_rsrc_node(ctx, &ctx->buf_table, i); + ctx->buf_table.nodes[i] = node; + if (ctx->compat) + user_data += sizeof(struct compat_iovec); + else + 
user_data += sizeof(struct iovec); } return done ? done : err; } @@ -577,7 +383,7 @@ static int io_files_update_with_index_alloc(struct io_kiocb *req, struct file *file; int ret, fd; - if (!req->ctx->file_data) + if (!req->ctx->file_table.data.nr) return -ENXIO; for (done = 0; done < up->nr_args; done++) { @@ -636,65 +442,36 @@ int io_files_update(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } -int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc) +void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { - struct io_ring_ctx *ctx = data->ctx; - struct io_rsrc_node *node = ctx->rsrc_node; - u64 *tag_slot = io_get_tag_slot(data, idx); - - ctx->rsrc_node = io_rsrc_node_alloc(ctx); - if (unlikely(!ctx->rsrc_node)) { - ctx->rsrc_node = node; - return -ENOMEM; - } - - node->item.rsrc = rsrc; - node->type = data->rsrc_type; - node->item.tag = *tag_slot; - *tag_slot = 0; - list_add_tail(&node->node, &ctx->rsrc_ref_list); - io_put_rsrc_node(ctx, node); - return 0; -} + if (node->tag) + io_post_aux_cqe(ctx, node->tag, 0, 0); -void __io_sqe_files_unregister(struct io_ring_ctx *ctx) -{ - int i; - - for (i = 0; i < ctx->nr_user_files; i++) { - struct file *file = io_file_from_index(&ctx->file_table, i); - - if (!file) - continue; - io_file_bitmap_clear(&ctx->file_table, i); - fput(file); + switch (node->type) { + case IORING_RSRC_FILE: + if (io_slot_file(node)) + fput(io_slot_file(node)); + break; + case IORING_RSRC_BUFFER: + if (node->buf) + io_buffer_unmap(ctx, node); + break; + default: + WARN_ON_ONCE(1); + break; } - io_free_file_tables(&ctx->file_table); - io_file_table_set_alloc_range(ctx, 0, 0); - io_rsrc_data_free(ctx->file_data); - ctx->file_data = NULL; - ctx->nr_user_files = 0; + kfree(node); } int io_sqe_files_unregister(struct io_ring_ctx *ctx) { - unsigned nr = ctx->nr_user_files; - int ret; - - if (!ctx->file_data) + if (!ctx->file_table.data.nr) return -ENXIO; - /* - * Quiesce may unlock 
->uring_lock, and while it's not held - * prevent new requests using the table. - */ - ctx->nr_user_files = 0; - ret = io_rsrc_ref_quiesce(ctx->file_data, ctx); - ctx->nr_user_files = nr; - if (!ret) - __io_sqe_files_unregister(ctx); - return ret; + io_free_file_tables(ctx, &ctx->file_table); + io_file_table_set_alloc_range(ctx, 0, 0); + return 0; } int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, @@ -705,7 +482,7 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, int fd, ret; unsigned i; - if (ctx->file_data) + if (ctx->file_table.data.nr) return -EBUSY; if (!nr_args) return -EINVAL; @@ -713,28 +490,22 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return -EMFILE; if (nr_args > rlimit(RLIMIT_NOFILE)) return -EMFILE; - ret = io_rsrc_data_alloc(ctx, IORING_RSRC_FILE, tags, nr_args, - &ctx->file_data); - if (ret) - return ret; - - if (!io_alloc_file_tables(&ctx->file_table, nr_args)) { - io_rsrc_data_free(ctx->file_data); - ctx->file_data = NULL; + if (!io_alloc_file_tables(ctx, &ctx->file_table, nr_args)) return -ENOMEM; - } - for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { - struct io_fixed_file *file_slot; + for (i = 0; i < nr_args; i++) { + struct io_rsrc_node *node; + u64 tag = 0; - if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) { - ret = -EFAULT; + ret = -EFAULT; + if (tags && copy_from_user(&tag, &tags[i], sizeof(tag))) + goto fail; + if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) goto fail; - } /* allow sparse sets */ if (!fds || fd == -1) { ret = -EINVAL; - if (unlikely(*io_get_tag_slot(ctx->file_data, i))) + if (tag) goto fail; continue; } @@ -751,56 +522,33 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, fput(file); goto fail; } - file_slot = io_fixed_file_slot(&ctx->file_table, i); - io_fixed_file_set(file_slot, file); + ret = -ENOMEM; + node = io_rsrc_node_alloc(IORING_RSRC_FILE); + if (!node) { + fput(file); + goto fail; + } + if (tag) + node->tag 
= tag; + ctx->file_table.data.nodes[i] = node; + io_fixed_file_set(node, file); io_file_bitmap_set(&ctx->file_table, i); } /* default it to the whole table */ - io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files); + io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr); return 0; fail: - __io_sqe_files_unregister(ctx); + io_sqe_files_unregister(ctx); return ret; } -static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) -{ - io_buffer_unmap(ctx, &prsrc->buf); - prsrc->buf = NULL; -} - -void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx) -{ - unsigned int i; - - for (i = 0; i < ctx->nr_user_bufs; i++) - io_buffer_unmap(ctx, &ctx->user_bufs[i]); - kfree(ctx->user_bufs); - io_rsrc_data_free(ctx->buf_data); - ctx->user_bufs = NULL; - ctx->buf_data = NULL; - ctx->nr_user_bufs = 0; -} - int io_sqe_buffers_unregister(struct io_ring_ctx *ctx) { - unsigned nr = ctx->nr_user_bufs; - int ret; - - if (!ctx->buf_data) + if (!ctx->buf_table.nr) return -ENXIO; - - /* - * Quiesce may unlock ->uring_lock, and while it's not held - * prevent new requests using the table. 
- */ - ctx->nr_user_bufs = 0; - ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx); - ctx->nr_user_bufs = nr; - if (!ret) - __io_sqe_buffers_unregister(ctx); - return ret; + io_rsrc_data_free(ctx, &ctx->buf_table); + return 0; } /* @@ -826,9 +574,13 @@ static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages, } /* check previously registered pages */ - for (i = 0; i < ctx->nr_user_bufs; i++) { - struct io_mapped_ubuf *imu = ctx->user_bufs[i]; + for (i = 0; i < ctx->buf_table.nr; i++) { + struct io_rsrc_node *node = ctx->buf_table.nodes[i]; + struct io_mapped_ubuf *imu; + if (!node) + continue; + imu = node->buf; for (j = 0; j < imu->nr_bvecs; j++) { if (!PageCompound(imu->bvec[j].bv_page)) continue; @@ -872,56 +624,114 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, return ret; } -struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages) +static bool io_coalesce_buffer(struct page ***pages, int *nr_pages, + struct io_imu_folio_data *data) { - unsigned long start, end, nr_pages; - struct page **pages = NULL; - int ret; + struct page **page_array = *pages, **new_array = NULL; + int nr_pages_left = *nr_pages, i, j; + int nr_folios = data->nr_folios; - end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - start = ubuf >> PAGE_SHIFT; - nr_pages = end - start; - WARN_ON(!nr_pages); + /* Store head pages only*/ + new_array = kvmalloc_array(nr_folios, sizeof(struct page *), + GFP_KERNEL); + if (!new_array) + return false; - pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); - if (!pages) - return ERR_PTR(-ENOMEM); + new_array[0] = compound_head(page_array[0]); + /* + * The pages are bound to the folio, it doesn't + * actually unpin them but drops all but one reference, + * which is usually put down by io_buffer_unmap(). + * Note, needs a better helper. 
+ */ + if (data->nr_pages_head > 1) + unpin_user_pages(&page_array[1], data->nr_pages_head - 1); + + j = data->nr_pages_head; + nr_pages_left -= data->nr_pages_head; + for (i = 1; i < nr_folios; i++) { + unsigned int nr_unpin; + + new_array[i] = page_array[j]; + nr_unpin = min_t(unsigned int, nr_pages_left - 1, + data->nr_pages_mid - 1); + if (nr_unpin) + unpin_user_pages(&page_array[j+1], nr_unpin); + j += data->nr_pages_mid; + nr_pages_left -= data->nr_pages_mid; + } + kvfree(page_array); + *pages = new_array; + *nr_pages = nr_folios; + return true; +} - mmap_read_lock(current->mm); - ret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, pages); - mmap_read_unlock(current->mm); +bool io_check_coalesce_buffer(struct page **page_array, int nr_pages, + struct io_imu_folio_data *data) +{ + struct folio *folio = page_folio(page_array[0]); + unsigned int count = 1, nr_folios = 1; + int i; - /* success, mapped all pages */ - if (ret == nr_pages) { - *npages = nr_pages; - return pages; - } + data->nr_pages_mid = folio_nr_pages(folio); + data->folio_shift = folio_shift(folio); - /* partial map, or didn't map anything */ - if (ret >= 0) { - /* if we did partial map, release any pages we did get */ - if (ret) - unpin_user_pages(pages, ret); - ret = -EFAULT; + /* + * Check if pages are contiguous inside a folio, and all folios have + * the same page count except for the head and tail. 
+ */ + for (i = 1; i < nr_pages; i++) { + if (page_folio(page_array[i]) == folio && + page_array[i] == page_array[i-1] + 1) { + count++; + continue; + } + + if (nr_folios == 1) { + if (folio_page_idx(folio, page_array[i-1]) != + data->nr_pages_mid - 1) + return false; + + data->nr_pages_head = count; + } else if (count != data->nr_pages_mid) { + return false; + } + + folio = page_folio(page_array[i]); + if (folio_size(folio) != (1UL << data->folio_shift) || + folio_page_idx(folio, page_array[i]) != 0) + return false; + + count = 1; + nr_folios++; } - kvfree(pages); - return ERR_PTR(ret); + if (nr_folios == 1) + data->nr_pages_head = count; + + data->nr_folios = nr_folios; + return true; } -static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, - struct io_mapped_ubuf **pimu, - struct page **last_hpage) +static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, + struct iovec *iov, + struct page **last_hpage) { struct io_mapped_ubuf *imu = NULL; struct page **pages = NULL; + struct io_rsrc_node *node; unsigned long off; size_t size; int ret, nr_pages, i; - struct folio *folio = NULL; + struct io_imu_folio_data data; + bool coalesced = false; - *pimu = (struct io_mapped_ubuf *)&dummy_ubuf; if (!iov->iov_base) - return 0; + return NULL; + + node = io_rsrc_node_alloc(IORING_RSRC_BUFFER); + if (!node) + return ERR_PTR(-ENOMEM); + node->buf = NULL; ret = -ENOMEM; pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len, @@ -932,30 +742,10 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, goto done; } - /* If it's a huge page, try to coalesce them into a single bvec entry */ - if (nr_pages > 1) { - folio = page_folio(pages[0]); - for (i = 1; i < nr_pages; i++) { - /* - * Pages must be consecutive and on the same folio for - * this to work - */ - if (page_folio(pages[i]) != folio || - pages[i] != pages[i - 1] + 1) { - folio = NULL; - break; - } - } - if (folio) { - /* - * The pages are bound to 
the folio, it doesn't - * actually unpin them but drops all but one reference, - * which is usually put down by io_buffer_unmap(). - * Note, needs a better helper. - */ - unpin_user_pages(&pages[1], nr_pages - 1); - nr_pages = 1; - } + /* If it's huge page(s), try to coalesce them into fewer bvec entries */ + if (nr_pages > 1 && io_check_coalesce_buffer(pages, nr_pages, &data)) { + if (data.nr_pages_mid != 1) + coalesced = io_coalesce_buffer(&pages, &nr_pages, &data); } imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); @@ -968,91 +758,105 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, goto done; } - off = (unsigned long) iov->iov_base & ~PAGE_MASK; size = iov->iov_len; /* store original address for later verification */ imu->ubuf = (unsigned long) iov->iov_base; - imu->ubuf_end = imu->ubuf + iov->iov_len; + imu->len = iov->iov_len; imu->nr_bvecs = nr_pages; - *pimu = imu; + imu->folio_shift = PAGE_SHIFT; + if (coalesced) + imu->folio_shift = data.folio_shift; + refcount_set(&imu->refs, 1); + off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1); + node->buf = imu; ret = 0; - if (folio) { - bvec_set_page(&imu->bvec[0], pages[0], size, off); - goto done; - } for (i = 0; i < nr_pages; i++) { size_t vec_len; - vec_len = min_t(size_t, size, PAGE_SIZE - off); + vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off); bvec_set_page(&imu->bvec[i], pages[i], vec_len, off); off = 0; size -= vec_len; } done: - if (ret) + if (ret) { kvfree(imu); + if (node) + io_put_rsrc_node(ctx, node); + node = ERR_PTR(ret); + } kvfree(pages); - return ret; -} - -static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args) -{ - ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL); - return ctx->user_bufs ? 
0 : -ENOMEM; + return node; } int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, unsigned int nr_args, u64 __user *tags) { struct page *last_hpage = NULL; - struct io_rsrc_data *data; + struct io_rsrc_data data; + struct iovec fast_iov, *iov = &fast_iov; + const struct iovec __user *uvec; int i, ret; - struct iovec iov; BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); - if (ctx->user_bufs) + if (ctx->buf_table.nr) return -EBUSY; if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS) return -EINVAL; - ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, tags, nr_args, &data); + ret = io_rsrc_data_alloc(&data, nr_args); if (ret) return ret; - ret = io_buffers_map_alloc(ctx, nr_args); - if (ret) { - io_rsrc_data_free(data); - return ret; - } - for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) { + if (!arg) + memset(iov, 0, sizeof(*iov)); + + for (i = 0; i < nr_args; i++) { + struct io_rsrc_node *node; + u64 tag = 0; + if (arg) { - ret = io_copy_iov(ctx, &iov, arg, i); - if (ret) + uvec = (struct iovec __user *) arg; + iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat); + if (IS_ERR(iov)) { + ret = PTR_ERR(iov); break; - ret = io_buffer_validate(&iov); + } + ret = io_buffer_validate(iov); if (ret) break; - } else { - memset(&iov, 0, sizeof(iov)); + if (ctx->compat) + arg += sizeof(struct compat_iovec); + else + arg += sizeof(struct iovec); } - if (!iov.iov_base && *io_get_tag_slot(data, i)) { - ret = -EINVAL; - break; + if (tags) { + if (copy_from_user(&tag, &tags[i], sizeof(tag))) { + ret = -EFAULT; + break; + } } - ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i], - &last_hpage); - if (ret) + node = io_sqe_buffer_register(ctx, iov, &last_hpage); + if (IS_ERR(node)) { + ret = PTR_ERR(node); break; + } + if (tag) { + if (!node) { + ret = -EINVAL; + break; + } + node->tag = tag; + } + data.nodes[i] = node; } - WARN_ON_ONCE(ctx->buf_data); - - ctx->buf_data = data; + ctx->buf_table = data; if (ret) - __io_sqe_buffers_unregister(ctx); + 
io_sqe_buffers_unregister(ctx); return ret; } @@ -1068,7 +872,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter, if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) return -EFAULT; /* not inside the mapped region */ - if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end)) + if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len))) return -EFAULT; /* @@ -1076,7 +880,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter, * and advance us to the beginning. */ offset = buf_addr - imu->ubuf; - iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len); + iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, len); if (offset) { /* @@ -1086,40 +890,193 @@ int io_import_fixed(int ddir, struct iov_iter *iter, * we know that: * * 1) it's a BVEC iter, we set it up - * 2) all bvecs are PAGE_SIZE in size, except potentially the + * 2) all bvecs are the same in size, except potentially the * first and last bvec * * So just find our index, and adjust the iterator afterwards. * If the offset is within the first bvec (or the whole first * bvec, just use iov_iter_advance(). This makes it easier * since we can just skip the first segment, which may not - * be PAGE_SIZE aligned. + * be folio_size aligned. */ const struct bio_vec *bvec = imu->bvec; if (offset < bvec->bv_len) { - /* - * Note, huge pages buffers consists of one large - * bvec entry and should always go this way. The other - * branch doesn't expect non PAGE_SIZE'd chunks. 
- */ - iter->bvec = bvec; - iter->nr_segs = bvec->bv_len; - iter->count -= offset; iter->iov_offset = offset; } else { unsigned long seg_skip; /* skip first vec */ offset -= bvec->bv_len; - seg_skip = 1 + (offset >> PAGE_SHIFT); + seg_skip = 1 + (offset >> imu->folio_shift); - iter->bvec = bvec + seg_skip; + iter->bvec += seg_skip; iter->nr_segs -= seg_skip; - iter->count -= bvec->bv_len + offset; - iter->iov_offset = offset & ~PAGE_MASK; + iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1); } } return 0; } + +/* Lock two rings at once. The rings must be different! */ +static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2) +{ + if (ctx1 > ctx2) + swap(ctx1, ctx2); + mutex_lock(&ctx1->uring_lock); + mutex_lock_nested(&ctx2->uring_lock, SINGLE_DEPTH_NESTING); +} + +/* Both rings are locked by the caller. */ +static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx, + struct io_uring_clone_buffers *arg) +{ + struct io_rsrc_data data; + int i, ret, off, nr; + unsigned int nbufs; + + lockdep_assert_held(&ctx->uring_lock); + lockdep_assert_held(&src_ctx->uring_lock); + + /* + * Accounting state is shared between the two rings; that only works if + * both rings are accounted towards the same counters. 
+ */ + if (ctx->user != src_ctx->user || ctx->mm_account != src_ctx->mm_account) + return -EINVAL; + + /* if offsets are given, must have nr specified too */ + if (!arg->nr && (arg->dst_off || arg->src_off)) + return -EINVAL; + /* not allowed unless REPLACE is set */ + if (ctx->buf_table.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE)) + return -EBUSY; + + nbufs = src_ctx->buf_table.nr; + if (!arg->nr) + arg->nr = nbufs; + else if (arg->nr > nbufs) + return -EINVAL; + else if (arg->nr > IORING_MAX_REG_BUFFERS) + return -EINVAL; + if (check_add_overflow(arg->nr, arg->dst_off, &nbufs)) + return -EOVERFLOW; + + ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr)); + if (ret) + return ret; + + /* Fill entries in data from dst that won't overlap with src */ + for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) { + struct io_rsrc_node *src_node = ctx->buf_table.nodes[i]; + + if (src_node) { + data.nodes[i] = src_node; + src_node->refs++; + } + } + + ret = -ENXIO; + nbufs = src_ctx->buf_table.nr; + if (!nbufs) + goto out_free; + ret = -EINVAL; + if (!arg->nr) + arg->nr = nbufs; + else if (arg->nr > nbufs) + goto out_free; + ret = -EOVERFLOW; + if (check_add_overflow(arg->nr, arg->src_off, &off)) + goto out_free; + if (off > nbufs) + goto out_free; + + off = arg->dst_off; + i = arg->src_off; + nr = arg->nr; + while (nr--) { + struct io_rsrc_node *dst_node, *src_node; + + src_node = io_rsrc_node_lookup(&src_ctx->buf_table, i); + if (!src_node) { + dst_node = NULL; + } else { + dst_node = io_rsrc_node_alloc(IORING_RSRC_BUFFER); + if (!dst_node) { + ret = -ENOMEM; + goto out_free; + } + + refcount_inc(&src_node->buf->refs); + dst_node->buf = src_node->buf; + } + data.nodes[off++] = dst_node; + i++; + } + + /* + * If asked for replace, put the old table. data->nodes[] holds both + * old and new nodes at this point. 
+ */ + if (arg->flags & IORING_REGISTER_DST_REPLACE) + io_rsrc_data_free(ctx, &ctx->buf_table); + + /* + * ctx->buf_table must be empty now - either the contents are being + * replaced and we just freed the table, or the contents are being + * copied to a ring that does not have buffers yet (checked at function + * entry). + */ + WARN_ON_ONCE(ctx->buf_table.nr); + ctx->buf_table = data; + return 0; + +out_free: + io_rsrc_data_free(ctx, &data); + return ret; +} + +/* + * Copy the registered buffers from the source ring whose file descriptor + * is given in the src_fd to the current ring. This is identical to registering + * the buffers with ctx, except faster as mappings already exist. + * + * Since the memory is already accounted once, don't account it again. + */ +int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg) +{ + struct io_uring_clone_buffers buf; + struct io_ring_ctx *src_ctx; + bool registered_src; + struct file *file; + int ret; + + if (copy_from_user(&buf, arg, sizeof(buf))) + return -EFAULT; + if (buf.flags & ~(IORING_REGISTER_SRC_REGISTERED|IORING_REGISTER_DST_REPLACE)) + return -EINVAL; + if (!(buf.flags & IORING_REGISTER_DST_REPLACE) && ctx->buf_table.nr) + return -EBUSY; + if (memchr_inv(buf.pad, 0, sizeof(buf.pad))) + return -EINVAL; + + registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0; + file = io_uring_register_get_file(buf.src_fd, registered_src); + if (IS_ERR(file)) + return PTR_ERR(file); + + src_ctx = file->private_data; + if (src_ctx != ctx) { + mutex_unlock(&ctx->uring_lock); + lock_two_rings(ctx, src_ctx); + } + + ret = io_clone_buffers(ctx, src_ctx, &buf); + + if (src_ctx != ctx) + mutex_unlock(&src_ctx->uring_lock); + + fput(file); + return ret; +} diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index c6f199bbee28..89ea0135a1a0 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -2,75 +2,56 @@ #ifndef IOU_RSRC_H #define IOU_RSRC_H -#include <net/af_unix.h> - -#include "alloc_cache.h" - -#define 
IO_NODE_ALLOC_CACHE_MAX 32 - -#define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3) -#define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT) -#define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1) +#include <linux/lockdep.h> enum { IORING_RSRC_FILE = 0, IORING_RSRC_BUFFER = 1, }; -struct io_rsrc_put { +struct io_rsrc_node { + unsigned char type; + int refs; + u64 tag; union { - void *rsrc; - struct file *file; + unsigned long file_ptr; struct io_mapped_ubuf *buf; }; }; -typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); - -struct io_rsrc_data { - struct io_ring_ctx *ctx; - - u64 **tags; - unsigned int nr; - u16 rsrc_type; - bool quiesce; -}; - -struct io_rsrc_node { - union { - struct io_cache_entry cache; - struct io_ring_ctx *ctx; - }; - int refs; - bool empty; - u16 type; - struct list_head node; - struct io_rsrc_put item; -}; - struct io_mapped_ubuf { u64 ubuf; - u64 ubuf_end; + unsigned int len; unsigned int nr_bvecs; + unsigned int folio_shift; + refcount_t refs; unsigned long acct_pages; struct bio_vec bvec[] __counted_by(nr_bvecs); }; -void io_rsrc_node_ref_zero(struct io_rsrc_node *node); -void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node); -struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx); -int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc); +struct io_imu_folio_data { + /* Head folio can be partially included in the fixed buf */ + unsigned int nr_pages_head; + /* For non-head/tail folios, has to be fully included */ + unsigned int nr_pages_mid; + unsigned int folio_shift; + unsigned int nr_folios; +}; + +struct io_rsrc_node *io_rsrc_node_alloc(int type); +void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node); +void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data); +int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr); int io_import_fixed(int ddir, struct iov_iter *iter, struct io_mapped_ubuf 
*imu, u64 buf_addr, size_t len); -void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx); +int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg); int io_sqe_buffers_unregister(struct io_ring_ctx *ctx); int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, unsigned int nr_args, u64 __user *tags); -void __io_sqe_files_unregister(struct io_ring_ctx *ctx); int io_sqe_files_unregister(struct io_ring_ctx *ctx); int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args, u64 __user *tags); @@ -82,57 +63,60 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, unsigned int size, unsigned int type); -static inline void io_put_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) -{ - lockdep_assert_held(&ctx->uring_lock); - - if (node && !--node->refs) - io_rsrc_node_ref_zero(node); -} +bool io_check_coalesce_buffer(struct page **page_array, int nr_pages, + struct io_imu_folio_data *data); -static inline void io_req_put_rsrc_locked(struct io_kiocb *req, - struct io_ring_ctx *ctx) +static inline struct io_rsrc_node *io_rsrc_node_lookup(struct io_rsrc_data *data, + int index) { - io_put_rsrc_node(ctx, req->rsrc_node); + if (index < data->nr) + return data->nodes[array_index_nospec(index, data->nr)]; + return NULL; } -static inline void io_charge_rsrc_node(struct io_ring_ctx *ctx, - struct io_rsrc_node *node) +static inline void io_put_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { - node->refs++; + lockdep_assert_held(&ctx->uring_lock); + if (node && !--node->refs) + io_free_rsrc_node(ctx, node); } -static inline void __io_req_set_rsrc_node(struct io_kiocb *req, - struct io_ring_ctx *ctx) +static inline bool io_reset_rsrc_node(struct io_ring_ctx *ctx, + struct io_rsrc_data *data, int index) { - lockdep_assert_held(&ctx->uring_lock); - req->rsrc_node = ctx->rsrc_node; - io_charge_rsrc_node(ctx, 
ctx->rsrc_node); + struct io_rsrc_node *node = data->nodes[index]; + + if (!node) + return false; + io_put_rsrc_node(ctx, node); + data->nodes[index] = NULL; + return true; } -static inline void io_req_set_rsrc_node(struct io_kiocb *req, - struct io_ring_ctx *ctx, - unsigned int issue_flags) +static inline void io_req_put_rsrc_nodes(struct io_kiocb *req) { - if (!req->rsrc_node) { - io_ring_submit_lock(ctx, issue_flags); - __io_req_set_rsrc_node(req, ctx); - io_ring_submit_unlock(ctx, issue_flags); + if (req->file_node) { + io_put_rsrc_node(req->ctx, req->file_node); + req->file_node = NULL; + } + if (req->flags & REQ_F_BUF_NODE) { + io_put_rsrc_node(req->ctx, req->buf_node); + req->buf_node = NULL; } } -static inline u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx) +static inline void io_req_assign_rsrc_node(struct io_rsrc_node **dst_node, + struct io_rsrc_node *node) { - unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK; - unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT; - - return &data->tags[table_idx][off]; + node->refs++; + *dst_node = node; } -static inline int io_rsrc_init(struct io_ring_ctx *ctx) +static inline void io_req_assign_buf_node(struct io_kiocb *req, + struct io_rsrc_node *node) { - ctx->rsrc_node = io_rsrc_node_alloc(ctx); - return ctx->rsrc_node ? 
0 : -ENOMEM; + io_req_assign_rsrc_node(&req->buf_node, node); + req->flags |= REQ_F_BUF_NODE; } int io_files_update(struct io_kiocb *req, unsigned int issue_flags); diff --git a/io_uring/rw.c b/io_uring/rw.c index d5e79d9bdc71..e5528cebcd06 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -11,16 +11,21 @@ #include <linux/nospec.h> #include <linux/compat.h> #include <linux/io_uring/cmd.h> +#include <linux/indirect_call_wrapper.h> #include <uapi/linux/io_uring.h> #include "io_uring.h" #include "opdef.h" #include "kbuf.h" +#include "alloc_cache.h" #include "rsrc.h" #include "poll.h" #include "rw.h" +static void io_complete_rw(struct kiocb *kiocb, long res); +static void io_complete_rw_iopoll(struct kiocb *kiocb, long res); + struct io_rw { /* NOTE: kiocb has the file as the first member, so don't do it here */ struct kiocb kiocb; @@ -29,9 +34,19 @@ struct io_rw { rwf_t flags; }; -static inline bool io_file_supports_nowait(struct io_kiocb *req) +static bool io_file_supports_nowait(struct io_kiocb *req, __poll_t mask) { - return req->flags & REQ_F_SUPPORT_NOWAIT; + /* If FMODE_NOWAIT is set for a file, we're golden */ + if (req->flags & REQ_F_SUPPORT_NOWAIT) + return true; + /* No FMODE_NOWAIT, if we can poll, check the status */ + if (io_file_can_poll(req)) { + struct poll_table_struct pt = { ._key = mask }; + + return vfs_poll(req->file, &pt) & mask; + } + /* No FMODE_NOWAIT support, and file isn't pollable. Tough luck. 
*/ + return false; } #ifdef CONFIG_COMPAT @@ -74,10 +89,190 @@ static int io_iov_buffer_select_prep(struct io_kiocb *req) return 0; } -int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int __io_import_iovec(int ddir, struct io_kiocb *req, + struct io_async_rw *io, + unsigned int issue_flags) +{ + const struct io_issue_def *def = &io_issue_defs[req->opcode]; + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + struct iovec *iov; + void __user *buf; + int nr_segs, ret; + size_t sqe_len; + + buf = u64_to_user_ptr(rw->addr); + sqe_len = rw->len; + + if (!def->vectored || req->flags & REQ_F_BUFFER_SELECT) { + if (io_do_buffer_select(req)) { + buf = io_buffer_select(req, &sqe_len, issue_flags); + if (!buf) + return -ENOBUFS; + rw->addr = (unsigned long) buf; + rw->len = sqe_len; + } + + return import_ubuf(ddir, buf, sqe_len, &io->iter); + } + + if (io->free_iovec) { + nr_segs = io->free_iov_nr; + iov = io->free_iovec; + } else { + iov = &io->fast_iov; + nr_segs = 1; + } + ret = __import_iovec(ddir, buf, sqe_len, nr_segs, &iov, &io->iter, + req->ctx->compat); + if (unlikely(ret < 0)) + return ret; + if (iov) { + req->flags |= REQ_F_NEED_CLEANUP; + io->free_iov_nr = io->iter.nr_segs; + kfree(io->free_iovec); + io->free_iovec = iov; + } + return 0; +} + +static inline int io_import_iovec(int rw, struct io_kiocb *req, + struct io_async_rw *io, + unsigned int issue_flags) +{ + int ret; + + ret = __io_import_iovec(rw, req, io, issue_flags); + if (unlikely(ret < 0)) + return ret; + + iov_iter_save_state(&io->iter, &io->iter_state); + return 0; +} + +static void io_rw_recycle(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_async_rw *rw = req->async_data; + + if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) + return; + + io_alloc_cache_kasan(&rw->free_iovec, &rw->free_iov_nr); + if (io_alloc_cache_put(&req->ctx->rw_cache, rw)) { + req->async_data = NULL; + req->flags &= ~REQ_F_ASYNC_DATA; + } +} + +static void 
io_req_rw_cleanup(struct io_kiocb *req, unsigned int issue_flags) +{ + /* + * Disable quick recycling for anything that's gone through io-wq. + * In theory, this should be fine to cleanup. However, some read or + * write iter handling touches the iovec AFTER having called into the + * handler, eg to reexpand or revert. This means we can have: + * + * task io-wq + * issue + * punt to io-wq + * issue + * blkdev_write_iter() + * ->ki_complete() + * io_complete_rw() + * queue tw complete + * run tw + * req_rw_cleanup + * iov_iter_count() <- look at iov_iter again + * + * which can lead to a UAF. This is only possible for io-wq offload + * as the cleanup can run in parallel. As io-wq is not the fast path, + * just leave cleanup to the end. + * + * This is really a bug in the core code that does this, any issue + * path should assume that a successful (or -EIOCBQUEUED) return can + * mean that the underlying data can be gone at any time. But that + * should be fixed seperately, and then this check could be killed. 
+ */ + if (!(req->flags & (REQ_F_REISSUE | REQ_F_REFCOUNT))) { + req->flags &= ~REQ_F_NEED_CLEANUP; + io_rw_recycle(req, issue_flags); + } +} + +static int io_rw_alloc_async(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_async_rw *rw; + + rw = io_uring_alloc_async_data(&ctx->rw_cache, req); + if (!rw) + return -ENOMEM; + if (rw->free_iovec) + req->flags |= REQ_F_NEED_CLEANUP; + rw->bytes_done = 0; + return 0; +} + +static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import) +{ + struct io_async_rw *rw; + + if (io_rw_alloc_async(req)) + return -ENOMEM; + + if (!do_import || io_do_buffer_select(req)) + return 0; + + rw = req->async_data; + return io_import_iovec(ddir, req, rw, 0); +} + +static inline void io_meta_save_state(struct io_async_rw *io) +{ + io->meta_state.seed = io->meta.seed; + iov_iter_save_state(&io->meta.iter, &io->meta_state.iter_meta); +} + +static inline void io_meta_restore(struct io_async_rw *io, struct kiocb *kiocb) +{ + if (kiocb->ki_flags & IOCB_HAS_METADATA) { + io->meta.seed = io->meta_state.seed; + iov_iter_restore(&io->meta.iter, &io->meta_state.iter_meta); + } +} + +static int io_prep_rw_pi(struct io_kiocb *req, struct io_rw *rw, int ddir, + u64 attr_ptr, u64 attr_type_mask) +{ + struct io_uring_attr_pi pi_attr; + struct io_async_rw *io; + int ret; + + if (copy_from_user(&pi_attr, u64_to_user_ptr(attr_ptr), + sizeof(pi_attr))) + return -EFAULT; + + if (pi_attr.rsvd) + return -EINVAL; + + io = req->async_data; + io->meta.flags = pi_attr.flags; + io->meta.app_tag = pi_attr.app_tag; + io->meta.seed = pi_attr.seed; + ret = import_ubuf(ddir, u64_to_user_ptr(pi_attr.addr), + pi_attr.len, &io->meta.iter); + if (unlikely(ret < 0)) + return ret; + req->flags |= REQ_F_HAS_METADATA; + io_meta_save_state(io); + return ret; +} + +static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, + int ddir, bool do_import) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); unsigned ioprio; + 
u64 attr_type_mask; int ret; rw->kiocb.ki_pos = READ_ONCE(sqe->off); @@ -95,47 +290,106 @@ int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) rw->kiocb.ki_ioprio = get_current_ioprio(); } rw->kiocb.dio_complete = NULL; + rw->kiocb.ki_flags = 0; + + if (req->ctx->flags & IORING_SETUP_IOPOLL) + rw->kiocb.ki_complete = io_complete_rw_iopoll; + else + rw->kiocb.ki_complete = io_complete_rw; rw->addr = READ_ONCE(sqe->addr); rw->len = READ_ONCE(sqe->len); rw->flags = READ_ONCE(sqe->rw_flags); - return 0; + ret = io_prep_rw_setup(req, ddir, do_import); + + if (unlikely(ret)) + return ret; + + attr_type_mask = READ_ONCE(sqe->attr_type_mask); + if (attr_type_mask) { + u64 attr_ptr; + + /* only PI attribute is supported currently */ + if (attr_type_mask != IORING_RW_ATTR_FLAG_PI) + return -EINVAL; + + attr_ptr = READ_ONCE(sqe->attr_ptr); + ret = io_prep_rw_pi(req, rw, ddir, attr_ptr, attr_type_mask); + } + return ret; +} + +int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return io_prep_rw(req, sqe, ITER_DEST, true); +} + +int io_prep_write(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return io_prep_rw(req, sqe, ITER_SOURCE, true); } -int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe, + int ddir) { + const bool do_import = !(req->flags & REQ_F_BUFFER_SELECT); int ret; - ret = io_prep_rw(req, sqe); + ret = io_prep_rw(req, sqe, ddir, do_import); if (unlikely(ret)) return ret; + if (do_import) + return 0; /* * Have to do this validation here, as this is in io_read() rw->len * might have chanaged due to buffer selection */ - if (req->flags & REQ_F_BUFFER_SELECT) - return io_iov_buffer_select_prep(req); + return io_iov_buffer_select_prep(req); +} - return 0; +int io_prep_readv(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return io_prep_rwv(req, sqe, ITER_DEST); +} + +int io_prep_writev(struct io_kiocb *req, const 
struct io_uring_sqe *sqe) +{ + return io_prep_rwv(req, sqe, ITER_SOURCE); } -int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe, + int ddir) { + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); struct io_ring_ctx *ctx = req->ctx; - u16 index; + struct io_rsrc_node *node; + struct io_async_rw *io; int ret; - ret = io_prep_rw(req, sqe); + ret = io_prep_rw(req, sqe, ddir, false); if (unlikely(ret)) return ret; - if (unlikely(req->buf_index >= ctx->nr_user_bufs)) + node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index); + if (!node) return -EFAULT; - index = array_index_nospec(req->buf_index, ctx->nr_user_bufs); - req->imu = ctx->user_bufs[index]; - io_req_set_rsrc_node(req, ctx, 0); - return 0; + io_req_assign_buf_node(req, node); + + io = req->async_data; + ret = io_import_fixed(ddir, &io->iter, node->buf, rw->addr, rw->len); + iov_iter_save_state(&io->iter, &io->iter_state); + return ret; +} + +int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return io_prep_rw_fixed(req, sqe, ITER_DEST); +} + +int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return io_prep_rw_fixed(req, sqe, ITER_SOURCE); } /* @@ -151,7 +405,7 @@ int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; - ret = io_prep_rw(req, sqe); + ret = io_prep_rw(req, sqe, ITER_DEST, false); if (unlikely(ret)) return ret; @@ -164,9 +418,8 @@ int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) void io_readv_writev_cleanup(struct io_kiocb *req) { - struct io_async_rw *io = req->async_data; - - kfree(io->free_iovec); + lockdep_assert_held(&req->ctx->uring_lock); + io_rw_recycle(req, 0); } static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) @@ -186,26 +439,12 @@ static inline loff_t *io_kiocb_update_pos(struct io_kiocb 
*req) return NULL; } -static void io_req_task_queue_reissue(struct io_kiocb *req) -{ - req->io_task_work.func = io_queue_iowq; - io_req_task_work_add(req); -} - -#ifdef CONFIG_BLOCK -static bool io_resubmit_prep(struct io_kiocb *req) -{ - struct io_async_rw *io = req->async_data; - - if (!req_has_async_data(req)) - return !io_req_prep_async(req); - iov_iter_restore(&io->s.iter, &io->s.iter_state); - return true; -} - static bool io_rw_should_reissue(struct io_kiocb *req) { +#ifdef CONFIG_BLOCK + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); umode_t mode = file_inode(req->file)->i_mode; + struct io_async_rw *io = req->async_data; struct io_ring_ctx *ctx = req->ctx; if (!S_ISBLK(mode) && !S_ISREG(mode)) @@ -220,24 +459,14 @@ static bool io_rw_should_reissue(struct io_kiocb *req) */ if (percpu_ref_is_dying(&ctx->refs)) return false; - /* - * Play it safe and assume not safe to re-import and reissue if we're - * not in the original thread group (or in task context). - */ - if (!same_thread_group(req->task, current) || !in_task()) - return false; + + io_meta_restore(io, &rw->kiocb); + iov_iter_restore(&io->iter, &io->iter_state); return true; -} #else -static bool io_resubmit_prep(struct io_kiocb *req) -{ return false; -} -static bool io_rw_should_reissue(struct io_kiocb *req) -{ - return false; -} #endif +} static void io_req_end_write(struct io_kiocb *req) { @@ -264,23 +493,16 @@ static void io_req_io_end(struct io_kiocb *req) } } -static bool __io_complete_rw_common(struct io_kiocb *req, long res) +static void __io_complete_rw_common(struct io_kiocb *req, long res) { - if (unlikely(res != req->cqe.res)) { - if ((res == -EAGAIN || res == -EOPNOTSUPP) && - io_rw_should_reissue(req)) { - /* - * Reissue will start accounting again, finish the - * current cycle. 
- */ - io_req_io_end(req); - req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO; - return true; - } + if (res == req->cqe.res) + return; + if (res == -EAGAIN && io_rw_should_reissue(req)) { + req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE; + } else { req_set_fail(req); req->cqe.res = res; } - return false; } static inline int io_fixup_rw_res(struct io_kiocb *req, long res) @@ -310,11 +532,10 @@ void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts) io_req_io_end(req); - if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) { - unsigned issue_flags = ts->locked ? 0 : IO_URING_F_UNLOCKED; + if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) + req->cqe.flags |= io_put_kbuf(req, req->cqe.res, 0); - req->cqe.flags |= io_put_kbuf(req, issue_flags); - } + io_req_rw_cleanup(req, 0); io_req_task_complete(req, ts); } @@ -324,8 +545,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res) struct io_kiocb *req = cmd_to_io_kiocb(rw); if (!kiocb->dio_complete || !(kiocb->ki_flags & IOCB_DIO_CALLER_COMP)) { - if (__io_complete_rw_common(req, res)) - return; + __io_complete_rw_common(req, res); io_req_set_res(req, io_fixup_rw_res(req, res), 0); } req->io_task_work.func = io_req_rw_complete; @@ -340,19 +560,20 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) if (kiocb->ki_flags & IOCB_WRITE) io_req_end_write(req); if (unlikely(res != req->cqe.res)) { - if (res == -EAGAIN && io_rw_should_reissue(req)) { - req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO; - return; - } - req->cqe.res = res; + if (res == -EAGAIN && io_rw_should_reissue(req)) + req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE; + else + req->cqe.res = res; } /* order with io_iopoll_complete() checking ->iopoll_completed */ smp_store_release(&req->iopoll_completed, 1); } -static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) +static inline void io_rw_done(struct io_kiocb *req, ssize_t ret) { + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + /* IO was 
queued async, completion will happen later */ if (ret == -EIOCBQUEUED) return; @@ -374,8 +595,10 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) } } - INDIRECT_CALL_2(kiocb->ki_complete, io_complete_rw_iopoll, - io_complete_rw, kiocb, ret); + if (req->ctx->flags & IORING_SETUP_IOPOLL) + io_complete_rw_iopoll(&rw->kiocb, ret); + else + io_complete_rw(&rw->kiocb, ret); } static int kiocb_done(struct io_kiocb *req, ssize_t ret, @@ -386,88 +609,23 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, if (ret >= 0 && req->flags & REQ_F_CUR_POS) req->file->f_pos = rw->kiocb.ki_pos; - if (ret >= 0 && (rw->kiocb.ki_complete == io_complete_rw)) { - if (!__io_complete_rw_common(req, ret)) { - /* - * Safe to call io_end from here as we're inline - * from the submission path. - */ - io_req_io_end(req); - io_req_set_res(req, final_ret, - io_put_kbuf(req, issue_flags)); - return IOU_OK; - } + if (ret >= 0 && !(req->ctx->flags & IORING_SETUP_IOPOLL)) { + __io_complete_rw_common(req, ret); + /* + * Safe to call io_end from here as we're inline + * from the submission path. 
+ */ + io_req_io_end(req); + io_req_set_res(req, final_ret, io_put_kbuf(req, ret, issue_flags)); + io_req_rw_cleanup(req, issue_flags); + return IOU_OK; } else { - io_rw_done(&rw->kiocb, ret); + io_rw_done(req, ret); } - if (req->flags & REQ_F_REISSUE) { - req->flags &= ~REQ_F_REISSUE; - if (io_resubmit_prep(req)) - io_req_task_queue_reissue(req); - else - io_req_task_queue_fail(req, final_ret); - } return IOU_ISSUE_SKIP_COMPLETE; } -static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req, - struct io_rw_state *s, - unsigned int issue_flags) -{ - struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); - struct iov_iter *iter = &s->iter; - u8 opcode = req->opcode; - struct iovec *iovec; - void __user *buf; - size_t sqe_len; - ssize_t ret; - - if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { - ret = io_import_fixed(ddir, iter, req->imu, rw->addr, rw->len); - if (ret) - return ERR_PTR(ret); - return NULL; - } - - buf = u64_to_user_ptr(rw->addr); - sqe_len = rw->len; - - if (!io_issue_defs[opcode].vectored || req->flags & REQ_F_BUFFER_SELECT) { - if (io_do_buffer_select(req)) { - buf = io_buffer_select(req, &sqe_len, issue_flags); - if (!buf) - return ERR_PTR(-ENOBUFS); - rw->addr = (unsigned long) buf; - rw->len = sqe_len; - } - - ret = import_ubuf(ddir, buf, sqe_len, iter); - if (ret) - return ERR_PTR(ret); - return NULL; - } - - iovec = s->fast_iov; - ret = __import_iovec(ddir, buf, sqe_len, UIO_FASTIOV, &iovec, iter, - req->ctx->compat); - if (unlikely(ret < 0)) - return ERR_PTR(ret); - return iovec; -} - -static inline int io_import_iovec(int rw, struct io_kiocb *req, - struct iovec **iovec, struct io_rw_state *s, - unsigned int issue_flags) -{ - *iovec = __io_import_iovec(rw, req, s, issue_flags); - if (IS_ERR(*iovec)) - return PTR_ERR(*iovec); - - iov_iter_save_state(&s->iter, &s->iter_state); - return 0; -} - static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) { return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? 
NULL : &kiocb->ki_pos; @@ -539,89 +697,6 @@ static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter) return ret; } -static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, - const struct iovec *fast_iov, struct iov_iter *iter) -{ - struct io_async_rw *io = req->async_data; - - memcpy(&io->s.iter, iter, sizeof(*iter)); - io->free_iovec = iovec; - io->bytes_done = 0; - /* can only be fixed buffers, no need to do anything */ - if (iov_iter_is_bvec(iter) || iter_is_ubuf(iter)) - return; - if (!iovec) { - unsigned iov_off = 0; - - io->s.iter.__iov = io->s.fast_iov; - if (iter->__iov != fast_iov) { - iov_off = iter_iov(iter) - fast_iov; - io->s.iter.__iov += iov_off; - } - if (io->s.fast_iov != fast_iov) - memcpy(io->s.fast_iov + iov_off, fast_iov + iov_off, - sizeof(struct iovec) * iter->nr_segs); - } else { - req->flags |= REQ_F_NEED_CLEANUP; - } -} - -static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, - struct io_rw_state *s, bool force) -{ - if (!force && !io_cold_defs[req->opcode].prep_async) - return 0; - /* opcode type doesn't need async data */ - if (!io_cold_defs[req->opcode].async_size) - return 0; - if (!req_has_async_data(req)) { - struct io_async_rw *iorw; - - if (io_alloc_async_data(req)) { - kfree(iovec); - return -ENOMEM; - } - - io_req_map_rw(req, iovec, s->fast_iov, &s->iter); - iorw = req->async_data; - /* we've copied and mapped the iter, ensure state is saved */ - iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state); - } - return 0; -} - -static inline int io_rw_prep_async(struct io_kiocb *req, int rw) -{ - struct io_async_rw *iorw = req->async_data; - struct iovec *iov; - int ret; - - iorw->bytes_done = 0; - iorw->free_iovec = NULL; - - /* submission path, ->uring_lock should already be taken */ - ret = io_import_iovec(rw, req, &iov, &iorw->s, 0); - if (unlikely(ret < 0)) - return ret; - - if (iov) { - iorw->free_iovec = iov; - req->flags |= REQ_F_NEED_CLEANUP; - } - - return 0; 
-} - -int io_readv_prep_async(struct io_kiocb *req) -{ - return io_rw_prep_async(req, ITER_DEST); -} - -int io_writev_prep_async(struct io_kiocb *req) -{ - return io_rw_prep_async(req, ITER_SOURCE); -} - /* * This is our waitqueue callback handler, registered through __folio_lock_async() * when we initially tried to do the IO with the iocb armed our waitqueue. @@ -670,8 +745,11 @@ static bool io_rw_should_retry(struct io_kiocb *req) struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); struct kiocb *kiocb = &rw->kiocb; - /* never retry for NOWAIT, we just complete with -EAGAIN */ - if (req->flags & REQ_F_NOWAIT) + /* + * Never retry for NOWAIT or a request with metadata, we just complete + * with -EAGAIN. + */ + if (req->flags & (REQ_F_NOWAIT | REQ_F_HAS_METADATA)) return false; /* Only for buffered IO */ @@ -682,7 +760,8 @@ static bool io_rw_should_retry(struct io_kiocb *req) * just use poll if we can, and don't attempt if the fs doesn't * support callback based unlocks */ - if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC)) + if (io_file_can_poll(req) || + !(req->file->f_op->fop_flags & FOP_BUFFER_RASYNC)) return false; wait->wait.func = io_async_buf_func; @@ -700,7 +779,7 @@ static inline int io_iter_do_read(struct io_rw *rw, struct iov_iter *iter) struct file *file = rw->kiocb.ki_filp; if (likely(file->f_op->read_iter)) - return call_read_iter(file, &rw->kiocb, iter); + return file->f_op->read_iter(&rw->kiocb, iter); else if (file->f_op->read) return loop_rw_iter(READ, rw, iter); else @@ -713,7 +792,7 @@ static bool need_complete_io(struct io_kiocb *req) S_ISBLK(file_inode(req->file)->i_mode); } -static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) +static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); struct kiocb *kiocb = &rw->kiocb; @@ -721,14 +800,14 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) struct file *file = req->file; 
int ret; - if (unlikely(!file || !(file->f_mode & mode))) + if (unlikely(!(file->f_mode & mode))) return -EBADF; if (!(req->flags & REQ_F_FIXED_FILE)) req->flags |= io_file_get_flags(file); kiocb->ki_flags = file->f_iocb_flags; - ret = kiocb_set_rw_flags(kiocb, rw->flags); + ret = kiocb_set_rw_flags(kiocb, rw->flags, rw_type); if (unlikely(ret)) return ret; kiocb->ki_flags |= IOCB_ALLOC_CACHE; @@ -738,22 +817,37 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) * supports async. Otherwise it's impossible to use O_NONBLOCK files * reliably. If not, or it IOCB_NOWAIT is set, don't retry. */ - if ((kiocb->ki_flags & IOCB_NOWAIT) || - ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req))) + if (kiocb->ki_flags & IOCB_NOWAIT || + ((file->f_flags & O_NONBLOCK && !(req->flags & REQ_F_SUPPORT_NOWAIT)))) req->flags |= REQ_F_NOWAIT; if (ctx->flags & IORING_SETUP_IOPOLL) { if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll) return -EOPNOTSUPP; - kiocb->private = NULL; kiocb->ki_flags |= IOCB_HIPRI; - kiocb->ki_complete = io_complete_rw_iopoll; req->iopoll_completed = 0; + if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) { + /* make sure every req only blocks once*/ + req->flags &= ~REQ_F_IOPOLL_STATE; + req->iopoll_start = ktime_get_ns(); + } } else { if (kiocb->ki_flags & IOCB_HIPRI) return -EINVAL; - kiocb->ki_complete = io_complete_rw; + } + + if (req->flags & REQ_F_HAS_METADATA) { + struct io_async_rw *io = req->async_data; + + /* + * We have a union of meta fields with wpq used for buffered-io + * in io_async_rw, so fail it here. 
+ */ + if (!(req->file->f_flags & O_DIRECT)) + return -EOPNOTSUPP; + kiocb->ki_flags |= IOCB_HAS_METADATA; + kiocb->private = &io->meta; } return 0; @@ -761,54 +855,27 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) static int __io_read(struct io_kiocb *req, unsigned int issue_flags) { + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); - struct io_rw_state __s, *s = &__s; - struct iovec *iovec; + struct io_async_rw *io = req->async_data; struct kiocb *kiocb = &rw->kiocb; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - struct io_async_rw *io; - ssize_t ret, ret2; + ssize_t ret; loff_t *ppos; - if (!req_has_async_data(req)) { - ret = io_import_iovec(ITER_DEST, req, &iovec, s, issue_flags); + if (io_do_buffer_select(req)) { + ret = io_import_iovec(ITER_DEST, req, io, issue_flags); if (unlikely(ret < 0)) return ret; - } else { - io = req->async_data; - s = &io->s; - - /* - * Safe and required to re-import if we're using provided - * buffers, as we dropped the selected one before retry. - */ - if (io_do_buffer_select(req)) { - ret = io_import_iovec(ITER_DEST, req, &iovec, s, issue_flags); - if (unlikely(ret < 0)) - return ret; - } - - /* - * We come here from an earlier attempt, restore our state to - * match in case it doesn't. It's cheap enough that we don't - * need to make this conditional. 
- */ - iov_iter_restore(&s->iter, &s->iter_state); - iovec = NULL; } - ret = io_rw_init_file(req, FMODE_READ); - if (unlikely(ret)) { - kfree(iovec); + ret = io_rw_init_file(req, FMODE_READ, READ); + if (unlikely(ret)) return ret; - } - req->cqe.res = iov_iter_count(&s->iter); + req->cqe.res = iov_iter_count(&io->iter); if (force_nonblock) { /* If the file doesn't support async, just async punt */ - if (unlikely(!io_file_supports_nowait(req))) { - ret = io_setup_async_rw(req, iovec, s, true); - return ret ?: -EAGAIN; - } + if (unlikely(!io_file_supports_nowait(req, EPOLLIN))) + return -EAGAIN; kiocb->ki_flags |= IOCB_NOWAIT; } else { /* Ensure we clear previously set non-block flag */ @@ -818,20 +885,22 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) ppos = io_kiocb_update_pos(req); ret = rw_verify_area(READ, req->file, ppos, req->cqe.res); - if (unlikely(ret)) { - kfree(iovec); + if (unlikely(ret)) return ret; - } - ret = io_iter_do_read(rw, &s->iter); + ret = io_iter_do_read(rw, &io->iter); - if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { - req->flags &= ~REQ_F_REISSUE; - /* - * If we can poll, just do that. For a vectored read, we'll - * need to copy state first. - */ - if (file_can_poll(req->file) && !io_issue_defs[req->opcode].vectored) + /* + * Some file systems like to return -EOPNOTSUPP for an IOCB_NOWAIT + * issue, even though they should be returning -EAGAIN. To be safe, + * retry from blocking context for either. + */ + if (ret == -EOPNOTSUPP && force_nonblock) + ret = -EAGAIN; + + if (ret == -EAGAIN) { + /* If we can poll, just do that. 
*/ + if (io_file_can_poll(req)) return -EAGAIN; /* IOPOLL retry should happen for io-wq threads */ if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) @@ -841,11 +910,10 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) goto done; ret = 0; } else if (ret == -EIOCBQUEUED) { - if (iovec) - kfree(iovec); return IOU_ISSUE_SKIP_COMPLETE; } else if (ret == req->cqe.res || ret <= 0 || !force_nonblock || - (req->flags & REQ_F_NOWAIT) || !need_complete_io(req)) { + (req->flags & REQ_F_NOWAIT) || !need_complete_io(req) || + (issue_flags & IO_URING_F_MULTISHOT)) { /* read all, failed, already did sync or don't want to retry */ goto done; } @@ -855,21 +923,8 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) * untouched in case of error. Restore it and we'll advance it * manually if we need to. */ - iov_iter_restore(&s->iter, &s->iter_state); - - ret2 = io_setup_async_rw(req, iovec, s, true); - iovec = NULL; - if (ret2) { - ret = ret > 0 ? ret : ret2; - goto done; - } - - io = req->async_data; - s = &io->s; - /* - * Now use our persistent iterator and state, if we aren't already. - * We've restored and mapped the iter to match. - */ + iov_iter_restore(&io->iter, &io->iter_state); + io_meta_restore(io, kiocb); do { /* @@ -877,11 +932,11 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) * above or inside this loop. Advance the iter by the bytes * that were consumed. 
*/ - iov_iter_advance(&s->iter, ret); - if (!iov_iter_count(&s->iter)) + iov_iter_advance(&io->iter, ret); + if (!iov_iter_count(&io->iter)) break; io->bytes_done += ret; - iov_iter_save_state(&s->iter, &s->iter_state); + iov_iter_save_state(&io->iter, &io->iter_state); /* if we can retry, do so with the callbacks armed */ if (!io_rw_should_retry(req)) { @@ -889,24 +944,22 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; } - req->cqe.res = iov_iter_count(&s->iter); + req->cqe.res = iov_iter_count(&io->iter); /* * Now retry read with the IOCB_WAITQ parts set in the iocb. If * we get -EIOCBQUEUED, then we'll get a notification when the * desired page gets unlocked. We can also get a partial read * here, and if we do, then just retry at the new offset. */ - ret = io_iter_do_read(rw, &s->iter); + ret = io_iter_do_read(rw, &io->iter); if (ret == -EIOCBQUEUED) return IOU_ISSUE_SKIP_COMPLETE; /* we got some bytes, but not all. retry. */ kiocb->ki_flags &= ~IOCB_WAITQ; - iov_iter_restore(&s->iter, &s->iter_state); + iov_iter_restore(&io->iter, &io->iter_state); } while (ret > 0); done: /* it's faster to check here then delegate to kfree */ - if (iovec) - kfree(iovec); return ret; } @@ -930,9 +983,11 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) /* * Multishot MUST be used on a pollable file */ - if (!file_can_poll(req->file)) + if (!io_file_can_poll(req)) return -EBADFD; + /* make it sync, multishot doesn't support async execution */ + rw->kiocb.ki_complete = NULL; ret = __io_read(req, issue_flags); /* @@ -946,23 +1001,26 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) */ if (io_kbuf_recycle(req, issue_flags)) rw->len = 0; + if (issue_flags & IO_URING_F_MULTISHOT) + return IOU_ISSUE_SKIP_COMPLETE; return -EAGAIN; - } - - /* - * Any successful return value will keep the multishot read armed. 
- */ - if (ret > 0) { + } else if (ret <= 0) { + io_kbuf_recycle(req, issue_flags); + if (ret < 0) + req_set_fail(req); + } else if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { + cflags = io_put_kbuf(req, ret, issue_flags); + } else { /* - * Put our buffer and post a CQE. If we fail to post a CQE, then + * Any successful return value will keep the multishot read + * armed, if it's still set. Put our buffer and post a CQE. If + * we fail to post a CQE, or multishot is no longer set, then * jump to the termination path. This request is then done. */ - cflags = io_put_kbuf(req, issue_flags); + cflags = io_put_kbuf(req, ret, issue_flags); rw->len = 0; /* similarly to above, reset len to 0 */ - if (io_fill_cqe_req_aux(req, - issue_flags & IO_URING_F_COMPLETE_DEFER, - ret, cflags | IORING_CQE_F_MORE)) { + if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) { if (issue_flags & IO_URING_F_MULTISHOT) { /* * Force retry, as we might have more data to @@ -981,49 +1039,55 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) * multishot request, hitting overflow will terminate it. 
*/ io_req_set_res(req, ret, cflags); + io_req_rw_cleanup(req, issue_flags); if (issue_flags & IO_URING_F_MULTISHOT) return IOU_STOP_MULTISHOT; return IOU_OK; } +static bool io_kiocb_start_write(struct io_kiocb *req, struct kiocb *kiocb) +{ + struct inode *inode; + bool ret; + + if (!(req->flags & REQ_F_ISREG)) + return true; + if (!(kiocb->ki_flags & IOCB_NOWAIT)) { + kiocb_start_write(kiocb); + return true; + } + + inode = file_inode(kiocb->ki_filp); + ret = sb_start_write_trylock(inode->i_sb); + if (ret) + __sb_writers_release(inode->i_sb, SB_FREEZE_WRITE); + return ret; +} + int io_write(struct io_kiocb *req, unsigned int issue_flags) { + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); - struct io_rw_state __s, *s = &__s; - struct iovec *iovec; + struct io_async_rw *io = req->async_data; struct kiocb *kiocb = &rw->kiocb; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; ssize_t ret, ret2; loff_t *ppos; - if (!req_has_async_data(req)) { - ret = io_import_iovec(ITER_SOURCE, req, &iovec, s, issue_flags); - if (unlikely(ret < 0)) - return ret; - } else { - struct io_async_rw *io = req->async_data; - - s = &io->s; - iov_iter_restore(&s->iter, &s->iter_state); - iovec = NULL; - } - ret = io_rw_init_file(req, FMODE_WRITE); - if (unlikely(ret)) { - kfree(iovec); + ret = io_rw_init_file(req, FMODE_WRITE, WRITE); + if (unlikely(ret)) return ret; - } - req->cqe.res = iov_iter_count(&s->iter); + req->cqe.res = iov_iter_count(&io->iter); if (force_nonblock) { /* If the file doesn't support async, just async punt */ - if (unlikely(!io_file_supports_nowait(req))) - goto copy_iov; + if (unlikely(!io_file_supports_nowait(req, EPOLLOUT))) + goto ret_eagain; - /* File path supports NOWAIT for non-direct_IO only for block devices. */ + /* Check if we can support NOWAIT. 
*/ if (!(kiocb->ki_flags & IOCB_DIRECT) && - !(kiocb->ki_filp->f_mode & FMODE_BUF_WASYNC) && - (req->flags & REQ_F_ISREG)) - goto copy_iov; + !(req->file->f_op->fop_flags & FOP_BUFFER_WASYNC) && + (req->flags & REQ_F_ISREG)) + goto ret_eagain; kiocb->ki_flags |= IOCB_NOWAIT; } else { @@ -1034,27 +1098,20 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) ppos = io_kiocb_update_pos(req); ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res); - if (unlikely(ret)) { - kfree(iovec); + if (unlikely(ret)) return ret; - } - if (req->flags & REQ_F_ISREG) - kiocb_start_write(kiocb); + if (unlikely(!io_kiocb_start_write(req, kiocb))) + return -EAGAIN; kiocb->ki_flags |= IOCB_WRITE; if (likely(req->file->f_op->write_iter)) - ret2 = call_write_iter(req->file, kiocb, &s->iter); + ret2 = req->file->f_op->write_iter(kiocb, &io->iter); else if (req->file->f_op->write) - ret2 = loop_rw_iter(WRITE, rw, &s->iter); + ret2 = loop_rw_iter(WRITE, rw, &io->iter); else ret2 = -EINVAL; - if (req->flags & REQ_F_REISSUE) { - req->flags &= ~REQ_F_REISSUE; - ret2 = -EAGAIN; - } - /* * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just * retry them without IOCB_NOWAIT. @@ -1067,11 +1124,9 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) if (!force_nonblock || ret2 != -EAGAIN) { /* IOPOLL retry should happen for io-wq threads */ if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL)) - goto copy_iov; + goto ret_eagain; if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)) { - struct io_async_rw *io; - trace_io_uring_short_write(req->ctx, kiocb->ki_pos - ret2, req->cqe.res, ret2); @@ -1080,34 +1135,23 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) * in the worker. Also update bytes_done to account for * the bytes already written. 
*/ - iov_iter_save_state(&s->iter, &s->iter_state); - ret = io_setup_async_rw(req, iovec, s, true); - - io = req->async_data; - if (io) - io->bytes_done += ret2; + iov_iter_save_state(&io->iter, &io->iter_state); + io->bytes_done += ret2; if (kiocb->ki_flags & IOCB_WRITE) io_req_end_write(req); - return ret ? ret : -EAGAIN; + return -EAGAIN; } done: - ret = kiocb_done(req, ret2, issue_flags); + return kiocb_done(req, ret2, issue_flags); } else { -copy_iov: - iov_iter_restore(&s->iter, &s->iter_state); - ret = io_setup_async_rw(req, iovec, s, false); - if (!ret) { - if (kiocb->ki_flags & IOCB_WRITE) - io_req_end_write(req); - return -EAGAIN; - } - return ret; +ret_eagain: + iov_iter_restore(&io->iter, &io->iter_state); + io_meta_restore(io, kiocb); + if (kiocb->ki_flags & IOCB_WRITE) + io_req_end_write(req); + return -EAGAIN; } - /* it's reportedly faster than delegating the null check to kfree() */ - if (iovec) - kfree(iovec); - return ret; } void io_rw_fail(struct io_kiocb *req) @@ -1118,6 +1162,78 @@ void io_rw_fail(struct io_kiocb *req) io_req_set_res(req, res, req->cqe.flags); } +static int io_uring_classic_poll(struct io_kiocb *req, struct io_comp_batch *iob, + unsigned int poll_flags) +{ + struct file *file = req->file; + + if (req->opcode == IORING_OP_URING_CMD) { + struct io_uring_cmd *ioucmd; + + ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); + return file->f_op->uring_cmd_iopoll(ioucmd, iob, poll_flags); + } else { + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + + return file->f_op->iopoll(&rw->kiocb, iob, poll_flags); + } +} + +static u64 io_hybrid_iopoll_delay(struct io_ring_ctx *ctx, struct io_kiocb *req) +{ + struct hrtimer_sleeper timer; + enum hrtimer_mode mode; + ktime_t kt; + u64 sleep_time; + + if (req->flags & REQ_F_IOPOLL_STATE) + return 0; + + if (ctx->hybrid_poll_time == LLONG_MAX) + return 0; + + /* Using half the running time to do schedule */ + sleep_time = ctx->hybrid_poll_time / 2; + + kt = ktime_set(0, sleep_time); + 
req->flags |= REQ_F_IOPOLL_STATE; + + mode = HRTIMER_MODE_REL; + hrtimer_setup_sleeper_on_stack(&timer, CLOCK_MONOTONIC, mode); + hrtimer_set_expires(&timer.timer, kt); + set_current_state(TASK_INTERRUPTIBLE); + hrtimer_sleeper_start_expires(&timer, mode); + + if (timer.task) + io_schedule(); + + hrtimer_cancel(&timer.timer); + __set_current_state(TASK_RUNNING); + destroy_hrtimer_on_stack(&timer.timer); + return sleep_time; +} + +static int io_uring_hybrid_poll(struct io_kiocb *req, + struct io_comp_batch *iob, unsigned int poll_flags) +{ + struct io_ring_ctx *ctx = req->ctx; + u64 runtime, sleep_time; + int ret; + + sleep_time = io_hybrid_iopoll_delay(ctx, req); + ret = io_uring_classic_poll(req, iob, poll_flags); + runtime = ktime_get_ns() - req->iopoll_start - sleep_time; + + /* + * Use minimum sleep time if we're polling devices with different + * latencies. We could get more completions from the faster ones. + */ + if (ctx->hybrid_poll_time > runtime) + ctx->hybrid_poll_time = runtime; + + return ret; +} + int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) { struct io_wq_work_node *pos, *start, *prev; @@ -1134,7 +1250,6 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) wq_list_for_each(pos, start, &ctx->iopoll_list) { struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); - struct file *file = req->file; int ret; /* @@ -1145,29 +1260,23 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) if (READ_ONCE(req->iopoll_completed)) break; - if (req->opcode == IORING_OP_URING_CMD) { - struct io_uring_cmd *ioucmd; - - ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); - ret = file->f_op->uring_cmd_iopoll(ioucmd, &iob, - poll_flags); - } else { - struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) + ret = io_uring_hybrid_poll(req, &iob, poll_flags); + else + ret = io_uring_classic_poll(req, &iob, poll_flags); - ret = file->f_op->iopoll(&rw->kiocb, &iob, poll_flags); 
- } if (unlikely(ret < 0)) return ret; else if (ret) poll_flags |= BLK_POLL_ONESHOT; /* iopoll may have completed current req */ - if (!rq_list_empty(iob.req_list) || + if (!rq_list_empty(&iob.req_list) || READ_ONCE(req->iopoll_completed)) break; } - if (!rq_list_empty(iob.req_list)) + if (!rq_list_empty(&iob.req_list)) iob.complete(&iob); else if (!pos) return 0; @@ -1180,7 +1289,9 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) if (!smp_load_acquire(&req->iopoll_completed)) break; nr_events++; - req->cqe.flags = io_put_kbuf(req, 0); + req->cqe.flags = io_put_kbuf(req, req->cqe.res, 0); + if (req->opcode != IORING_OP_URING_CMD) + io_req_rw_cleanup(req, 0); } if (unlikely(!nr_events)) return 0; @@ -1194,3 +1305,12 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) __io_submit_flush_completions(ctx); return nr_events; } + +void io_rw_cache_free(const void *entry) +{ + struct io_async_rw *rw = (struct io_async_rw *) entry; + + if (rw->free_iovec) + kfree(rw->free_iovec); + kfree(rw); +} diff --git a/io_uring/rw.h b/io_uring/rw.h index f9e89b4fe4da..eaa59bd64870 100644 --- a/io_uring/rw.h +++ b/io_uring/rw.h @@ -2,28 +2,44 @@ #include <linux/pagemap.h> -struct io_rw_state { - struct iov_iter iter; - struct iov_iter_state iter_state; - struct iovec fast_iov[UIO_FASTIOV]; +struct io_meta_state { + u32 seed; + struct iov_iter_state iter_meta; }; struct io_async_rw { - struct io_rw_state s; - const struct iovec *free_iovec; size_t bytes_done; - struct wait_page_queue wpq; + struct iovec *free_iovec; + struct_group(clear, + struct iov_iter iter; + struct iov_iter_state iter_state; + struct iovec fast_iov; + int free_iov_nr; + /* + * wpq is for buffered io, while meta fields are used with + * direct io + */ + union { + struct wait_page_queue wpq; + struct { + struct uio_meta meta; + struct io_meta_state meta_state; + }; + }; + ); }; -int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe); -int io_prep_rwv(struct io_kiocb *req, 
const struct io_uring_sqe *sqe); -int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_prep_readv(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_prep_writev(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_prep_write(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_read(struct io_kiocb *req, unsigned int issue_flags); -int io_readv_prep_async(struct io_kiocb *req); int io_write(struct io_kiocb *req, unsigned int issue_flags); -int io_writev_prep_async(struct io_kiocb *req); void io_readv_writev_cleanup(struct io_kiocb *req); void io_rw_fail(struct io_kiocb *req); void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts); int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags); +void io_rw_cache_free(const void *entry); diff --git a/io_uring/splice.c b/io_uring/splice.c index 3b659cd23e9d..5b84f1630611 100644 --- a/io_uring/splice.c +++ b/io_uring/splice.c @@ -21,6 +21,7 @@ struct io_splice { u64 len; int splice_fd_in; unsigned int flags; + struct io_rsrc_node *rsrc_node; }; static int __io_splice_prep(struct io_kiocb *req, @@ -34,6 +35,7 @@ static int __io_splice_prep(struct io_kiocb *req, if (unlikely(sp->flags & ~valid_flags)) return -EINVAL; sp->splice_fd_in = READ_ONCE(sqe->splice_fd_in); + sp->rsrc_node = NULL; req->flags |= REQ_F_FORCE_ASYNC; return 0; } @@ -45,6 +47,36 @@ int io_tee_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return __io_splice_prep(req, sqe); } +void io_splice_cleanup(struct io_kiocb *req) +{ + struct io_splice *sp = io_kiocb_to_cmd(req, struct io_splice); + + io_put_rsrc_node(req->ctx, sp->rsrc_node); +} + +static struct file 
*io_splice_get_file(struct io_kiocb *req, + unsigned int issue_flags) +{ + struct io_splice *sp = io_kiocb_to_cmd(req, struct io_splice); + struct io_ring_ctx *ctx = req->ctx; + struct io_rsrc_node *node; + struct file *file = NULL; + + if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) + return io_file_get_normal(req, sp->splice_fd_in); + + io_ring_submit_lock(ctx, issue_flags); + node = io_rsrc_node_lookup(&ctx->file_table.data, sp->splice_fd_in); + if (node) { + node->refs++; + sp->rsrc_node = node; + file = io_slot_file(node); + req->flags |= REQ_F_NEED_CLEANUP; + } + io_ring_submit_unlock(ctx, issue_flags); + return file; +} + int io_tee(struct io_kiocb *req, unsigned int issue_flags) { struct io_splice *sp = io_kiocb_to_cmd(req, struct io_splice); @@ -55,10 +87,7 @@ int io_tee(struct io_kiocb *req, unsigned int issue_flags) WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); - if (sp->flags & SPLICE_F_FD_IN_FIXED) - in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags); - else - in = io_file_get_normal(req, sp->splice_fd_in); + in = io_splice_get_file(req, issue_flags); if (!in) { ret = -EBADF; goto done; @@ -96,10 +125,7 @@ int io_splice(struct io_kiocb *req, unsigned int issue_flags) WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); - if (sp->flags & SPLICE_F_FD_IN_FIXED) - in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags); - else - in = io_file_get_normal(req, sp->splice_fd_in); + in = io_splice_get_file(req, issue_flags); if (!in) { ret = -EBADF; goto done; diff --git a/io_uring/splice.h b/io_uring/splice.h index 542f94168ad3..b9b2848327fb 100644 --- a/io_uring/splice.h +++ b/io_uring/splice.h @@ -3,5 +3,6 @@ int io_tee_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_tee(struct io_kiocb *req, unsigned int issue_flags); +void io_splice_cleanup(struct io_kiocb *req); int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_splice(struct io_kiocb *req, unsigned int issue_flags); diff --git a/io_uring/sqpoll.c 
b/io_uring/sqpoll.c index 65b5dbe3c850..d037cc68e9d3 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -10,14 +10,17 @@ #include <linux/slab.h> #include <linux/audit.h> #include <linux/security.h> +#include <linux/cpuset.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> #include "io_uring.h" +#include "napi.h" #include "sqpoll.h" #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 +#define IORING_TW_CAP_ENTRIES_VALUE 8 enum { IO_SQ_THREAD_SHOULD_STOP = 0, @@ -37,12 +40,13 @@ void io_sq_thread_unpark(struct io_sq_data *sqd) if (atomic_dec_return(&sqd->park_pending)) set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); mutex_unlock(&sqd->lock); + wake_up(&sqd->wait); } void io_sq_thread_park(struct io_sq_data *sqd) __acquires(&sqd->lock) { - WARN_ON_ONCE(sqd->thread == current); + WARN_ON_ONCE(data_race(sqd->thread) == current); atomic_inc(&sqd->park_pending); set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); @@ -103,29 +107,21 @@ static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p) { struct io_ring_ctx *ctx_attach; struct io_sq_data *sqd; - struct fd f; + CLASS(fd, f)(p->wq_fd); - f = fdget(p->wq_fd); - if (!f.file) + if (fd_empty(f)) return ERR_PTR(-ENXIO); - if (!io_is_uring_fops(f.file)) { - fdput(f); + if (!io_is_uring_fops(fd_file(f))) return ERR_PTR(-EINVAL); - } - ctx_attach = f.file->private_data; + ctx_attach = fd_file(f)->private_data; sqd = ctx_attach->sq_data; - if (!sqd) { - fdput(f); + if (!sqd) return ERR_PTR(-EINVAL); - } - if (sqd->task_tgid != current->tgid) { - fdput(f); + if (sqd->task_tgid != current->tgid) return ERR_PTR(-EPERM); - } refcount_inc(&sqd->refs); - fdput(f); return sqd; } @@ -174,7 +170,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE) to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE; - if (!wq_list_empty(&ctx->iopoll_list) || to_submit) { + if (to_submit || !wq_list_empty(&ctx->iopoll_list)) { const struct cred *creds = NULL; 
if (ctx->sq_creds != current_cred()) @@ -212,21 +208,73 @@ static bool io_sqd_handle_event(struct io_sq_data *sqd) mutex_unlock(&sqd->lock); if (signal_pending(current)) did_sig = get_signal(&ksig); - cond_resched(); + wait_event(sqd->wait, !atomic_read(&sqd->park_pending)); mutex_lock(&sqd->lock); sqd->sq_cpu = raw_smp_processor_id(); } return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); } +/* + * Run task_work, processing the retry_list first. The retry_list holds + * entries that we passed on in the previous run, if we had more task_work + * than we were asked to process. Newly queued task_work isn't run until the + * retry list has been fully processed. + */ +static unsigned int io_sq_tw(struct llist_node **retry_list, int max_entries) +{ + struct io_uring_task *tctx = current->io_uring; + unsigned int count = 0; + + if (*retry_list) { + *retry_list = io_handle_tw_list(*retry_list, &count, max_entries); + if (count >= max_entries) + goto out; + max_entries -= count; + } + *retry_list = tctx_task_work_run(tctx, max_entries, &count); +out: + if (task_work_pending(current)) + task_work_run(); + return count; +} + +static bool io_sq_tw_pending(struct llist_node *retry_list) +{ + struct io_uring_task *tctx = current->io_uring; + + return retry_list || !llist_empty(&tctx->task_list); +} + +static void io_sq_update_worktime(struct io_sq_data *sqd, struct rusage *start) +{ + struct rusage end; + + getrusage(current, RUSAGE_SELF, &end); + end.ru_stime.tv_sec -= start->ru_stime.tv_sec; + end.ru_stime.tv_usec -= start->ru_stime.tv_usec; + + sqd->work_time += end.ru_stime.tv_usec + end.ru_stime.tv_sec * 1000000; +} + static int io_sq_thread(void *data) { + struct llist_node *retry_list = NULL; struct io_sq_data *sqd = data; struct io_ring_ctx *ctx; + struct rusage start; unsigned long timeout = 0; - char buf[TASK_COMM_LEN]; + char buf[TASK_COMM_LEN] = {}; DEFINE_WAIT(wait); + /* offload context creation failed, just exit */ + if (!current->io_uring) { + 
mutex_lock(&sqd->lock); + sqd->thread = NULL; + mutex_unlock(&sqd->lock); + goto err_out; + } + snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid); set_task_comm(current, buf); @@ -240,6 +288,14 @@ static int io_sq_thread(void *data) sqd->sq_cpu = raw_smp_processor_id(); } + /* + * Force audit context to get setup, in case we do prep side async + * operations that would trigger an audit call before any issue side + * audit has been done. + */ + audit_uring_entry(IORING_OP_NOP); + audit_uring_exit(true, 0); + mutex_lock(&sqd->lock); while (1) { bool cap_entries, sqt_spin = false; @@ -251,18 +307,25 @@ static int io_sq_thread(void *data) } cap_entries = !list_is_singular(&sqd->ctx_list); + getrusage(current, RUSAGE_SELF, &start); list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { int ret = __io_sq_thread(ctx, cap_entries); if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list))) sqt_spin = true; } - if (io_run_task_work()) + if (io_sq_tw(&retry_list, IORING_TW_CAP_ENTRIES_VALUE)) sqt_spin = true; + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) + if (io_napi(ctx)) + io_napi_sqpoll_busy_poll(ctx); + if (sqt_spin || !time_after(jiffies, timeout)) { - if (sqt_spin) + if (sqt_spin) { + io_sq_update_worktime(sqd, &start); timeout = jiffies + sqd->sq_thread_idle; + } if (unlikely(need_resched())) { mutex_unlock(&sqd->lock); cond_resched(); @@ -273,7 +336,7 @@ static int io_sq_thread(void *data) } prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE); - if (!io_sqd_events_pending(sqd) && !task_work_pending(current)) { + if (!io_sqd_events_pending(sqd) && !io_sq_tw_pending(retry_list)) { bool needs_sched = true; list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { @@ -312,13 +375,16 @@ static int io_sq_thread(void *data) timeout = jiffies + sqd->sq_thread_idle; } + if (retry_list) + io_sq_tw(&retry_list, UINT_MAX); + io_uring_cancel_generic(true, sqd); sqd->thread = NULL; list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) atomic_or(IORING_SQ_NEED_WAKEUP, 
&ctx->rings->sq_flags); io_run_task_work(); mutex_unlock(&sqd->lock); - +err_out: complete(&sqd->exited); do_exit(0); } @@ -343,21 +409,17 @@ void io_sqpoll_wait_sq(struct io_ring_ctx *ctx) __cold int io_sq_offload_create(struct io_ring_ctx *ctx, struct io_uring_params *p) { + struct task_struct *task_to_put = NULL; int ret; /* Retain compatibility with failing for an invalid attach attempt */ if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) == IORING_SETUP_ATTACH_WQ) { - struct fd f; - - f = fdget(p->wq_fd); - if (!f.file) + CLASS(fd, f)(p->wq_fd); + if (fd_empty(f)) return -ENXIO; - if (!io_is_uring_fops(f.file)) { - fdput(f); + if (!io_is_uring_fops(fd_file(f))) return -EINVAL; - } - fdput(f); } if (ctx->flags & IORING_SETUP_SQPOLL) { struct task_struct *tsk; @@ -393,11 +455,22 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx, return 0; if (p->flags & IORING_SETUP_SQ_AFF) { + cpumask_var_t allowed_mask; int cpu = p->sq_thread_cpu; ret = -EINVAL; if (cpu >= nr_cpu_ids || !cpu_online(cpu)) goto err_sqpoll; + ret = -ENOMEM; + if (!alloc_cpumask_var(&allowed_mask, GFP_KERNEL)) + goto err_sqpoll; + ret = -EINVAL; + cpuset_cpus_allowed(current, allowed_mask); + if (!cpumask_test_cpu(cpu, allowed_mask)) { + free_cpumask_var(allowed_mask); + goto err_sqpoll; + } + free_cpumask_var(allowed_mask); sqd->sq_cpu = cpu; } else { sqd->sq_cpu = -1; @@ -412,6 +485,7 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx, } sqd->thread = tsk; + task_to_put = get_task_struct(tsk); ret = io_uring_alloc_task_context(tsk, ctx); wake_up_new_task(tsk); if (ret) @@ -422,11 +496,15 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx, goto err; } + if (task_to_put) + put_task_struct(task_to_put); return 0; err_sqpoll: complete(&ctx->sq_data->exited); err: io_sq_thread_finish(ctx); + if (task_to_put) + put_task_struct(task_to_put); return ret; } diff --git a/io_uring/sqpoll.h b/io_uring/sqpoll.h index 8df37e8c9149..4171666b1cf4 100644 --- 
a/io_uring/sqpoll.h +++ b/io_uring/sqpoll.h @@ -16,6 +16,7 @@ struct io_sq_data { pid_t task_pid; pid_t task_tgid; + u64 work_time; unsigned long state; struct completion exited; }; diff --git a/io_uring/statx.c b/io_uring/statx.c index abb874209caa..6bc4651700a2 100644 --- a/io_uring/statx.c +++ b/io_uring/statx.c @@ -36,9 +36,7 @@ int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sx->buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2)); sx->flags = READ_ONCE(sqe->statx_flags); - sx->filename = getname_flags(path, - getname_statx_lookup_flags(sx->flags), - NULL); + sx->filename = getname_uflags(path, sx->flags); if (IS_ERR(sx->filename)) { int ret = PTR_ERR(sx->filename); diff --git a/io_uring/tctx.c b/io_uring/tctx.c index c043fe93a3f2..adc6e42c14df 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -47,8 +47,19 @@ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, void __io_uring_free(struct task_struct *tsk) { struct io_uring_task *tctx = tsk->io_uring; + struct io_tctx_node *node; + unsigned long index; - WARN_ON_ONCE(!xa_empty(&tctx->xa)); + /* + * Fault injection forcing allocation errors in the xa_store() path + * can lead to xa_empty() returning false, even though no actual + * node is stored in the xarray. Until that gets sorted out, attempt + * an iteration here and warn if any entries are found. 
+ */ + xa_for_each(&tctx->xa, index, node) { + WARN_ON_ONCE(1); + break; + } WARN_ON_ONCE(tctx->io_wq); WARN_ON_ONCE(tctx->cached_refs); @@ -81,6 +92,7 @@ __cold int io_uring_alloc_task_context(struct task_struct *task, return ret; } + tctx->task = task; xa_init(&tctx->xa); init_waitqueue_head(&tctx->wait); atomic_set(&tctx->in_cancel, 0); diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 7fd7dbb211d6..48fc8cf70784 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -72,16 +72,12 @@ static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts) struct io_ring_ctx *ctx = req->ctx; if (!io_timeout_finish(timeout, data)) { - bool filled; - filled = io_fill_cqe_req_aux(req, ts->locked, -ETIME, - IORING_CQE_F_MORE); - if (filled) { + if (io_req_post_cqe(req, -ETIME, IORING_CQE_F_MORE)) { /* re-arm timer */ - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); list_add(&timeout->list, ctx->timeout_list.prev); - data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); return; } } @@ -89,7 +85,27 @@ static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts) io_req_task_complete(req, ts); } -static bool io_kill_timeout(struct io_kiocb *req, int status) +static __cold bool io_flush_killed_timeouts(struct list_head *list, int err) +{ + if (list_empty(list)) + return false; + + while (!list_empty(list)) { + struct io_timeout *timeout; + struct io_kiocb *req; + + timeout = list_first_entry(list, struct io_timeout, list); + list_del_init(&timeout->list); + req = cmd_to_io_kiocb(timeout); + if (err) + req_set_fail(req); + io_req_queue_tw_complete(req, err); + } + + return true; +} + +static void io_kill_timeout(struct io_kiocb *req, struct list_head *list) __must_hold(&req->ctx->timeout_lock) { struct io_timeout_data *io = req->async_data; @@ -97,23 +113,19 @@ static bool 
io_kill_timeout(struct io_kiocb *req, int status) if (hrtimer_try_to_cancel(&io->timer) != -1) { struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); - if (status) - req_set_fail(req); atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); - list_del_init(&timeout->list); - io_req_queue_tw_complete(req, status); - return true; + list_move_tail(&timeout->list, list); } - return false; } __cold void io_flush_timeouts(struct io_ring_ctx *ctx) { - u32 seq; struct io_timeout *timeout, *tmp; + LIST_HEAD(list); + u32 seq; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { @@ -135,10 +147,11 @@ __cold void io_flush_timeouts(struct io_ring_ctx *ctx) if (events_got < events_needed) break; - io_kill_timeout(req, 0); + io_kill_timeout(req, &list); } ctx->cq_last_tm_flush = seq; - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); + io_flush_killed_timeouts(&list, 0); } static void io_req_tw_fail_links(struct io_kiocb *link, struct io_tw_state *ts) @@ -204,9 +217,9 @@ void io_disarm_next(struct io_kiocb *req) } else if (req->flags & REQ_F_LINK_TIMEOUT) { struct io_ring_ctx *ctx = req->ctx; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); link = io_disarm_linked_timeout(req); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); if (link) io_req_queue_tw_complete(link, -ECANCELED); } @@ -242,11 +255,11 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) struct io_ring_ctx *ctx = req->ctx; unsigned long flags; - spin_lock_irqsave(&ctx->timeout_lock, flags); + raw_spin_lock_irqsave(&ctx->timeout_lock, flags); list_del_init(&timeout->list); atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); - spin_unlock_irqrestore(&ctx->timeout_lock, flags); + 
raw_spin_unlock_irqrestore(&ctx->timeout_lock, flags); if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS)) req_set_fail(req); @@ -289,9 +302,9 @@ int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) { struct io_kiocb *req; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); req = io_timeout_extract(ctx, cd); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); if (IS_ERR(req)) return PTR_ERR(req); @@ -301,19 +314,20 @@ int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) static void io_req_task_link_timeout(struct io_kiocb *req, struct io_tw_state *ts) { - unsigned issue_flags = ts->locked ? 0 : IO_URING_F_UNLOCKED; struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_kiocb *prev = timeout->prev; - int ret = -ENOENT; + int ret; if (prev) { - if (!(req->task->flags & PF_EXITING)) { + if (!io_should_terminate_tw()) { struct io_cancel_data cd = { .ctx = req->ctx, .data = prev->cqe.user_data, }; - ret = io_try_cancel(req->task->io_uring, &cd, issue_flags); + ret = io_try_cancel(req->tctx, &cd, 0); + } else { + ret = -ECANCELED; } io_req_set_res(req, ret ?: -ETIME, 0); io_req_task_complete(req, ts); @@ -333,7 +347,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) struct io_ring_ctx *ctx = req->ctx; unsigned long flags; - spin_lock_irqsave(&ctx->timeout_lock, flags); + raw_spin_lock_irqsave(&ctx->timeout_lock, flags); prev = timeout->head; timeout->head = NULL; @@ -348,7 +362,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) } list_del(&timeout->list); timeout->prev = prev; - spin_unlock_irqrestore(&ctx->timeout_lock, flags); + raw_spin_unlock_irqrestore(&ctx->timeout_lock, flags); req->io_task_work.func = io_req_task_link_timeout; io_req_task_work_add(req); @@ -413,10 +427,12 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, timeout->off = 0; /* noseq */ data = 
req->async_data; + data->ts = *ts; + list_add_tail(&timeout->list, &ctx->timeout_list); hrtimer_init(&data->timer, io_timeout_get_clock(data), mode); data->timer.function = io_timeout_fn; - hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode); + hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), mode); return 0; } @@ -475,12 +491,12 @@ int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) } else { enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags); - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); if (tr->ltimeout) ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode); else ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); } if (ret < 0) @@ -528,10 +544,9 @@ static int __io_timeout_prep(struct io_kiocb *req, if (WARN_ON_ONCE(req_has_async_data(req))) return -EFAULT; - if (io_alloc_async_data(req)) + data = io_uring_alloc_async_data(NULL, req); + if (!data) return -ENOMEM; - - data = req->async_data; data->req = req; data->flags = flags; @@ -541,7 +556,6 @@ static int __io_timeout_prep(struct io_kiocb *req, if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0) return -EINVAL; - INIT_LIST_HEAD(&timeout->list); data->mode = io_translate_timeout_mode(flags); hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode); @@ -576,7 +590,7 @@ int io_timeout(struct io_kiocb *req, unsigned int issue_flags) struct list_head *entry; u32 tail, off = timeout->off; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); /* * sqe->off holds how many events that need to occur for this @@ -615,7 +629,7 @@ add: list_add(&timeout->list, entry); data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); return IOU_ISSUE_SKIP_COMPLETE; } @@ -624,7 +638,7 @@ void 
io_queue_linked_timeout(struct io_kiocb *req) struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_ring_ctx *ctx = req->ctx; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); /* * If the back reference is NULL, then our linked request finished * before we got a chance to setup the timer @@ -637,18 +651,18 @@ void io_queue_linked_timeout(struct io_kiocb *req) data->mode); list_add_tail(&timeout->list, &ctx->ltimeout_list); } - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); /* drop submission reference */ io_put_req(req); } -static bool io_match_task(struct io_kiocb *head, struct task_struct *task, +static bool io_match_task(struct io_kiocb *head, struct io_uring_task *tctx, bool cancel_all) - __must_hold(&req->ctx->timeout_lock) + __must_hold(&head->ctx->timeout_lock) { struct io_kiocb *req; - if (task && head->task != task) + if (tctx && head->tctx != tctx) return false; if (cancel_all) return true; @@ -661,26 +675,26 @@ static bool io_match_task(struct io_kiocb *head, struct task_struct *task, } /* Returns true if we found and killed one or more timeouts */ -__cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, +__cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all) { struct io_timeout *timeout, *tmp; - int canceled = 0; + LIST_HEAD(list); /* * completion_lock is needed for io_match_task(). Take it before * timeout_lockfirst to keep locking ordering. 
*/ spin_lock(&ctx->completion_lock); - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { struct io_kiocb *req = cmd_to_io_kiocb(timeout); - if (io_match_task(req, tsk, cancel_all) && - io_kill_timeout(req, -ECANCELED)) - canceled++; + if (io_match_task(req, tctx, cancel_all)) + io_kill_timeout(req, &list); } - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); spin_unlock(&ctx->completion_lock); - return canceled != 0; + + return io_flush_killed_timeouts(&list, -ECANCELED); } diff --git a/io_uring/timeout.h b/io_uring/timeout.h index a6939f18313e..e91b32448dcf 100644 --- a/io_uring/timeout.h +++ b/io_uring/timeout.h @@ -24,7 +24,7 @@ static inline struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req) __cold void io_flush_timeouts(struct io_ring_ctx *ctx); struct io_cancel_data; int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd); -__cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, +__cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all); void io_queue_linked_timeout(struct io_kiocb *req); void io_disarm_next(struct io_kiocb *req); diff --git a/io_uring/truncate.c b/io_uring/truncate.c new file mode 100644 index 000000000000..62ee73d34d72 --- /dev/null +++ b/io_uring/truncate.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/syscalls.h> +#include <linux/io_uring.h> + +#include <uapi/linux/io_uring.h> + +#include "../fs/internal.h" + +#include "io_uring.h" +#include "truncate.h" + +struct io_ftrunc { + struct file *file; + loff_t len; +}; + +int io_ftruncate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_ftrunc *ft = io_kiocb_to_cmd(req, struct 
io_ftrunc); + + if (sqe->rw_flags || sqe->addr || sqe->len || sqe->buf_index || + sqe->splice_fd_in || sqe->addr3) + return -EINVAL; + + ft->len = READ_ONCE(sqe->off); + + req->flags |= REQ_F_FORCE_ASYNC; + return 0; +} + +int io_ftruncate(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_ftrunc *ft = io_kiocb_to_cmd(req, struct io_ftrunc); + int ret; + + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); + + ret = do_ftruncate(req->file, ft->len, 1); + + io_req_set_res(req, ret, 0); + return IOU_OK; +} diff --git a/io_uring/truncate.h b/io_uring/truncate.h new file mode 100644 index 000000000000..ec088293a478 --- /dev/null +++ b/io_uring/truncate.h @@ -0,0 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 + +int io_ftruncate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_ftruncate(struct io_kiocb *req, unsigned int issue_flags); diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index c33fca585dde..e6701b7aa147 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -3,16 +3,66 @@ #include <linux/errno.h> #include <linux/file.h> #include <linux/io_uring/cmd.h> +#include <linux/io_uring/net.h> #include <linux/security.h> #include <linux/nospec.h> +#include <net/sock.h> #include <uapi/linux/io_uring.h> #include <asm/ioctls.h> #include "io_uring.h" +#include "alloc_cache.h" #include "rsrc.h" #include "uring_cmd.h" +static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); + struct io_uring_cmd_data *cache = req->async_data; + + if (cache->op_data) { + kfree(cache->op_data); + cache->op_data = NULL; + } + + if (issue_flags & IO_URING_F_UNLOCKED) + return; + if (io_alloc_cache_put(&req->ctx->uring_cache, cache)) { + ioucmd->sqe = NULL; + req->async_data = NULL; + req->flags &= ~REQ_F_ASYNC_DATA; + } +} + +bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, + struct io_uring_task *tctx, bool cancel_all) +{ + struct 
hlist_node *tmp; + struct io_kiocb *req; + bool ret = false; + + lockdep_assert_held(&ctx->uring_lock); + + hlist_for_each_entry_safe(req, tmp, &ctx->cancelable_uring_cmd, + hash_node) { + struct io_uring_cmd *cmd = io_kiocb_to_cmd(req, + struct io_uring_cmd); + struct file *file = req->file; + + if (!cancel_all && req->tctx != tctx) + continue; + + if (cmd->flags & IORING_URING_CMD_CANCELABLE) { + file->f_op->uring_cmd(cmd, IO_URING_F_CANCEL | + IO_URING_F_COMPLETE_DEFER); + ret = true; + } + } + io_submit_flush_completions(ctx); + return ret; +} + static void io_uring_cmd_del_cancelable(struct io_uring_cmd *cmd, unsigned int issue_flags) { @@ -55,9 +105,13 @@ EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable); static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); - unsigned issue_flags = ts->locked ? 0 : IO_URING_F_UNLOCKED; + unsigned int flags = IO_URING_F_COMPLETE_DEFER; - ioucmd->task_work_cb(ioucmd, issue_flags); + if (io_should_terminate_tw()) + flags |= IO_URING_F_TASK_DEAD; + + /* task_work executor checks the deffered list completion */ + ioucmd->task_work_cb(ioucmd, flags); } void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, @@ -83,7 +137,7 @@ static inline void io_req_set_cqe32_extra(struct io_kiocb *req, * Called by consumers of io_uring_cmd, if they originally returned * -EIOCBQUEUED upon receiving the command. 
*/ -void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2, +void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, u64 res2, unsigned issue_flags) { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); @@ -96,24 +150,41 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2, io_req_set_res(req, ret, 0); if (req->ctx->flags & IORING_SETUP_CQE32) io_req_set_cqe32_extra(req, res2, 0); + io_req_uring_cleanup(req, issue_flags); if (req->ctx->flags & IORING_SETUP_IOPOLL) { /* order with io_iopoll_req_issued() checking ->iopoll_complete */ smp_store_release(&req->iopoll_completed, 1); + } else if (issue_flags & IO_URING_F_COMPLETE_DEFER) { + if (WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED)) + return; + io_req_complete_defer(req); } else { - struct io_tw_state ts = { - .locked = !(issue_flags & IO_URING_F_UNLOCKED), - }; - io_req_task_complete(req, &ts); + req->io_task_work.func = io_req_task_complete; + io_req_task_work_add(req); } } EXPORT_SYMBOL_GPL(io_uring_cmd_done); -int io_uring_cmd_prep_async(struct io_kiocb *req) +static int io_uring_cmd_prep_setup(struct io_kiocb *req, + const struct io_uring_sqe *sqe) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); - - memcpy(req->async_data, ioucmd->sqe, uring_sqe_size(req->ctx)); - ioucmd->sqe = req->async_data; + struct io_uring_cmd_data *cache; + + cache = io_uring_alloc_async_data(&req->ctx->uring_cache, req); + if (!cache) + return -ENOMEM; + cache->op_data = NULL; + + /* + * Unconditionally cache the SQE for now - this is only needed for + * requests that go async, but prep handlers must ensure that any + * sqe data is stable beyond prep. Since uring_cmd is special in + * that it doesn't read in per-op data, play it safe and ensure that + * any SQE data is stable beyond prep. This can later get relaxed. 
+ */ + memcpy(cache->sqes, sqe, uring_sqe_size(req->ctx)); + ioucmd->sqe = cache->sqes; return 0; } @@ -130,18 +201,22 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (ioucmd->flags & IORING_URING_CMD_FIXED) { struct io_ring_ctx *ctx = req->ctx; - u16 index; + struct io_rsrc_node *node; + u16 index = READ_ONCE(sqe->buf_index); - req->buf_index = READ_ONCE(sqe->buf_index); - if (unlikely(req->buf_index >= ctx->nr_user_bufs)) + node = io_rsrc_node_lookup(&ctx->buf_table, index); + if (unlikely(!node)) return -EFAULT; - index = array_index_nospec(req->buf_index, ctx->nr_user_bufs); - req->imu = ctx->user_bufs[index]; - io_req_set_rsrc_node(req, ctx, 0); + /* + * Pi node upfront, prior to io_uring_cmd_import_fixed() + * being called. This prevents destruction of the mapped buffer + * we'll need at actual import time. + */ + io_req_assign_buf_node(req, node); } - ioucmd->sqe = sqe; ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); - return 0; + + return io_uring_cmd_prep_setup(req, sqe); } int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) @@ -172,34 +247,36 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) } ret = file->f_op->uring_cmd(ioucmd, issue_flags); - if (ret == -EAGAIN) { - if (!req_has_async_data(req)) { - if (io_alloc_async_data(req)) - return -ENOMEM; - io_uring_cmd_prep_async(req); - } - return -EAGAIN; - } - - if (ret != -EIOCBQUEUED) { - if (ret < 0) - req_set_fail(req); - io_req_set_res(req, ret, 0); + if (ret == -EAGAIN || ret == -EIOCBQUEUED) return ret; - } - - return IOU_ISSUE_SKIP_COMPLETE; + if (ret < 0) + req_set_fail(req); + io_req_uring_cleanup(req, issue_flags); + io_req_set_res(req, ret, 0); + return IOU_OK; } int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, struct iov_iter *iter, void *ioucmd) { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); + struct io_rsrc_node *node = req->buf_node; - return io_import_fixed(rw, iter, req->imu, ubuf, len); + /* Must have had 
rsrc_node assigned at prep time */ + if (node) + return io_import_fixed(rw, iter, node->buf, ubuf, len); + + return -EFAULT; } EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed); +void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd) +{ + struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); + + io_req_queue_iowq(req); +} + static inline int io_uring_cmd_getsockopt(struct socket *sock, struct io_uring_cmd *cmd, unsigned int issue_flags) @@ -256,7 +333,7 @@ int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags) if (!prot || !prot->ioctl) return -EOPNOTSUPP; - switch (cmd->sqe->cmd_op) { + switch (cmd->cmd_op) { case SOCKET_URING_OP_SIOCINQ: ret = prot->ioctl(sk, SIOCINQ, &arg); if (ret) diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h index 8117684ec3ca..f6837ee0955b 100644 --- a/io_uring/uring_cmd.h +++ b/io_uring/uring_cmd.h @@ -2,4 +2,6 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags); int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); -int io_uring_cmd_prep_async(struct io_kiocb *req); + +bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, + struct io_uring_task *tctx, bool cancel_all); diff --git a/io_uring/waitid.c b/io_uring/waitid.c index 6f851978606d..15a7daf3ff4f 100644 --- a/io_uring/waitid.c +++ b/io_uring/waitid.c @@ -118,26 +118,18 @@ static int io_waitid_finish(struct io_kiocb *req, int ret) static void io_waitid_complete(struct io_kiocb *req, int ret) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); - struct io_tw_state ts = { .locked = true }; /* anyone completing better be holding a reference */ WARN_ON_ONCE(!(atomic_read(&iw->refs) & IO_WAITID_REF_MASK)); lockdep_assert_held(&req->ctx->uring_lock); - /* - * Did cancel find it meanwhile? 
- */ - if (hlist_unhashed(&req->hash_node)) - return; - hlist_del_init(&req->hash_node); ret = io_waitid_finish(req, ret); if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - io_req_task_complete(req, &ts); } static bool __io_waitid_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) @@ -159,6 +151,7 @@ static bool __io_waitid_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) list_del_init(&iwa->wo.child_wait.entry); spin_unlock_irq(&iw->head->lock); io_waitid_complete(req, -ECANCELED); + io_req_queue_tw_complete(req, -ECANCELED); return true; } @@ -190,7 +183,7 @@ int io_waitid_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, return -ENOENT; } -bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct task_struct *task, +bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all) { struct hlist_node *tmp; @@ -200,8 +193,9 @@ bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct task_struct *task, lockdep_assert_held(&ctx->uring_lock); hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) { - if (!io_match_task_safe(req, task, cancel_all)) + if (!io_match_task_safe(req, tctx, cancel_all)) continue; + hlist_del_init(&req->hash_node); __io_waitid_cancel(ctx, req); found = true; } @@ -263,6 +257,7 @@ static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts) } io_waitid_complete(req, ret); + io_req_task_complete(req, ts); } static int io_waitid_wait(struct wait_queue_entry *wait, unsigned mode, @@ -290,10 +285,16 @@ static int io_waitid_wait(struct wait_queue_entry *wait, unsigned mode, int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); + struct io_waitid_async *iwa; if (sqe->addr || sqe->buf_index || sqe->addr3 || sqe->waitid_flags) return -EINVAL; + iwa = io_uring_alloc_async_data(NULL, req); + if (!unlikely(iwa)) + return -ENOMEM; + iwa->req = req; + iw->which = READ_ONCE(sqe->len); 
iw->upid = READ_ONCE(sqe->fd); iw->options = READ_ONCE(sqe->file_index); @@ -304,16 +305,10 @@ int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) int io_waitid(struct io_kiocb *req, unsigned int issue_flags) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); + struct io_waitid_async *iwa = req->async_data; struct io_ring_ctx *ctx = req->ctx; - struct io_waitid_async *iwa; int ret; - if (io_alloc_async_data(req)) - return -ENOMEM; - - iwa = req->async_data; - iwa->req = req; - ret = kernel_waitid_prepare(&iwa->wo, iw->which, iw->upid, &iw->info, iw->options, NULL); if (ret) @@ -336,7 +331,7 @@ int io_waitid(struct io_kiocb *req, unsigned int issue_flags) hlist_add_head(&req->hash_node, &ctx->waitid_list); init_waitqueue_func_entry(&iwa->wo.child_wait, io_waitid_wait); - iwa->wo.child_wait.private = req->task; + iwa->wo.child_wait.private = req->tctx->task; iw->head = ¤t->signal->wait_chldexit; add_wait_queue(iw->head, &iwa->wo.child_wait); diff --git a/io_uring/waitid.h b/io_uring/waitid.h index 956a8adafe8c..d5544aaf302a 100644 --- a/io_uring/waitid.h +++ b/io_uring/waitid.h @@ -11,5 +11,5 @@ int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_waitid(struct io_kiocb *req, unsigned int issue_flags); int io_waitid_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, unsigned int issue_flags); -bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct task_struct *task, +bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all); diff --git a/io_uring/xattr.c b/io_uring/xattr.c index e1c810e0b85a..de5064fcae8a 100644 --- a/io_uring/xattr.c +++ b/io_uring/xattr.c @@ -18,7 +18,7 @@ struct io_xattr { struct file *file; - struct xattr_ctx ctx; + struct kernel_xattr_ctx ctx; struct filename *filename; }; @@ -48,13 +48,10 @@ static int __io_getxattr_prep(struct io_kiocb *req, const char __user *name; int ret; - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return 
-EBADF; - ix->filename = NULL; ix->ctx.kvalue = NULL; name = u64_to_user_ptr(READ_ONCE(sqe->addr)); - ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + ix->ctx.value = u64_to_user_ptr(READ_ONCE(sqe->addr2)); ix->ctx.size = READ_ONCE(sqe->len); ix->ctx.flags = READ_ONCE(sqe->xattr_flags); @@ -65,11 +62,8 @@ static int __io_getxattr_prep(struct io_kiocb *req, if (!ix->ctx.kname) return -ENOMEM; - ret = strncpy_from_user(ix->ctx.kname->name, name, - sizeof(ix->ctx.kname->name)); - if (!ret || ret == sizeof(ix->ctx.kname->name)) - ret = -ERANGE; - if (ret < 0) { + ret = import_xattr_name(ix->ctx.kname, name); + if (ret) { kfree(ix->ctx.kname); return ret; } @@ -90,19 +84,20 @@ int io_getxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) const char __user *path; int ret; + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + ret = __io_getxattr_prep(req, sqe); if (ret) return ret; path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); - ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL); - if (IS_ERR(ix->filename)) { - ret = PTR_ERR(ix->filename); - ix->filename = NULL; - } + ix->filename = getname(path); + if (IS_ERR(ix->filename)) + return PTR_ERR(ix->filename); - return ret; + return 0; } int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags) @@ -112,10 +107,7 @@ int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags) WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); - ret = do_getxattr(mnt_idmap(req->file->f_path.mnt), - req->file->f_path.dentry, - &ix->ctx); - + ret = file_getxattr(req->file, &ix->ctx); io_xattr_finish(req, ret); return IOU_OK; } @@ -123,24 +115,12 @@ int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags) int io_getxattr(struct io_kiocb *req, unsigned int issue_flags) { struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr); - unsigned int lookup_flags = LOOKUP_FOLLOW; - struct path path; int ret; WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); -retry: - ret = 
filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL); - if (!ret) { - ret = do_getxattr(mnt_idmap(path.mnt), path.dentry, &ix->ctx); - - path_put(&path); - if (retry_estale(ret, lookup_flags)) { - lookup_flags |= LOOKUP_REVAL; - goto retry; - } - } - + ret = filename_getxattr(AT_FDCWD, ix->filename, LOOKUP_FOLLOW, &ix->ctx); + ix->filename = NULL; io_xattr_finish(req, ret); return IOU_OK; } @@ -152,9 +132,6 @@ static int __io_setxattr_prep(struct io_kiocb *req, const char __user *name; int ret; - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return -EBADF; - ix->filename = NULL; name = u64_to_user_ptr(READ_ONCE(sqe->addr)); ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2)); @@ -183,19 +160,20 @@ int io_setxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) const char __user *path; int ret; + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + ret = __io_setxattr_prep(req, sqe); if (ret) return ret; path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); - ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL); - if (IS_ERR(ix->filename)) { - ret = PTR_ERR(ix->filename); - ix->filename = NULL; - } + ix->filename = getname(path); + if (IS_ERR(ix->filename)) + return PTR_ERR(ix->filename); - return ret; + return 0; } int io_fsetxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -203,28 +181,14 @@ int io_fsetxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return __io_setxattr_prep(req, sqe); } -static int __io_setxattr(struct io_kiocb *req, unsigned int issue_flags, - const struct path *path) -{ - struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr); - int ret; - - ret = mnt_want_write(path->mnt); - if (!ret) { - ret = do_setxattr(mnt_idmap(path->mnt), path->dentry, &ix->ctx); - mnt_drop_write(path->mnt); - } - - return ret; -} - int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags) { + struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr); int ret; WARN_ON_ONCE(issue_flags 
& IO_URING_F_NONBLOCK); - ret = __io_setxattr(req, issue_flags, &req->file->f_path); + ret = file_setxattr(req->file, &ix->ctx); io_xattr_finish(req, ret); return IOU_OK; } @@ -232,23 +196,12 @@ int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags) int io_setxattr(struct io_kiocb *req, unsigned int issue_flags) { struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr); - unsigned int lookup_flags = LOOKUP_FOLLOW; - struct path path; int ret; WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); -retry: - ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL); - if (!ret) { - ret = __io_setxattr(req, issue_flags, &path); - path_put(&path); - if (retry_estale(ret, lookup_flags)) { - lookup_flags |= LOOKUP_REVAL; - goto retry; - } - } - + ret = filename_setxattr(AT_FDCWD, ix->filename, LOOKUP_FOLLOW, &ix->ctx); + ix->filename = NULL; io_xattr_finish(req, ret); return IOU_OK; } |