diff options
Diffstat (limited to 'io_uring/kbuf.c')
-rw-r--r-- | io_uring/kbuf.c | 861 |
1 files changed, 381 insertions, 480 deletions
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 18df5a9d2f5e..2ea65f3cef72 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -7,6 +7,7 @@ #include <linux/slab.h> #include <linux/namei.h> #include <linux/poll.h> +#include <linux/vmalloc.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> @@ -14,15 +15,13 @@ #include "io_uring.h" #include "opdef.h" #include "kbuf.h" - -#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf)) - -#define BGID_ARRAY 64 +#include "memmap.h" /* BIDs are addressed by a 16-bit field in a CQE */ #define MAX_BIDS_PER_BGID (1 << 16) -struct kmem_cache *io_buf_cachep; +/* Mapped buffer ring, return io_uring_buf from head */ +#define io_ring_head_to_buf(br, head, mask) &(br)->bufs[(head) & (mask)] struct io_provide_buf { struct file *file; @@ -33,21 +32,39 @@ struct io_provide_buf { __u16 bid; }; -struct io_buf_free { - struct hlist_node list; - void *mem; - size_t size; - int inuse; -}; +static bool io_kbuf_inc_commit(struct io_buffer_list *bl, int len) +{ + while (len) { + struct io_uring_buf *buf; + u32 this_len; + + buf = io_ring_head_to_buf(bl->buf_ring, bl->head, bl->mask); + this_len = min_t(int, len, buf->len); + buf->len -= this_len; + if (buf->len) { + buf->addr += this_len; + return false; + } + bl->head++; + len -= this_len; + } + return true; +} -static struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx, - struct io_buffer_list *bl, - unsigned int bgid) +bool io_kbuf_commit(struct io_kiocb *req, + struct io_buffer_list *bl, int len, int nr) { - if (bl && bgid < BGID_ARRAY) - return &bl[bgid]; + if (unlikely(!(req->flags & REQ_F_BUFFERS_COMMIT))) + return true; - return xa_load(&ctx->io_bl_xa, bgid); + req->flags &= ~REQ_F_BUFFERS_COMMIT; + + if (unlikely(len < 0)) + return true; + if (bl->flags & IOBL_INC) + return io_kbuf_inc_commit(bl, len); + bl->head += nr; + return true; } static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, @@ -55,7 +72,7 @@ static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, { lockdep_assert_held(&ctx->uring_lock); - return __io_buffer_get_list(ctx, ctx->io_bl, bgid); + return xa_load(&ctx->io_bl_xa, bgid); } static int io_buffer_add_list(struct io_ring_ctx *ctx, @@ -64,77 +81,40 @@ static int io_buffer_add_list(struct io_ring_ctx *ctx, /* * Store buffer group ID and finally mark the list as visible. * The normal lookup doesn't care about the visibility as we're - * always under the ->uring_lock, but the RCU lookup from mmap does. + * always under the ->uring_lock, but lookups from mmap do. */ bl->bgid = bgid; - smp_store_release(&bl->is_ready, 1); - - if (bgid < BGID_ARRAY) - return 0; - + guard(mutex)(&ctx->mmap_lock); return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); } +void io_kbuf_drop_legacy(struct io_kiocb *req) +{ + if (WARN_ON_ONCE(!(req->flags & REQ_F_BUFFER_SELECTED))) + return; + req->flags &= ~REQ_F_BUFFER_SELECTED; + kfree(req->kbuf); + req->kbuf = NULL; +} + bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; struct io_buffer_list *bl; struct io_buffer *buf; - /* - * For legacy provided buffer mode, don't recycle if we already did - * IO to this buffer. For ring-mapped provided buffer mode, we should - * increment ring->head to explicitly monopolize the buffer to avoid - * multiple use. - */ - if (req->flags & REQ_F_PARTIAL_IO) - return false; - io_ring_submit_lock(ctx, issue_flags); buf = req->kbuf; bl = io_buffer_get_list(ctx, buf->bgid); list_add(&buf->list, &bl->buf_list); + bl->nbufs++; req->flags &= ~REQ_F_BUFFER_SELECTED; - req->buf_index = buf->bgid; io_ring_submit_unlock(ctx, issue_flags); return true; } -unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags) -{ - unsigned int cflags; - - /* - * We can add this buffer back to two lists: - * - * 1) The io_buffers_cache list. This one is protected by the - * ctx->uring_lock. If we already hold this lock, add back to this - * list as we can grab it from issue as well. - * 2) The io_buffers_comp list. This one is protected by the - * ctx->completion_lock. - * - * We migrate buffers from the comp_list to the issue cache list - * when we need one. - */ - if (req->flags & REQ_F_BUFFER_RING) { - /* no buffers to recycle for this case */ - cflags = __io_put_kbuf_list(req, NULL); - } else if (issue_flags & IO_URING_F_UNLOCKED) { - struct io_ring_ctx *ctx = req->ctx; - - spin_lock(&ctx->completion_lock); - cflags = __io_put_kbuf_list(req, &ctx->io_buffers_comp); - spin_unlock(&ctx->completion_lock); - } else { - lockdep_assert_held(&req->ctx->uring_lock); - - cflags = __io_put_kbuf_list(req, &req->ctx->io_buffers_cache); - } - return cflags; -} - static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, struct io_buffer_list *bl) { @@ -143,8 +123,11 @@ static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list); list_del(&kbuf->list); + bl->nbufs--; if (*len == 0 || *len > kbuf->len) *len = kbuf->len; + if (list_empty(&bl->buf_list)) + req->flags |= REQ_F_BL_EMPTY; req->flags |= REQ_F_BUFFER_SELECTED; req->kbuf = kbuf; req->buf_index = kbuf->bid; @@ -153,34 +136,46 @@ static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, return NULL; } +static int io_provided_buffers_select(struct io_kiocb *req, size_t *len, + struct io_buffer_list *bl, + struct iovec *iov) +{ + void __user *buf; + + buf = io_provided_buffer_select(req, len, bl); + if (unlikely(!buf)) + return -ENOBUFS; + + iov[0].iov_base = buf; + iov[0].iov_len = *len; + return 1; +} + static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, struct io_buffer_list *bl, unsigned int issue_flags) { struct io_uring_buf_ring *br = bl->buf_ring; + __u16 tail, head = bl->head; struct io_uring_buf *buf; - __u16 head = bl->head; + void __user *ret; - if (unlikely(smp_load_acquire(&br->tail) == head)) + tail = smp_load_acquire(&br->tail); + if (unlikely(tail == head)) return NULL; - head &= bl->mask; - /* mmaped buffers are always contig */ - if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) { - buf = &br->bufs[head]; - } else { - int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1); - int index = head / IO_BUFFER_LIST_BUF_PER_PAGE; - buf = page_address(bl->buf_pages[index]); - buf += off; - } + if (head + 1 == tail) + req->flags |= REQ_F_BL_EMPTY; + + buf = io_ring_head_to_buf(br, head, bl->mask); if (*len == 0 || *len > buf->len) *len = buf->len; - req->flags |= REQ_F_BUFFER_RING; + req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT; req->buf_list = bl; req->buf_index = buf->bid; + ret = u64_to_user_ptr(buf->addr); - if (issue_flags & IO_URING_F_UNLOCKED || !file_can_poll(req->file)) { + if (issue_flags & IO_URING_F_UNLOCKED || !io_file_can_poll(req)) { /* * If we came in unlocked, we have no choice but to consume the * buffer here, otherwise nothing ensures that the buffer won't @@ -191,14 +186,14 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, * the transfer completes (or if we get -EAGAIN and must poll of * retry). */ + io_kbuf_commit(req, bl, *len, 1); req->buf_list = NULL; - bl->head++; } - return u64_to_user_ptr(buf->addr); + return ret; } void __user *io_buffer_select(struct io_kiocb *req, size_t *len, - unsigned int issue_flags) + unsigned buf_group, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; struct io_buffer_list *bl; @@ -206,9 +201,9 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len, io_ring_submit_lock(req->ctx, issue_flags); - bl = io_buffer_get_list(ctx, req->buf_index); + bl = io_buffer_get_list(ctx, buf_group); if (likely(bl)) { - if (bl->is_mapped) + if (bl->flags & IOBL_BUF_RING) ret = io_ring_buffer_select(req, len, bl, issue_flags); else ret = io_provided_buffer_select(req, len, bl); @@ -217,124 +212,226 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len, return ret; } -static __cold int io_init_bl_list(struct io_ring_ctx *ctx) +/* cap it at a reasonable 256, will be one page even for 4K */ +#define PEEK_MAX_IMPORT 256 + +static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, + struct io_buffer_list *bl) { - struct io_buffer_list *bl; - int i; + struct io_uring_buf_ring *br = bl->buf_ring; + struct iovec *iov = arg->iovs; + int nr_iovs = arg->nr_iovs; + __u16 nr_avail, tail, head; + struct io_uring_buf *buf; - bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list), GFP_KERNEL); - if (!bl) - return -ENOMEM; + tail = smp_load_acquire(&br->tail); + head = bl->head; + nr_avail = min_t(__u16, tail - head, UIO_MAXIOV); + if (unlikely(!nr_avail)) + return -ENOBUFS; + + buf = io_ring_head_to_buf(br, head, bl->mask); + if (arg->max_len) { + u32 len = READ_ONCE(buf->len); + size_t needed; + + if (unlikely(!len)) + return -ENOBUFS; + needed = (arg->max_len + len - 1) / len; + needed = min_not_zero(needed, (size_t) PEEK_MAX_IMPORT); + if (nr_avail > needed) + nr_avail = needed; + } - for (i = 0; i < BGID_ARRAY; i++) { - INIT_LIST_HEAD(&bl[i].buf_list); - bl[i].bgid = i; + /* + * only alloc a bigger array if we know we have data to map, eg not + * a speculative peek operation. + */ + if (arg->mode & KBUF_MODE_EXPAND && nr_avail > nr_iovs && arg->max_len) { + iov = kmalloc_array(nr_avail, sizeof(struct iovec), GFP_KERNEL); + if (unlikely(!iov)) + return -ENOMEM; + if (arg->mode & KBUF_MODE_FREE) + kfree(arg->iovs); + arg->iovs = iov; + nr_iovs = nr_avail; + } else if (nr_avail < nr_iovs) { + nr_iovs = nr_avail; } - smp_store_release(&ctx->io_bl, bl); - return 0; + /* set it to max, if not set, so we can use it unconditionally */ + if (!arg->max_len) + arg->max_len = INT_MAX; + + req->buf_index = buf->bid; + do { + u32 len = buf->len; + + /* truncate end piece, if needed, for non partial buffers */ + if (len > arg->max_len) { + len = arg->max_len; + if (!(bl->flags & IOBL_INC)) + buf->len = len; + } + + iov->iov_base = u64_to_user_ptr(buf->addr); + iov->iov_len = len; + iov++; + + arg->out_len += len; + arg->max_len -= len; + if (!arg->max_len) + break; + + buf = io_ring_head_to_buf(br, ++head, bl->mask); + } while (--nr_iovs); + + if (head == tail) + req->flags |= REQ_F_BL_EMPTY; + + req->flags |= REQ_F_BUFFER_RING; + req->buf_list = bl; + return iov - arg->iovs; } -/* - * Mark the given mapped range as free for reuse - */ -static void io_kbuf_mark_free(struct io_ring_ctx *ctx, struct io_buffer_list *bl) +int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, + unsigned int issue_flags) { - struct io_buf_free *ibf; + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer_list *bl; + int ret = -ENOENT; + + io_ring_submit_lock(ctx, issue_flags); + bl = io_buffer_get_list(ctx, arg->buf_group); + if (unlikely(!bl)) + goto out_unlock; - hlist_for_each_entry(ibf, &ctx->io_buf_list, list) { - if (bl->buf_ring == ibf->mem) { - ibf->inuse = 0; - return; + if (bl->flags & IOBL_BUF_RING) { + ret = io_ring_buffers_peek(req, arg, bl); + /* + * Don't recycle these buffers if we need to go through poll. + * Nobody else can use them anyway, and holding on to provided + * buffers for a send/write operation would happen on the app + * side anyway with normal buffers. Besides, we already + * committed them, they cannot be put back in the queue. + */ + if (ret > 0) { + req->flags |= REQ_F_BUFFERS_COMMIT | REQ_F_BL_NO_RECYCLE; + io_kbuf_commit(req, bl, arg->out_len, ret); } + } else { + ret = io_provided_buffers_select(req, &arg->out_len, bl, arg->iovs); + } +out_unlock: + io_ring_submit_unlock(ctx, issue_flags); + return ret; +} + +int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer_list *bl; + int ret; + + lockdep_assert_held(&ctx->uring_lock); + + bl = io_buffer_get_list(ctx, arg->buf_group); + if (unlikely(!bl)) + return -ENOENT; + + if (bl->flags & IOBL_BUF_RING) { + ret = io_ring_buffers_peek(req, arg, bl); + if (ret > 0) + req->flags |= REQ_F_BUFFERS_COMMIT; + return ret; } - /* can't happen... */ - WARN_ON_ONCE(1); + /* don't support multiple buffer selections for legacy */ + return io_provided_buffers_select(req, &arg->max_len, bl, arg->iovs); } -static int __io_remove_buffers(struct io_ring_ctx *ctx, - struct io_buffer_list *bl, unsigned nbufs) +static inline bool __io_put_kbuf_ring(struct io_kiocb *req, int len, int nr) { - unsigned i = 0; - - /* shouldn't happen */ - if (!nbufs) - return 0; - - if (bl->is_mapped) { - i = bl->buf_ring->tail - bl->head; - if (bl->is_mmap) { - /* - * io_kbuf_list_free() will free the page(s) at - * ->release() time. - */ - io_kbuf_mark_free(ctx, bl); - bl->buf_ring = NULL; - bl->is_mmap = 0; - } else if (bl->buf_nr_pages) { - int j; - - for (j = 0; j < bl->buf_nr_pages; j++) - unpin_user_page(bl->buf_pages[j]); - kvfree(bl->buf_pages); - bl->buf_pages = NULL; - bl->buf_nr_pages = 0; - } - /* make sure it's seen as empty */ - INIT_LIST_HEAD(&bl->buf_list); - bl->is_mapped = 0; - return i; + struct io_buffer_list *bl = req->buf_list; + bool ret = true; + + if (bl) + ret = io_kbuf_commit(req, bl, len, nr); + + req->flags &= ~REQ_F_BUFFER_RING; + return ret; +} + +unsigned int __io_put_kbufs(struct io_kiocb *req, int len, int nbufs) +{ + unsigned int ret; + + ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); + + if (unlikely(!(req->flags & REQ_F_BUFFER_RING))) { + io_kbuf_drop_legacy(req); + return ret; } + if (!__io_put_kbuf_ring(req, len, nbufs)) + ret |= IORING_CQE_F_BUF_MORE; + return ret; +} + +static int io_remove_buffers_legacy(struct io_ring_ctx *ctx, + struct io_buffer_list *bl, + unsigned long nbufs) +{ + unsigned long i = 0; + struct io_buffer *nxt; + /* protects io_buffers_cache */ lockdep_assert_held(&ctx->uring_lock); + WARN_ON_ONCE(bl->flags & IOBL_BUF_RING); - while (!list_empty(&bl->buf_list)) { - struct io_buffer *nxt; - + for (i = 0; i < nbufs && !list_empty(&bl->buf_list); i++) { nxt = list_first_entry(&bl->buf_list, struct io_buffer, list); - list_move(&nxt->list, &ctx->io_buffers_cache); - if (++i == nbufs) - return i; + list_del(&nxt->list); + bl->nbufs--; + kfree(nxt); cond_resched(); } - return i; } +static void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) +{ + if (bl->flags & IOBL_BUF_RING) + io_free_region(ctx, &bl->region); + else + io_remove_buffers_legacy(ctx, bl, -1U); + + kfree(bl); +} + void io_destroy_buffers(struct io_ring_ctx *ctx) { struct io_buffer_list *bl; - struct list_head *item, *tmp; - struct io_buffer *buf; - unsigned long index; - int i; - for (i = 0; i < BGID_ARRAY; i++) { - if (!ctx->io_bl) - break; - __io_remove_buffers(ctx, &ctx->io_bl[i], -1U); - } + while (1) { + unsigned long index = 0; - xa_for_each(&ctx->io_bl_xa, index, bl) { - xa_erase(&ctx->io_bl_xa, bl->bgid); - __io_remove_buffers(ctx, bl, -1U); - kfree_rcu(bl, rcu); + scoped_guard(mutex, &ctx->mmap_lock) { + bl = xa_find(&ctx->io_bl_xa, &index, ULONG_MAX, XA_PRESENT); + if (bl) + xa_erase(&ctx->io_bl_xa, bl->bgid); + } + if (!bl) + break; + io_put_bl(ctx, bl); } +} - /* - * Move deferred locked entries to cache before pruning - */ - spin_lock(&ctx->completion_lock); - if (!list_empty(&ctx->io_buffers_comp)) - list_splice_init(&ctx->io_buffers_comp, &ctx->io_buffers_cache); - spin_unlock(&ctx->completion_lock); - - list_for_each_safe(item, tmp, &ctx->io_buffers_cache) { - buf = list_entry(item, struct io_buffer, list); - kmem_cache_free(io_buf_cachep, buf); - } +static void io_destroy_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) +{ + scoped_guard(mutex, &ctx->mmap_lock) + WARN_ON_ONCE(xa_erase(&ctx->io_bl_xa, bl->bgid) != bl); + io_put_bl(ctx, bl); } int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -356,30 +453,6 @@ int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf); - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - int ret = 0; - - io_ring_submit_lock(ctx, issue_flags); - - ret = -ENOENT; - bl = io_buffer_get_list(ctx, p->bgid); - if (bl) { - ret = -EINVAL; - /* can't use provide/remove buffers command on mapped buffers */ - if (!bl->is_mapped) - ret = __io_remove_buffers(ctx, bl, p->nbufs); - } - io_ring_submit_unlock(ctx, issue_flags); - if (ret < 0) - req_set_fail(req); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { unsigned long size, tmp_check; @@ -395,14 +468,14 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe p->nbufs = tmp; p->addr = READ_ONCE(sqe->addr); p->len = READ_ONCE(sqe->len); + if (!p->len) + return -EINVAL; if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs, &size)) return -EOVERFLOW; if (check_add_overflow((unsigned long)p->addr, size, &tmp_check)) return -EOVERFLOW; - - size = (unsigned long)p->len * p->nbufs; if (!access_ok(u64_to_user_ptr(p->addr), size)) return -EFAULT; @@ -416,67 +489,29 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe return 0; } -#define IO_BUFFER_ALLOC_BATCH 64 - -static int io_refill_buffer_cache(struct io_ring_ctx *ctx) -{ - struct io_buffer *bufs[IO_BUFFER_ALLOC_BATCH]; - int allocated; - - /* - * Completions that don't happen inline (eg not under uring_lock) will - * add to ->io_buffers_comp. If we don't have any free buffers, check - * the completion list and splice those entries first. - */ - if (!list_empty_careful(&ctx->io_buffers_comp)) { - spin_lock(&ctx->completion_lock); - if (!list_empty(&ctx->io_buffers_comp)) { - list_splice_init(&ctx->io_buffers_comp, - &ctx->io_buffers_cache); - spin_unlock(&ctx->completion_lock); - return 0; - } - spin_unlock(&ctx->completion_lock); - } - - /* - * No free buffers and no completion entries either. Allocate a new - * batch of buffer entries and add those to our freelist. - */ - - allocated = kmem_cache_alloc_bulk(io_buf_cachep, GFP_KERNEL_ACCOUNT, - ARRAY_SIZE(bufs), (void **) bufs); - if (unlikely(!allocated)) { - /* - * Bulk alloc is all-or-nothing. If we fail to get a batch, - * retry single alloc to be on the safe side. - */ - bufs[0] = kmem_cache_alloc(io_buf_cachep, GFP_KERNEL); - if (!bufs[0]) - return -ENOMEM; - allocated = 1; - } - - while (allocated) - list_add_tail(&bufs[--allocated]->list, &ctx->io_buffers_cache); - - return 0; -} - static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf, struct io_buffer_list *bl) { struct io_buffer *buf; u64 addr = pbuf->addr; - int i, bid = pbuf->bid; + int ret = -ENOMEM, i, bid = pbuf->bid; for (i = 0; i < pbuf->nbufs; i++) { - if (list_empty(&ctx->io_buffers_cache) && - io_refill_buffer_cache(ctx)) + /* + * Nonsensical to have more than sizeof(bid) buffers in a + * buffer list, as the application then has no way of knowing + * which duplicate bid refers to what buffer. + */ + if (bl->nbufs == USHRT_MAX) { + ret = -EOVERFLOW; + break; + } + buf = kmalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT); + if (!buf) break; - buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer, - list); - list_move_tail(&buf->list, &bl->buf_list); + + list_add_tail(&buf->list, &bl->buf_list); + bl->nbufs++; buf->addr = addr; buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT); buf->bid = bid; @@ -486,239 +521,133 @@ static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf, cond_resched(); } - return i ? 0 : -ENOMEM; + return i ? 0 : ret; } -int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) +static int __io_manage_buffers_legacy(struct io_kiocb *req, + struct io_buffer_list *bl) { struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf); - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - int ret = 0; - - io_ring_submit_lock(ctx, issue_flags); - - if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) { - ret = io_init_bl_list(ctx); - if (ret) - goto err; - } + int ret; - bl = io_buffer_get_list(ctx, p->bgid); - if (unlikely(!bl)) { + if (!bl) { + if (req->opcode != IORING_OP_PROVIDE_BUFFERS) + return -ENOENT; bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT); - if (!bl) { - ret = -ENOMEM; - goto err; - } + if (!bl) + return -ENOMEM; + INIT_LIST_HEAD(&bl->buf_list); - ret = io_buffer_add_list(ctx, bl, p->bgid); + ret = io_buffer_add_list(req->ctx, bl, p->bgid); if (ret) { - /* - * Doesn't need rcu free as it was never visible, but - * let's keep it consistent throughout. Also can't - * be a lower indexed array group, as adding one - * where lookup failed cannot happen. - */ - if (p->bgid >= BGID_ARRAY) - kfree_rcu(bl, rcu); - else - WARN_ON_ONCE(1); - goto err; + kfree(bl); + return ret; } } - /* can't add buffers via this command for a mapped buffer ring */ - if (bl->is_mapped) { - ret = -EINVAL; - goto err; - } + /* can't use provide/remove buffers command on mapped buffers */ + if (bl->flags & IOBL_BUF_RING) + return -EINVAL; + if (req->opcode == IORING_OP_PROVIDE_BUFFERS) + return io_add_buffers(req->ctx, p, bl); + return io_remove_buffers_legacy(req->ctx, bl, p->nbufs); +} + +int io_manage_buffers_legacy(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf); + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer_list *bl; + int ret; - ret = io_add_buffers(ctx, p, bl); -err: + io_ring_submit_lock(ctx, issue_flags); + bl = io_buffer_get_list(ctx, p->bgid); + ret = __io_manage_buffers_legacy(req, bl); io_ring_submit_unlock(ctx, issue_flags); if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; -} - -static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, - struct io_buffer_list *bl) -{ - struct io_uring_buf_ring *br; - struct page **pages; - int i, nr_pages; - - pages = io_pin_pages(reg->ring_addr, - flex_array_size(br, bufs, reg->ring_entries), - &nr_pages); - if (IS_ERR(pages)) - return PTR_ERR(pages); - - /* - * Apparently some 32-bit boxes (ARM) will return highmem pages, - * which then need to be mapped. We could support that, but it'd - * complicate the code and slowdown the common cases quite a bit. - * So just error out, returning -EINVAL just like we did on kernels - * that didn't support mapped buffer rings. - */ - for (i = 0; i < nr_pages; i++) - if (PageHighMem(pages[i])) - goto error_unpin; - - br = page_address(pages[0]); -#ifdef SHM_COLOUR - /* - * On platforms that have specific aliasing requirements, SHM_COLOUR - * is set and we must guarantee that the kernel and user side align - * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and - * the application mmap's the provided ring buffer. Fail the request - * if we, by chance, don't end up with aligned addresses. The app - * should use IOU_PBUF_RING_MMAP instead, and liburing will handle - * this transparently. - */ - if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) - goto error_unpin; -#endif - bl->buf_pages = pages; - bl->buf_nr_pages = nr_pages; - bl->buf_ring = br; - bl->is_mapped = 1; - bl->is_mmap = 0; - return 0; -error_unpin: - for (i = 0; i < nr_pages; i++) - unpin_user_page(pages[i]); - kvfree(pages); - return -EINVAL; -} - -/* - * See if we have a suitable region that we can reuse, rather than allocate - * both a new io_buf_free and mem region again. We leave it on the list as - * even a reused entry will need freeing at ring release. - */ -static struct io_buf_free *io_lookup_buf_free_entry(struct io_ring_ctx *ctx, - size_t ring_size) -{ - struct io_buf_free *ibf, *best = NULL; - size_t best_dist; - - hlist_for_each_entry(ibf, &ctx->io_buf_list, list) { - size_t dist; - - if (ibf->inuse || ibf->size < ring_size) - continue; - dist = ibf->size - ring_size; - if (!best || dist < best_dist) { - best = ibf; - if (!dist) - break; - best_dist = dist; - } - } - - return best; -} - -static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx, - struct io_uring_buf_reg *reg, - struct io_buffer_list *bl) -{ - struct io_buf_free *ibf; - size_t ring_size; - void *ptr; - - ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring); - - /* Reuse existing entry, if we can */ - ibf = io_lookup_buf_free_entry(ctx, ring_size); - if (!ibf) { - ptr = io_mem_alloc(ring_size); - if (IS_ERR(ptr)) - return PTR_ERR(ptr); - - /* Allocate and store deferred free entry */ - ibf = kmalloc(sizeof(*ibf), GFP_KERNEL_ACCOUNT); - if (!ibf) { - io_mem_free(ptr); - return -ENOMEM; - } - ibf->mem = ptr; - ibf->size = ring_size; - hlist_add_head(&ibf->list, &ctx->io_buf_list); - } - ibf->inuse = 1; - bl->buf_ring = ibf->mem; - bl->is_mapped = 1; - bl->is_mmap = 1; - return 0; + return IOU_COMPLETE; } int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) { struct io_uring_buf_reg reg; - struct io_buffer_list *bl, *free_bl = NULL; + struct io_buffer_list *bl; + struct io_uring_region_desc rd; + struct io_uring_buf_ring *br; + unsigned long mmap_offset; + unsigned long ring_size; int ret; lockdep_assert_held(&ctx->uring_lock); if (copy_from_user(®, arg, sizeof(reg))) return -EFAULT; - - if (reg.resv[0] || reg.resv[1] || reg.resv[2]) + if (!mem_is_zero(reg.resv, sizeof(reg.resv))) return -EINVAL; - if (reg.flags & ~IOU_PBUF_RING_MMAP) + if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC)) return -EINVAL; - if (!(reg.flags & IOU_PBUF_RING_MMAP)) { - if (!reg.ring_addr) - return -EFAULT; - if (reg.ring_addr & ~PAGE_MASK) - return -EINVAL; - } else { - if (reg.ring_addr) - return -EINVAL; - } - if (!is_power_of_2(reg.ring_entries)) return -EINVAL; - /* cannot disambiguate full vs empty due to head/tail size */ if (reg.ring_entries >= 65536) return -EINVAL; - if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) { - int ret = io_init_bl_list(ctx); - if (ret) - return ret; - } - bl = io_buffer_get_list(ctx, reg.bgid); if (bl) { /* if mapped buffer ring OR classic exists, don't allow */ - if (bl->is_mapped || !list_empty(&bl->buf_list)) + if (bl->flags & IOBL_BUF_RING || !list_empty(&bl->buf_list)) return -EEXIST; - } else { - free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL); - if (!bl) - return -ENOMEM; + io_destroy_bl(ctx, bl); } - if (!(reg.flags & IOU_PBUF_RING_MMAP)) - ret = io_pin_pbuf_ring(®, bl); - else - ret = io_alloc_pbuf_ring(ctx, ®, bl); + bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT); + if (!bl) + return -ENOMEM; - if (!ret) { - bl->nr_entries = reg.ring_entries; - bl->mask = reg.ring_entries - 1; + mmap_offset = (unsigned long)reg.bgid << IORING_OFF_PBUF_SHIFT; + ring_size = flex_array_size(br, bufs, reg.ring_entries); - io_buffer_add_list(ctx, bl, reg.bgid); - return 0; + memset(&rd, 0, sizeof(rd)); + rd.size = PAGE_ALIGN(ring_size); + if (!(reg.flags & IOU_PBUF_RING_MMAP)) { + rd.user_addr = reg.ring_addr; + rd.flags |= IORING_MEM_REGION_TYPE_USER; } + ret = io_create_region_mmap_safe(ctx, &bl->region, &rd, mmap_offset); + if (ret) + goto fail; + br = io_region_get_ptr(&bl->region); + +#ifdef SHM_COLOUR + /* + * On platforms that have specific aliasing requirements, SHM_COLOUR + * is set and we must guarantee that the kernel and user side align + * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and + * the application mmap's the provided ring buffer. Fail the request + * if we, by chance, don't end up with aligned addresses. The app + * should use IOU_PBUF_RING_MMAP instead, and liburing will handle + * this transparently. + */ + if (!(reg.flags & IOU_PBUF_RING_MMAP) && + ((reg.ring_addr | (unsigned long)br) & (SHM_COLOUR - 1))) { + ret = -EINVAL; + goto fail; + } +#endif - kfree_rcu(free_bl, rcu); + bl->nr_entries = reg.ring_entries; + bl->mask = reg.ring_entries - 1; + bl->flags |= IOBL_BUF_RING; + bl->buf_ring = br; + if (reg.flags & IOU_PBUF_RING_INC) + bl->flags |= IOBL_INC; + io_buffer_add_list(ctx, bl, reg.bgid); + return 0; +fail: + io_free_region(ctx, &bl->region); + kfree(bl); return ret; } @@ -731,22 +660,19 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (copy_from_user(®, arg, sizeof(reg))) return -EFAULT; - if (reg.resv[0] || reg.resv[1] || reg.resv[2]) - return -EINVAL; - if (reg.flags) + if (!mem_is_zero(reg.resv, sizeof(reg.resv)) || reg.flags) return -EINVAL; bl = io_buffer_get_list(ctx, reg.bgid); if (!bl) return -ENOENT; - if (!bl->is_mapped) + if (!(bl->flags & IOBL_BUF_RING)) return -EINVAL; - __io_remove_buffers(ctx, bl, -1U); - if (bl->bgid >= BGID_ARRAY) { + scoped_guard(mutex, &ctx->mmap_lock) xa_erase(&ctx->io_bl_xa, bl->bgid); - kfree_rcu(bl, rcu); - } + + io_put_bl(ctx, bl); return 0; } @@ -754,19 +680,16 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg) { struct io_uring_buf_status buf_status; struct io_buffer_list *bl; - int i; if (copy_from_user(&buf_status, arg, sizeof(buf_status))) return -EFAULT; - - for (i = 0; i < ARRAY_SIZE(buf_status.resv); i++) - if (buf_status.resv[i]) - return -EINVAL; + if (!mem_is_zero(buf_status.resv, sizeof(buf_status.resv))) + return -EINVAL; bl = io_buffer_get_list(ctx, buf_status.buf_group); if (!bl) return -ENOENT; - if (!bl->is_mapped) + if (!(bl->flags & IOBL_BUF_RING)) return -EINVAL; buf_status.head = bl->head; @@ -776,37 +699,15 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg) return 0; } -void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid) +struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx, + unsigned int bgid) { struct io_buffer_list *bl; - bl = __io_buffer_get_list(ctx, smp_load_acquire(&ctx->io_bl), bgid); + lockdep_assert_held(&ctx->mmap_lock); - if (!bl || !bl->is_mmap) - return NULL; - /* - * Ensure the list is fully setup. Only strictly needed for RCU lookup - * via mmap, and in that case only for the array indexed groups. For - * the xarray lookups, it's either visible and ready, or not at all. - */ - if (!smp_load_acquire(&bl->is_ready)) + bl = xa_load(&ctx->io_bl_xa, bgid); + if (!bl || !(bl->flags & IOBL_BUF_RING)) return NULL; - - return bl->buf_ring; -} - -/* - * Called at or after ->release(), free the mmap'ed buffers that we used - * for memory mapped provided buffer rings. - */ -void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx) -{ - struct io_buf_free *ibf; - struct hlist_node *tmp; - - hlist_for_each_entry_safe(ibf, tmp, &ctx->io_buf_list, list) { - hlist_del(&ibf->list); - io_mem_free(ibf->mem); - kfree(ibf); - } + return &bl->region; } |