diff options
Diffstat (limited to 'io_uring/rsrc.c')
-rw-r--r-- | io_uring/rsrc.c | 657 |
1 files changed, 249 insertions, 408 deletions
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 6f3b6de230bd..adaae8630932 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -13,7 +13,6 @@ #include <uapi/linux/io_uring.h> #include "io_uring.h" -#include "alloc_cache.h" #include "openclose.h" #include "rsrc.h" #include "memmap.h" @@ -26,21 +25,13 @@ struct io_rsrc_update { u32 offset; }; -static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); -static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, - struct io_mapped_ubuf **pimu, - struct page **last_hpage); +static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, + struct iovec *iov, struct page **last_hpage); /* only define max */ #define IORING_MAX_FIXED_FILES (1U << 20) #define IORING_MAX_REG_BUFFERS (1U << 14) -static const struct io_mapped_ubuf dummy_ubuf = { - /* set invalid range, so io_import_fixed() fails meeting it */ - .ubuf = -1UL, - .len = UINT_MAX, -}; - int __io_account_mem(struct user_struct *user, unsigned long nr_pages) { unsigned long page_limit, cur_pages, new_pages; @@ -110,13 +101,13 @@ static int io_buffer_validate(struct iovec *iov) return 0; } -static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot) +static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { - struct io_mapped_ubuf *imu = *slot; unsigned int i; - *slot = NULL; - if (imu != &dummy_ubuf) { + if (node->buf) { + struct io_mapped_ubuf *imu = node->buf; + if (!refcount_dec_and_test(&imu->refs)) return; for (i = 0; i < imu->nr_bvecs; i++) @@ -127,205 +118,40 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo } } -static void io_rsrc_put_work(struct io_rsrc_node *node) -{ - struct io_rsrc_put *prsrc = &node->item; - - if (prsrc->tag) - io_post_aux_cqe(node->ctx, prsrc->tag, 0, 0); - - switch (node->type) { - case IORING_RSRC_FILE: - fput(prsrc->file); - break; - case IORING_RSRC_BUFFER: - io_rsrc_buf_put(node->ctx, prsrc); - break; - default: - WARN_ON_ONCE(1); - break; - } -} - -void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node) -{ - if (!io_alloc_cache_put(&ctx->rsrc_node_cache, node)) - kfree(node); -} - -void io_rsrc_node_ref_zero(struct io_rsrc_node *node) - __must_hold(&node->ctx->uring_lock) +struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type) { - struct io_ring_ctx *ctx = node->ctx; + struct io_rsrc_node *node; - while (!list_empty(&ctx->rsrc_ref_list)) { - node = list_first_entry(&ctx->rsrc_ref_list, - struct io_rsrc_node, node); - /* recycle ref nodes in order */ - if (node->refs) - break; - list_del(&node->node); - - if (likely(!node->empty)) - io_rsrc_put_work(node); - io_rsrc_node_destroy(ctx, node); + node = kzalloc(sizeof(*node), GFP_KERNEL); + if (node) { + node->type = type; + node->refs = 1; } - if (list_empty(&ctx->rsrc_ref_list) && unlikely(ctx->rsrc_quiesce)) - wake_up_all(&ctx->rsrc_quiesce_wq); + return node; } -struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) +__cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data) { - struct io_rsrc_node *ref_node; - - ref_node = io_alloc_cache_get(&ctx->rsrc_node_cache); - if (!ref_node) { - ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); - if (!ref_node) - return NULL; + if (!data->nr) + return; + while (data->nr--) { + if (data->nodes[data->nr]) + io_put_rsrc_node(ctx, data->nodes[data->nr]); } - - ref_node->ctx = ctx; - ref_node->empty = 0; - ref_node->refs = 1; - return ref_node; + kvfree(data->nodes); + data->nodes = NULL; + data->nr = 0; } -__cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, - struct io_ring_ctx *ctx) +__cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr) { - struct io_rsrc_node *backup; - DEFINE_WAIT(we); - int ret; - - /* As We may drop ->uring_lock, other task may have started quiesce */ - if (data->quiesce) - return -ENXIO; - - backup = io_rsrc_node_alloc(ctx); - if (!backup) - return -ENOMEM; - ctx->rsrc_node->empty = true; - ctx->rsrc_node->type = -1; - list_add_tail(&ctx->rsrc_node->node, &ctx->rsrc_ref_list); - io_put_rsrc_node(ctx, ctx->rsrc_node); - ctx->rsrc_node = backup; - - if (list_empty(&ctx->rsrc_ref_list)) + data->nodes = kvmalloc_array(nr, sizeof(struct io_rsrc_node *), + GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (data->nodes) { + data->nr = nr; return 0; - - if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { - atomic_set(&ctx->cq_wait_nr, 1); - smp_mb(); - } - - ctx->rsrc_quiesce++; - data->quiesce = true; - do { - prepare_to_wait(&ctx->rsrc_quiesce_wq, &we, TASK_INTERRUPTIBLE); - mutex_unlock(&ctx->uring_lock); - - ret = io_run_task_work_sig(ctx); - if (ret < 0) { - finish_wait(&ctx->rsrc_quiesce_wq, &we); - mutex_lock(&ctx->uring_lock); - if (list_empty(&ctx->rsrc_ref_list)) - ret = 0; - break; - } - - schedule(); - mutex_lock(&ctx->uring_lock); - ret = 0; - } while (!list_empty(&ctx->rsrc_ref_list)); - - finish_wait(&ctx->rsrc_quiesce_wq, &we); - data->quiesce = false; - ctx->rsrc_quiesce--; - - if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { - atomic_set(&ctx->cq_wait_nr, 0); - smp_mb(); } - return ret; -} - -static void io_free_page_table(void **table, size_t size) -{ - unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); - - for (i = 0; i < nr_tables; i++) - kfree(table[i]); - kfree(table); -} - -static void io_rsrc_data_free(struct io_rsrc_data *data) -{ - size_t size = data->nr * sizeof(data->tags[0][0]); - - if (data->tags) - io_free_page_table((void **)data->tags, size); - kfree(data); -} - -static __cold void **io_alloc_page_table(size_t size) -{ - unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); - size_t init_size = size; - void **table; - - table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT); - if (!table) - return NULL; - - for (i = 0; i < nr_tables; i++) { - unsigned int this_size = min_t(size_t, size, PAGE_SIZE); - - table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT); - if (!table[i]) { - io_free_page_table(table, init_size); - return NULL; - } - size -= this_size; - } - return table; -} - -__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, int type, - u64 __user *utags, - unsigned nr, struct io_rsrc_data **pdata) -{ - struct io_rsrc_data *data; - int ret = 0; - unsigned i; - - data = kzalloc(sizeof(*data), GFP_KERNEL); - if (!data) - return -ENOMEM; - data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0])); - if (!data->tags) { - kfree(data); - return -ENOMEM; - } - - data->nr = nr; - data->ctx = ctx; - data->rsrc_type = type; - if (utags) { - ret = -EFAULT; - for (i = 0; i < nr; i++) { - u64 *tag_slot = io_get_tag_slot(data, i); - - if (copy_from_user(tag_slot, &utags[i], - sizeof(*tag_slot))) - goto fail; - } - } - *pdata = data; - return 0; -fail: - io_rsrc_data_free(data); - return ret; + return -ENOMEM; } static int __io_sqe_files_update(struct io_ring_ctx *ctx, @@ -334,14 +160,12 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, { u64 __user *tags = u64_to_user_ptr(up->tags); __s32 __user *fds = u64_to_user_ptr(up->data); - struct io_rsrc_data *data = ctx->file_data; - struct io_fixed_file *file_slot; int fd, i, err = 0; unsigned int done; - if (!ctx->file_data) + if (!ctx->file_table.data.nr) return -ENXIO; - if (up->offset + nr_args > ctx->nr_user_files) + if (up->offset + nr_args > ctx->file_table.data.nr) return -EINVAL; for (done = 0; done < nr_args; done++) { @@ -359,19 +183,13 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, if (fd == IORING_REGISTER_FILES_SKIP) continue; - i = array_index_nospec(up->offset + done, ctx->nr_user_files); - file_slot = io_fixed_file_slot(&ctx->file_table, i); - - if (file_slot->file_ptr) { - err = io_queue_rsrc_removal(data, i, - io_slot_file(file_slot)); - if (err) - break; - file_slot->file_ptr = 0; + i = up->offset + done; + if (io_reset_rsrc_node(ctx, &ctx->file_table.data, i)) io_file_bitmap_clear(&ctx->file_table, i); - } + if (fd != -1) { struct file *file = fget(fd); + struct io_rsrc_node *node; if (!file) { err = -EBADF; @@ -385,8 +203,16 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, err = -EBADF; break; } - *io_get_tag_slot(data, i) = tag; - io_fixed_file_set(file_slot, file); + node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE); + if (!node) { + err = -ENOMEM; + fput(file); + break; + } + ctx->file_table.data.nodes[i] = node; + if (tag) + node->tag = tag; + io_fixed_file_set(node, file); io_file_bitmap_set(&ctx->file_table, i); } } @@ -405,13 +231,13 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, __u32 done; int i, err; - if (!ctx->buf_data) + if (!ctx->buf_table.nr) return -ENXIO; - if (up->offset + nr_args > ctx->nr_user_bufs) + if (up->offset + nr_args > ctx->buf_table.nr) return -EINVAL; for (done = 0; done < nr_args; done++) { - struct io_mapped_ubuf *imu; + struct io_rsrc_node *node; u64 tag = 0; uvec = u64_to_user_ptr(user_data); @@ -427,27 +253,21 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, err = io_buffer_validate(iov); if (err) break; - if (!iov->iov_base && tag) { - err = -EINVAL; + node = io_sqe_buffer_register(ctx, iov, &last_hpage); + if (IS_ERR(node)) { + err = PTR_ERR(node); break; } - err = io_sqe_buffer_register(ctx, iov, &imu, &last_hpage); - if (err) - break; - - i = array_index_nospec(up->offset + done, ctx->nr_user_bufs); - if (ctx->user_bufs[i] != &dummy_ubuf) { - err = io_queue_rsrc_removal(ctx->buf_data, i, - ctx->user_bufs[i]); - if (unlikely(err)) { - io_buffer_unmap(ctx, &imu); + if (tag) { + if (!node) { + err = -EINVAL; break; } - ctx->user_bufs[i] = (struct io_mapped_ubuf *)&dummy_ubuf; + node->tag = tag; } - - ctx->user_bufs[i] = imu; - *io_get_tag_slot(ctx->buf_data, i) = tag; + i = array_index_nospec(up->offset + done, ctx->buf_table.nr); + io_reset_rsrc_node(ctx, &ctx->buf_table, i); + ctx->buf_table.nodes[i] = node; if (ctx->compat) user_data += sizeof(struct compat_iovec); else @@ -563,7 +383,7 @@ static int io_files_update_with_index_alloc(struct io_kiocb *req, struct file *file; int ret, fd; - if (!req->ctx->file_data) + if (!req->ctx->file_table.data.nr) return -ENXIO; for (done = 0; done < up->nr_args; done++) { @@ -622,65 +442,38 @@ int io_files_update(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } -int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc) +void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { - struct io_ring_ctx *ctx = data->ctx; - struct io_rsrc_node *node = ctx->rsrc_node; - u64 *tag_slot = io_get_tag_slot(data, idx); - - ctx->rsrc_node = io_rsrc_node_alloc(ctx); - if (unlikely(!ctx->rsrc_node)) { - ctx->rsrc_node = node; - return -ENOMEM; - } - - node->item.rsrc = rsrc; - node->type = data->rsrc_type; - node->item.tag = *tag_slot; - *tag_slot = 0; - list_add_tail(&node->node, &ctx->rsrc_ref_list); - io_put_rsrc_node(ctx, node); - return 0; -} - -void __io_sqe_files_unregister(struct io_ring_ctx *ctx) -{ - int i; + lockdep_assert_held(&ctx->uring_lock); - for (i = 0; i < ctx->nr_user_files; i++) { - struct file *file = io_file_from_index(&ctx->file_table, i); + if (node->tag) + io_post_aux_cqe(ctx, node->tag, 0, 0); - if (!file) - continue; - io_file_bitmap_clear(&ctx->file_table, i); - fput(file); + switch (node->type) { + case IORING_RSRC_FILE: + if (io_slot_file(node)) + fput(io_slot_file(node)); + break; + case IORING_RSRC_BUFFER: + if (node->buf) + io_buffer_unmap(ctx, node); + break; + default: + WARN_ON_ONCE(1); + break; } - io_free_file_tables(&ctx->file_table); - io_file_table_set_alloc_range(ctx, 0, 0); - io_rsrc_data_free(ctx->file_data); - ctx->file_data = NULL; - ctx->nr_user_files = 0; + kfree(node); } int io_sqe_files_unregister(struct io_ring_ctx *ctx) { - unsigned nr = ctx->nr_user_files; - int ret; - - if (!ctx->file_data) + if (!ctx->file_table.data.nr) return -ENXIO; - /* - * Quiesce may unlock ->uring_lock, and while it's not held - * prevent new requests using the table. - */ - ctx->nr_user_files = 0; - ret = io_rsrc_ref_quiesce(ctx->file_data, ctx); - ctx->nr_user_files = nr; - if (!ret) - __io_sqe_files_unregister(ctx); - return ret; + io_free_file_tables(ctx, &ctx->file_table); + io_file_table_set_alloc_range(ctx, 0, 0); + return 0; } int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, @@ -691,7 +484,7 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, int fd, ret; unsigned i; - if (ctx->file_data) + if (ctx->file_table.data.nr) return -EBUSY; if (!nr_args) return -EINVAL; @@ -699,28 +492,22 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return -EMFILE; if (nr_args > rlimit(RLIMIT_NOFILE)) return -EMFILE; - ret = io_rsrc_data_alloc(ctx, IORING_RSRC_FILE, tags, nr_args, - &ctx->file_data); - if (ret) - return ret; - - if (!io_alloc_file_tables(&ctx->file_table, nr_args)) { - io_rsrc_data_free(ctx->file_data); - ctx->file_data = NULL; + if (!io_alloc_file_tables(ctx, &ctx->file_table, nr_args)) return -ENOMEM; - } - for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { - struct io_fixed_file *file_slot; + for (i = 0; i < nr_args; i++) { + struct io_rsrc_node *node; + u64 tag = 0; - if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) { - ret = -EFAULT; + ret = -EFAULT; + if (tags && copy_from_user(&tag, &tags[i], sizeof(tag))) + goto fail; + if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) goto fail; - } /* allow sparse sets */ if (!fds || fd == -1) { ret = -EINVAL; - if (unlikely(*io_get_tag_slot(ctx->file_data, i))) + if (tag) goto fail; continue; } @@ -737,56 +524,33 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, fput(file); goto fail; } - file_slot = io_fixed_file_slot(&ctx->file_table, i); - io_fixed_file_set(file_slot, file); + ret = -ENOMEM; + node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE); + if (!node) { + fput(file); + goto fail; + } + if (tag) + node->tag = tag; + ctx->file_table.data.nodes[i] = node; + io_fixed_file_set(node, file); io_file_bitmap_set(&ctx->file_table, i); } /* default it to the whole table */ - io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files); + io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr); return 0; fail: - __io_sqe_files_unregister(ctx); + io_sqe_files_unregister(ctx); return ret; } -static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) -{ - io_buffer_unmap(ctx, &prsrc->buf); - prsrc->buf = NULL; -} - -void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx) -{ - unsigned int i; - - for (i = 0; i < ctx->nr_user_bufs; i++) - io_buffer_unmap(ctx, &ctx->user_bufs[i]); - kfree(ctx->user_bufs); - io_rsrc_data_free(ctx->buf_data); - ctx->user_bufs = NULL; - ctx->buf_data = NULL; - ctx->nr_user_bufs = 0; -} - int io_sqe_buffers_unregister(struct io_ring_ctx *ctx) { - unsigned nr = ctx->nr_user_bufs; - int ret; - - if (!ctx->buf_data) + if (!ctx->buf_table.nr) return -ENXIO; - - /* - * Quiesce may unlock ->uring_lock, and while it's not held - * prevent new requests using the table. - */ - ctx->nr_user_bufs = 0; - ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx); - ctx->nr_user_bufs = nr; - if (!ret) - __io_sqe_buffers_unregister(ctx); - return ret; + io_rsrc_data_free(ctx, &ctx->buf_table); + return 0; } /* @@ -812,9 +576,13 @@ static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages, } /* check previously registered pages */ - for (i = 0; i < ctx->nr_user_bufs; i++) { - struct io_mapped_ubuf *imu = ctx->user_bufs[i]; + for (i = 0; i < ctx->buf_table.nr; i++) { + struct io_rsrc_node *node = ctx->buf_table.nodes[i]; + struct io_mapped_ubuf *imu; + if (!node) + continue; + imu = node->buf; for (j = 0; j < imu->nr_bvecs; j++) { if (!PageCompound(imu->bvec[j].bv_page)) continue; @@ -950,21 +718,26 @@ static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages, return io_do_coalesce_buffer(pages, nr_pages, data, nr_folios); } -static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, - struct io_mapped_ubuf **pimu, - struct page **last_hpage) +static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, + struct iovec *iov, + struct page **last_hpage) { struct io_mapped_ubuf *imu = NULL; struct page **pages = NULL; + struct io_rsrc_node *node; unsigned long off; size_t size; int ret, nr_pages, i; struct io_imu_folio_data data; bool coalesced; - *pimu = (struct io_mapped_ubuf *)&dummy_ubuf; if (!iov->iov_base) - return 0; + return NULL; + + node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); + if (!node) + return ERR_PTR(-ENOMEM); + node->buf = NULL; ret = -ENOMEM; pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len, @@ -998,7 +771,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, imu->folio_shift = data.folio_shift; refcount_set(&imu->refs, 1); off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1); - *pimu = imu; + node->buf = imu; ret = 0; for (i = 0; i < nr_pages; i++) { @@ -1010,46 +783,42 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, size -= vec_len; } done: - if (ret) + if (ret) { kvfree(imu); + if (node) + io_put_rsrc_node(ctx, node); + node = ERR_PTR(ret); + } kvfree(pages); - return ret; -} - -static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args) -{ - ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL); - return ctx->user_bufs ? 0 : -ENOMEM; + return node; } int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, unsigned int nr_args, u64 __user *tags) { struct page *last_hpage = NULL; - struct io_rsrc_data *data; + struct io_rsrc_data data; struct iovec fast_iov, *iov = &fast_iov; const struct iovec __user *uvec; int i, ret; BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); - if (ctx->user_bufs) + if (ctx->buf_table.nr) return -EBUSY; if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS) return -EINVAL; - ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, tags, nr_args, &data); + ret = io_rsrc_data_alloc(&data, nr_args); if (ret) return ret; - ret = io_buffers_map_alloc(ctx, nr_args); - if (ret) { - io_rsrc_data_free(data); - return ret; - } if (!arg) memset(iov, 0, sizeof(*iov)); - for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) { + for (i = 0; i < nr_args; i++) { + struct io_rsrc_node *node; + u64 tag = 0; + if (arg) { uvec = (struct iovec __user *) arg; iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat); @@ -1066,22 +835,31 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, arg += sizeof(struct iovec); } - if (!iov->iov_base && *io_get_tag_slot(data, i)) { - ret = -EINVAL; - break; + if (tags) { + if (copy_from_user(&tag, &tags[i], sizeof(tag))) { + ret = -EFAULT; + break; + } } - ret = io_sqe_buffer_register(ctx, iov, &ctx->user_bufs[i], - &last_hpage); - if (ret) + node = io_sqe_buffer_register(ctx, iov, &last_hpage); + if (IS_ERR(node)) { + ret = PTR_ERR(node); break; + } + if (tag) { + if (!node) { + ret = -EINVAL; + break; + } + node->tag = tag; + } + data.nodes[i] = node; } - WARN_ON_ONCE(ctx->buf_data); - - ctx->buf_data = data; + ctx->buf_table = data; if (ret) - __io_sqe_buffers_unregister(ctx); + io_sqe_buffers_unregister(ctx); return ret; } @@ -1127,7 +905,6 @@ int io_import_fixed(int ddir, struct iov_iter *iter, const struct bio_vec *bvec = imu->bvec; if (offset < bvec->bv_len) { - iter->bvec = bvec; iter->count -= offset; iter->iov_offset = offset; } else { @@ -1137,7 +914,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter, offset -= bvec->bv_len; seg_skip = 1 + (offset >> imu->folio_shift); - iter->bvec = bvec + seg_skip; + iter->bvec += seg_skip; iter->nr_segs -= seg_skip; iter->count -= bvec->bv_len + offset; iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1); @@ -1147,11 +924,43 @@ int io_import_fixed(int ddir, struct iov_iter *iter, return 0; } -static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx) +static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx, + struct io_uring_clone_buffers *arg) { - struct io_mapped_ubuf **user_bufs; - struct io_rsrc_data *data; - int i, ret, nbufs; + struct io_rsrc_data data; + int i, ret, off, nr; + unsigned int nbufs; + + /* if offsets are given, must have nr specified too */ + if (!arg->nr && (arg->dst_off || arg->src_off)) + return -EINVAL; + /* not allowed unless REPLACE is set */ + if (ctx->buf_table.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE)) + return -EBUSY; + + nbufs = READ_ONCE(src_ctx->buf_table.nr); + if (!arg->nr) + arg->nr = nbufs; + else if (arg->nr > nbufs) + return -EINVAL; + else if (arg->nr > IORING_MAX_REG_BUFFERS) + return -EINVAL; + if (check_add_overflow(arg->nr, arg->dst_off, &nbufs)) + return -EOVERFLOW; + + ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr)); + if (ret) + return ret; + + /* Fill entries in data from dst that won't overlap with src */ + for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) { + struct io_rsrc_node *src_node = ctx->buf_table.nodes[i]; + + if (src_node) { + data.nodes[i] = src_node; + src_node->refs++; + } + } /* * Drop our own lock here. We'll setup the data we need and reference @@ -1161,45 +970,77 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx mutex_lock(&src_ctx->uring_lock); ret = -ENXIO; - nbufs = src_ctx->nr_user_bufs; + nbufs = src_ctx->buf_table.nr; if (!nbufs) goto out_unlock; - ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, NULL, nbufs, &data); - if (ret) + ret = -EINVAL; + if (!arg->nr) + arg->nr = nbufs; + else if (arg->nr > nbufs) + goto out_unlock; + ret = -EOVERFLOW; + if (check_add_overflow(arg->nr, arg->src_off, &off)) + goto out_unlock; + if (off > nbufs) goto out_unlock; - ret = -ENOMEM; - user_bufs = kcalloc(nbufs, sizeof(*ctx->user_bufs), GFP_KERNEL); - if (!user_bufs) - goto out_free_data; + off = arg->dst_off; + i = arg->src_off; + nr = arg->nr; + while (nr--) { + struct io_rsrc_node *dst_node, *src_node; - for (i = 0; i < nbufs; i++) { - struct io_mapped_ubuf *src = src_ctx->user_bufs[i]; + src_node = io_rsrc_node_lookup(&src_ctx->buf_table, i); + if (!src_node) { + dst_node = NULL; + } else { + dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); + if (!dst_node) { + ret = -ENOMEM; + goto out_put_free; + } - if (src != &dummy_ubuf) - refcount_inc(&src->refs); - user_bufs[i] = src; + refcount_inc(&src_node->buf->refs); + dst_node->buf = src_node->buf; + } + data.nodes[off++] = dst_node; + i++; } /* Have a ref on the bufs now, drop src lock and re-grab our own lock */ mutex_unlock(&src_ctx->uring_lock); mutex_lock(&ctx->uring_lock); - if (!ctx->user_bufs) { - ctx->user_bufs = user_bufs; - ctx->buf_data = data; - ctx->nr_user_bufs = nbufs; + + /* + * If asked for replace, put the old table. data->nodes[] holds both + * old and new nodes at this point. + */ + if (arg->flags & IORING_REGISTER_DST_REPLACE) + io_rsrc_data_free(ctx, &ctx->buf_table); + + /* + * ctx->buf_table should be empty now - either the contents are being + * replaced and we just freed the table, or someone raced setting up + * a buffer table while the clone was happening. If not empty, fall + * through to failure handling. + */ + if (!ctx->buf_table.nr) { + ctx->buf_table = data; return 0; } + mutex_unlock(&ctx->uring_lock); + mutex_lock(&src_ctx->uring_lock); /* someone raced setting up buffers, dump ours */ - for (i = 0; i < nbufs; i++) - io_buffer_unmap(ctx, &user_bufs[i]); - io_rsrc_data_free(data); - kfree(user_bufs); - return -EBUSY; -out_free_data: - io_rsrc_data_free(data); + ret = -EBUSY; +out_put_free: + i = data.nr; + while (i--) { + io_buffer_unmap(src_ctx, data.nodes[i]); + kfree(data.nodes[i]); + } out_unlock: + io_rsrc_data_free(ctx, &data); mutex_unlock(&src_ctx->uring_lock); mutex_lock(&ctx->uring_lock); return ret; @@ -1219,12 +1060,12 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg) struct file *file; int ret; - if (ctx->user_bufs || ctx->nr_user_bufs) - return -EBUSY; if (copy_from_user(&buf, arg, sizeof(buf))) return -EFAULT; - if (buf.flags & ~IORING_REGISTER_SRC_REGISTERED) + if (buf.flags & ~(IORING_REGISTER_SRC_REGISTERED|IORING_REGISTER_DST_REPLACE)) return -EINVAL; + if (!(buf.flags & IORING_REGISTER_DST_REPLACE) && ctx->buf_table.nr) + return -EBUSY; if (memchr_inv(buf.pad, 0, sizeof(buf.pad))) return -EINVAL; @@ -1232,7 +1073,7 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg) file = io_uring_register_get_file(buf.src_fd, registered_src); if (IS_ERR(file)) return PTR_ERR(file); - ret = io_clone_buffers(ctx, file->private_data); + ret = io_clone_buffers(ctx, file->private_data, &buf); if (!registered_src) fput(file); return ret; |