diff options
Diffstat (limited to 'fs/io_uring.c')
-rw-r--r-- | fs/io_uring.c | 368 |
1 files changed, 214 insertions, 154 deletions
diff --git a/fs/io_uring.c b/fs/io_uring.c index aa8ac557493c..f9a38998f2fc 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -197,6 +197,7 @@ struct io_ring_ctx { unsigned sq_entries; unsigned sq_mask; unsigned sq_thread_idle; + unsigned cached_sq_dropped; struct io_uring_sqe *sq_sqes; struct list_head defer_list; @@ -212,6 +213,7 @@ struct io_ring_ctx { struct { unsigned cached_cq_tail; + atomic_t cached_cq_overflow; unsigned cq_entries; unsigned cq_mask; struct wait_queue_head cq_wait; @@ -322,6 +324,8 @@ struct io_kiocb { #define REQ_F_FAIL_LINK 256 /* fail rest of links */ #define REQ_F_SHADOW_DRAIN 512 /* link-drain shadow req */ #define REQ_F_TIMEOUT 1024 /* timeout request */ +#define REQ_F_ISREG 2048 /* regular file */ +#define REQ_F_MUST_PUNT 4096 /* must be punted even for NONBLOCK */ u64 user_data; u32 result; u32 sequence; @@ -415,27 +419,28 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) return ctx; } +static inline bool __io_sequence_defer(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + return req->sequence != ctx->cached_cq_tail + ctx->cached_sq_dropped + + atomic_read(&ctx->cached_cq_overflow); +} + static inline bool io_sequence_defer(struct io_ring_ctx *ctx, struct io_kiocb *req) { - /* timeout requests always honor sequence */ - if (!(req->flags & REQ_F_TIMEOUT) && - (req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN) + if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN) return false; - return req->sequence != ctx->cached_cq_tail + ctx->rings->sq_dropped; + return __io_sequence_defer(ctx, req); } -static struct io_kiocb *__io_get_deferred_req(struct io_ring_ctx *ctx, - struct list_head *list) +static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx) { struct io_kiocb *req; - if (list_empty(list)) - return NULL; - - req = list_first_entry(list, struct io_kiocb, list); - if (!io_sequence_defer(ctx, req)) { + req = list_first_entry_or_null(&ctx->defer_list, struct io_kiocb, list); + if (req && !io_sequence_defer(ctx, req)) { list_del_init(&req->list); return req; } @@ -443,14 +448,17 @@ static struct io_kiocb *__io_get_deferred_req(struct io_ring_ctx *ctx, return NULL; } -static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx) -{ - return __io_get_deferred_req(ctx, &ctx->defer_list); -} - static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx) { - return __io_get_deferred_req(ctx, &ctx->timeout_list); + struct io_kiocb *req; + + req = list_first_entry_or_null(&ctx->timeout_list, struct io_kiocb, list); + if (req && !__io_sequence_defer(ctx, req)) { + list_del_init(&req->list); + return req; + } + + return NULL; } static void __io_commit_cqring(struct io_ring_ctx *ctx) @@ -562,9 +570,8 @@ static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data, WRITE_ONCE(cqe->res, res); WRITE_ONCE(cqe->flags, 0); } else { - unsigned overflow = READ_ONCE(ctx->rings->cq_overflow); - - WRITE_ONCE(ctx->rings->cq_overflow, overflow + 1); + WRITE_ONCE(ctx->rings->cq_overflow, + atomic_inc_return(&ctx->cached_cq_overflow)); } } @@ -591,14 +598,6 @@ static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data, io_cqring_ev_posted(ctx); } -static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs) -{ - percpu_ref_put_many(&ctx->refs, refs); - - if (waitqueue_active(&ctx->wait)) - wake_up(&ctx->wait); -} - static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, struct io_submit_state *state) { @@ -646,7 +645,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, req->result = 0; return req; out: - io_ring_drop_ctx_refs(ctx, 1); + percpu_ref_put(&ctx->refs); return NULL; } @@ -654,7 +653,7 @@ static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr) { if (*nr) { kmem_cache_free_bulk(req_cachep, *nr, reqs); - io_ring_drop_ctx_refs(ctx, *nr); + percpu_ref_put_many(&ctx->refs, *nr); *nr = 0; } } @@ -663,7 +662,7 @@ static void __io_free_req(struct io_kiocb *req) { if (req->file && !(req->flags & REQ_F_FIXED_FILE)) fput(req->file); - io_ring_drop_ctx_refs(req->ctx, 1); + percpu_ref_put(&req->ctx->refs); kmem_cache_free(req_cachep, req); } @@ -738,6 +737,14 @@ static unsigned io_cqring_events(struct io_rings *rings) return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head); } +static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) +{ + struct io_rings *rings = ctx->rings; + + /* make sure SQ entry isn't read before tail */ + return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; +} + /* * Find and free completed poll iocbs */ @@ -867,19 +874,11 @@ static void io_iopoll_reap_events(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); } -static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, - long min) +static int __io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, + long min) { - int iters, ret = 0; + int iters = 0, ret = 0; - /* - * We disallow the app entering submit/complete with polling, but we - * still need to lock the ring to prevent racing with polled issue - * that got punted to a workqueue. - */ - mutex_lock(&ctx->uring_lock); - - iters = 0; do { int tmin = 0; @@ -915,30 +914,45 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, ret = 0; } while (min && !*nr_events && !need_resched()); + return ret; +} + +static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, + long min) +{ + int ret; + + /* + * We disallow the app entering submit/complete with polling, but we + * still need to lock the ring to prevent racing with polled issue + * that got punted to a workqueue. + */ + mutex_lock(&ctx->uring_lock); + ret = __io_iopoll_check(ctx, nr_events, min); mutex_unlock(&ctx->uring_lock); return ret; } -static void kiocb_end_write(struct kiocb *kiocb) +static void kiocb_end_write(struct io_kiocb *req) { - if (kiocb->ki_flags & IOCB_WRITE) { - struct inode *inode = file_inode(kiocb->ki_filp); + /* + * Tell lockdep we inherited freeze protection from submission + * thread. + */ + if (req->flags & REQ_F_ISREG) { + struct inode *inode = file_inode(req->file); - /* - * Tell lockdep we inherited freeze protection from submission - * thread. - */ - if (S_ISREG(inode->i_mode)) - __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); - file_end_write(kiocb->ki_filp); + __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); } + file_end_write(req->file); } static void io_complete_rw(struct kiocb *kiocb, long res, long res2) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); - kiocb_end_write(kiocb); + if (kiocb->ki_flags & IOCB_WRITE) + kiocb_end_write(req); if ((req->flags & REQ_F_LINK) && res != req->result) req->flags |= REQ_F_FAIL_LINK; @@ -950,7 +964,8 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); - kiocb_end_write(kiocb); + if (kiocb->ki_flags & IOCB_WRITE) + kiocb_end_write(req); if ((req->flags & REQ_F_LINK) && res != req->result) req->flags |= REQ_F_FAIL_LINK; @@ -1064,8 +1079,17 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, if (!req->file) return -EBADF; - if (force_nonblock && !io_file_supports_async(req->file)) - force_nonblock = false; + if (S_ISREG(file_inode(req->file)->i_mode)) + req->flags |= REQ_F_ISREG; + + /* + * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so + * we know to async punt it even if it was opened O_NONBLOCK + */ + if (force_nonblock && !io_file_supports_async(req->file)) { + req->flags |= REQ_F_MUST_PUNT; + return -EAGAIN; + } kiocb->ki_pos = READ_ONCE(sqe->off); kiocb->ki_flags = iocb_flags(kiocb->ki_filp); @@ -1086,7 +1110,8 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, return ret; /* don't allow async punt if RWF_NOWAIT was requested */ - if (kiocb->ki_flags & IOCB_NOWAIT) + if ((kiocb->ki_flags & IOCB_NOWAIT) || + (req->file->f_flags & O_NONBLOCK)) req->flags |= REQ_F_NOWAIT; if (force_nonblock) @@ -1099,6 +1124,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, kiocb->ki_flags |= IOCB_HIPRI; kiocb->ki_complete = io_complete_rw_iopoll; + req->result = 0; } else { if (kiocb->ki_flags & IOCB_HIPRI) return -EINVAL; @@ -1387,7 +1413,9 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, * need async punt anyway, so it's more efficient to do it * here. */ - if (force_nonblock && ret2 > 0 && ret2 < read_size) + if (force_nonblock && !(req->flags & REQ_F_NOWAIT) && + (req->flags & REQ_F_ISREG) && + ret2 > 0 && ret2 < read_size) ret2 = -EAGAIN; /* Catch -EAGAIN return for forced non-blocking submission */ if (!force_nonblock || ret2 != -EAGAIN) { @@ -1452,7 +1480,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s, * released so that it doesn't complain about the held lock when * we return to userspace. */ - if (S_ISREG(file_inode(file)->i_mode)) { + if (req->flags & REQ_F_ISREG) { __sb_start_write(file_inode(file)->i_sb, SB_FREEZE_WRITE, true); __sb_writers_release(file_inode(file)->i_sb, @@ -1867,7 +1895,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe) static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) { struct io_ring_ctx *ctx; - struct io_kiocb *req; + struct io_kiocb *req, *prev; unsigned long flags; req = container_of(timer, struct io_kiocb, timeout.timer); @@ -1875,6 +1903,15 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) atomic_inc(&ctx->cq_timeouts); spin_lock_irqsave(&ctx->completion_lock, flags); + /* + * Adjust the reqs sequence before the current one because it + * will consume a slot in the cq_ring and the the cq_tail pointer + * will be increased, otherwise other timeout reqs may return in + * advance without waiting for enough wait_nr. + */ + prev = req; + list_for_each_entry_continue_reverse(prev, &ctx->timeout_list, list) + prev->sequence++; list_del(&req->list); io_cqring_fill_event(ctx, req->user_data, -ETIME); @@ -1889,18 +1926,19 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - unsigned count, req_dist, tail_index; + unsigned count; struct io_ring_ctx *ctx = req->ctx; struct list_head *entry; - struct timespec ts; + struct timespec64 ts; + unsigned span = 0; if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->timeout_flags || sqe->len != 1) return -EINVAL; - if (copy_from_user(&ts, (void __user *) (unsigned long) sqe->addr, - sizeof(ts))) + + if (get_timespec64(&ts, u64_to_user_ptr(sqe->addr))) return -EFAULT; /* @@ -1912,29 +1950,52 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) count = 1; req->sequence = ctx->cached_sq_head + count - 1; + /* reuse it to store the count */ + req->submit.sequence = count; req->flags |= REQ_F_TIMEOUT; /* * Insertion sort, ensuring the first entry in the list is always * the one we need first. */ - tail_index = ctx->cached_cq_tail - ctx->rings->sq_dropped; - req_dist = req->sequence - tail_index; spin_lock_irq(&ctx->completion_lock); list_for_each_prev(entry, &ctx->timeout_list) { struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list); - unsigned dist; + unsigned nxt_sq_head; + long long tmp, tmp_nxt; - dist = nxt->sequence - tail_index; - if (req_dist >= dist) + /* + * Since cached_sq_head + count - 1 can overflow, use type long + * long to store it. + */ + tmp = (long long)ctx->cached_sq_head + count - 1; + nxt_sq_head = nxt->sequence - nxt->submit.sequence + 1; + tmp_nxt = (long long)nxt_sq_head + nxt->submit.sequence - 1; + + /* + * cached_sq_head may overflow, and it will never overflow twice + * once there is some timeout req still be valid. + */ + if (ctx->cached_sq_head < nxt_sq_head) + tmp += UINT_MAX; + + if (tmp > tmp_nxt) break; + + /* + * Sequence of reqs after the insert one and itself should + * be adjusted because each timeout req consumes a slot. + */ + span++; + nxt->sequence++; } + req->sequence -= span; list_add(&req->list, entry); spin_unlock_irq(&ctx->completion_lock); hrtimer_init(&req->timeout.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); req->timeout.timer.function = io_timeout_fn; - hrtimer_start(&req->timeout.timer, timespec_to_ktime(ts), + hrtimer_start(&req->timeout.timer, timespec64_to_ktime(ts), HRTIMER_MODE_REL); return 0; } @@ -2267,12 +2328,18 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s, } static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, - struct sqe_submit *s, bool force_nonblock) + struct sqe_submit *s) { int ret; - ret = __io_submit_sqe(ctx, req, s, force_nonblock); - if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { + ret = __io_submit_sqe(ctx, req, s, true); + + /* + * We async punt it if the file wasn't marked NOWAIT, or if the file + * doesn't support non-blocking read/write attempts + */ + if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) || + (req->flags & REQ_F_MUST_PUNT))) { struct io_uring_sqe *sqe_copy; sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL); @@ -2312,7 +2379,7 @@ static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, } static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, - struct sqe_submit *s, bool force_nonblock) + struct sqe_submit *s) { int ret; @@ -2325,18 +2392,17 @@ static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, return 0; } - return __io_queue_sqe(ctx, req, s, force_nonblock); + return __io_queue_sqe(ctx, req, s); } static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req, - struct sqe_submit *s, struct io_kiocb *shadow, - bool force_nonblock) + struct sqe_submit *s, struct io_kiocb *shadow) { int ret; int need_submit = false; if (!shadow) - return io_queue_sqe(ctx, req, s, force_nonblock); + return io_queue_sqe(ctx, req, s); /* * Mark the first IO in link list as DRAIN, let all the following @@ -2348,6 +2414,7 @@ static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req, if (ret) { if (ret != -EIOCBQUEUED) { io_free_req(req); + __io_free_req(shadow); io_cqring_add_event(ctx, s->sqe->user_data, ret); return 0; } @@ -2365,7 +2432,7 @@ static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req, spin_unlock_irq(&ctx->completion_lock); if (need_submit) - return __io_queue_sqe(ctx, req, s, force_nonblock); + return __io_queue_sqe(ctx, req, s); return 0; } @@ -2373,8 +2440,7 @@ static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req, #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK) static void io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, - struct io_submit_state *state, struct io_kiocb **link, - bool force_nonblock) + struct io_submit_state *state, struct io_kiocb **link) { struct io_uring_sqe *sqe_copy; struct io_kiocb *req; @@ -2401,6 +2467,8 @@ err: return; } + req->user_data = s->sqe->user_data; + /* * If we already have a head request, queue this one for async * submittal once the head completes. If we don't have a head but @@ -2427,7 +2495,7 @@ err: INIT_LIST_HEAD(&req->link_list); *link = req; } else { - io_queue_sqe(ctx, req, s, force_nonblock); + io_queue_sqe(ctx, req, s); } } @@ -2507,12 +2575,13 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) /* drop invalid entries */ ctx->cached_sq_head++; - rings->sq_dropped++; + ctx->cached_sq_dropped++; + WRITE_ONCE(rings->sq_dropped, ctx->cached_sq_dropped); return false; } -static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes, - unsigned int nr, bool has_user, bool mm_fault) +static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, + bool has_user, bool mm_fault) { struct io_submit_state state, *statep = NULL; struct io_kiocb *link = NULL; @@ -2526,19 +2595,23 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes, } for (i = 0; i < nr; i++) { + struct sqe_submit s; + + if (!io_get_sqring(ctx, &s)) + break; + /* * If previous wasn't linked and we have a linked command, * that's the end of the chain. Submit the previous link. */ if (!prev_was_link && link) { - io_queue_link_head(ctx, link, &link->submit, shadow_req, - true); + io_queue_link_head(ctx, link, &link->submit, shadow_req); link = NULL; shadow_req = NULL; } - prev_was_link = (sqes[i].sqe->flags & IOSQE_IO_LINK) != 0; + prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0; - if (link && (sqes[i].sqe->flags & IOSQE_IO_DRAIN)) { + if (link && (s.sqe->flags & IOSQE_IO_DRAIN)) { if (!shadow_req) { shadow_req = io_get_req(ctx, NULL); if (unlikely(!shadow_req)) @@ -2546,24 +2619,24 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes, shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN); refcount_dec(&shadow_req->refs); } - shadow_req->sequence = sqes[i].sequence; + shadow_req->sequence = s.sequence; } out: if (unlikely(mm_fault)) { - io_cqring_add_event(ctx, sqes[i].sqe->user_data, + io_cqring_add_event(ctx, s.sqe->user_data, -EFAULT); } else { - sqes[i].has_user = has_user; - sqes[i].needs_lock = true; - sqes[i].needs_fixed_file = true; - io_submit_sqe(ctx, &sqes[i], statep, &link, true); + s.has_user = has_user; + s.needs_lock = true; + s.needs_fixed_file = true; + io_submit_sqe(ctx, &s, statep, &link); submitted++; } } if (link) - io_queue_link_head(ctx, link, &link->submit, shadow_req, true); + io_queue_link_head(ctx, link, &link->submit, shadow_req); if (statep) io_submit_state_end(&state); @@ -2572,7 +2645,6 @@ out: static int io_sq_thread(void *data) { - struct sqe_submit sqes[IO_IOPOLL_BATCH]; struct io_ring_ctx *ctx = data; struct mm_struct *cur_mm = NULL; mm_segment_t old_fs; @@ -2587,14 +2659,27 @@ static int io_sq_thread(void *data) timeout = inflight = 0; while (!kthread_should_park()) { - bool all_fixed, mm_fault = false; - int i; + bool mm_fault = false; + unsigned int to_submit; if (inflight) { unsigned nr_events = 0; if (ctx->flags & IORING_SETUP_IOPOLL) { - io_iopoll_check(ctx, &nr_events, 0); + /* + * inflight is the count of the maximum possible + * entries we submitted, but it can be smaller + * if we dropped some of them. If we don't have + * poll entries available, then we know that we + * have nothing left to poll for. Reset the + * inflight count to zero in that case. + */ + mutex_lock(&ctx->uring_lock); + if (!list_empty(&ctx->poll_list)) + __io_iopoll_check(ctx, &nr_events, 0); + else + inflight = 0; + mutex_unlock(&ctx->uring_lock); } else { /* * Normal IO, just pretend everything completed. @@ -2608,7 +2693,8 @@ static int io_sq_thread(void *data) timeout = jiffies + ctx->sq_thread_idle; } - if (!io_get_sqring(ctx, &sqes[0])) { + to_submit = io_sqring_entries(ctx); + if (!to_submit) { /* * We're polling. If we're within the defined idle * period, then let us spin without work before going @@ -2639,7 +2725,8 @@ static int io_sq_thread(void *data) /* make sure to read SQ tail after writing flags */ smp_mb(); - if (!io_get_sqring(ctx, &sqes[0])) { + to_submit = io_sqring_entries(ctx); + if (!to_submit) { if (kthread_should_park()) { finish_wait(&ctx->sqo_wait, &wait); break; @@ -2657,19 +2744,8 @@ static int io_sq_thread(void *data) ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; } - i = 0; - all_fixed = true; - do { - if (all_fixed && io_sqe_needs_user(sqes[i].sqe)) - all_fixed = false; - - i++; - if (i == ARRAY_SIZE(sqes)) - break; - } while (io_get_sqring(ctx, &sqes[i])); - /* Unless all new commands are FIXED regions, grab mm */ - if (!all_fixed && !cur_mm) { + if (!cur_mm) { mm_fault = !mmget_not_zero(ctx->sqo_mm); if (!mm_fault) { use_mm(ctx->sqo_mm); @@ -2677,8 +2753,9 @@ static int io_sq_thread(void *data) } } - inflight += io_submit_sqes(ctx, sqes, i, cur_mm != NULL, - mm_fault); + to_submit = min(to_submit, ctx->sq_entries); + inflight += io_submit_sqes(ctx, to_submit, cur_mm != NULL, + mm_fault); /* Commit SQ ring head once we've consumed all SQEs */ io_commit_sqring(ctx); @@ -2695,8 +2772,7 @@ static int io_sq_thread(void *data) return 0; } -static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit, - bool block_for_last) +static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) { struct io_submit_state state, *statep = NULL; struct io_kiocb *link = NULL; @@ -2710,7 +2786,6 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit, } for (i = 0; i < to_submit; i++) { - bool force_nonblock = true; struct sqe_submit s; if (!io_get_sqring(ctx, &s)) @@ -2721,8 +2796,7 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit, * that's the end of the chain. Submit the previous link. */ if (!prev_was_link && link) { - io_queue_link_head(ctx, link, &link->submit, shadow_req, - force_nonblock); + io_queue_link_head(ctx, link, &link->submit, shadow_req); link = NULL; shadow_req = NULL; } @@ -2744,27 +2818,16 @@ out: s.needs_lock = false; s.needs_fixed_file = false; submit++; - - /* - * The caller will block for events after submit, submit the - * last IO non-blocking. This is either the only IO it's - * submitting, or it already submitted the previous ones. This - * improves performance by avoiding an async punt that we don't - * need to do. - */ - if (block_for_last && submit == to_submit) - force_nonblock = false; - - io_submit_sqe(ctx, &s, statep, &link, force_nonblock); + io_submit_sqe(ctx, &s, statep, &link); } - io_commit_sqring(ctx); if (link) - io_queue_link_head(ctx, link, &link->submit, shadow_req, - block_for_last); + io_queue_link_head(ctx, link, &link->submit, shadow_req); if (statep) io_submit_state_end(statep); + io_commit_sqring(ctx); + return submit; } @@ -2920,8 +2983,12 @@ static void io_finish_async(struct io_ring_ctx *ctx) static void io_destruct_skb(struct sk_buff *skb) { struct io_ring_ctx *ctx = skb->sk->sk_user_data; + int i; + + for (i = 0; i < ARRAY_SIZE(ctx->sqo_wq); i++) + if (ctx->sqo_wq[i]) + flush_workqueue(ctx->sqo_wq[i]); - io_finish_async(ctx); unix_destruct_scm(skb); } @@ -3601,21 +3668,10 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, wake_up(&ctx->sqo_wait); submitted = to_submit; } else if (to_submit) { - bool block_for_last = false; - to_submit = min(to_submit, ctx->sq_entries); - /* - * Allow last submission to block in a series, IFF the caller - * asked to wait for events and we don't currently have - * enough. This potentially avoids an async punt. - */ - if (to_submit == min_complete && - io_cqring_events(ctx->rings) < min_complete) - block_for_last = true; - mutex_lock(&ctx->uring_lock); - submitted = io_ring_submit(ctx, to_submit, block_for_last); + submitted = io_ring_submit(ctx, to_submit); mutex_unlock(&ctx->uring_lock); } if (flags & IORING_ENTER_GETEVENTS) { @@ -3630,7 +3686,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, } } - io_ring_drop_ctx_refs(ctx, 1); + percpu_ref_put(&ctx->refs); out_fput: fdput(f); return submitted ? submitted : ret; @@ -3774,10 +3830,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) if (ret) goto err; - ret = io_uring_get_fd(ctx); - if (ret < 0) - goto err; - memset(&p->sq_off, 0, sizeof(p->sq_off)); p->sq_off.head = offsetof(struct io_rings, sq.head); p->sq_off.tail = offsetof(struct io_rings, sq.tail); @@ -3795,6 +3847,14 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) p->cq_off.overflow = offsetof(struct io_rings, cq_overflow); p->cq_off.cqes = offsetof(struct io_rings, cqes); + /* + * Install ring fd as the very last thing, so we don't risk someone + * having closed it before we finish setup + */ + ret = io_uring_get_fd(ctx); + if (ret < 0) + goto err; + p->features = IORING_FEAT_SINGLE_MMAP; return ret; err: |