diff options
Diffstat (limited to 'io_uring/napi.c')
| -rw-r--r-- | io_uring/napi.c | 270 |
1 files changed, 166 insertions, 104 deletions
diff --git a/io_uring/napi.c b/io_uring/napi.c index 8c18ede595c4..4a10de03e426 100644 --- a/io_uring/napi.c +++ b/io_uring/napi.c @@ -26,74 +26,100 @@ static struct io_napi_entry *io_napi_hash_find(struct hlist_head *hash_list, hlist_for_each_entry_rcu(e, hash_list, node) { if (e->napi_id != napi_id) continue; - e->timeout = jiffies + NAPI_TIMEOUT; return e; } return NULL; } -void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock) +static inline ktime_t net_to_ktime(unsigned long t) +{ + /* napi approximating usecs, reverse busy_loop_current_time */ + return ns_to_ktime(t << 10); +} + +int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id) { struct hlist_head *hash_list; - unsigned int napi_id; - struct sock *sk; struct io_napi_entry *e; - sk = sock->sk; - if (!sk) - return; - - napi_id = READ_ONCE(sk->sk_napi_id); - /* Non-NAPI IDs can be rejected. */ - if (napi_id < MIN_NAPI_ID) - return; + if (!napi_id_valid(napi_id)) + return -EINVAL; hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))]; - rcu_read_lock(); - e = io_napi_hash_find(hash_list, napi_id); - if (e) { - e->timeout = jiffies + NAPI_TIMEOUT; - rcu_read_unlock(); - return; + scoped_guard(rcu) { + e = io_napi_hash_find(hash_list, napi_id); + if (e) { + WRITE_ONCE(e->timeout, jiffies + NAPI_TIMEOUT); + return -EEXIST; + } } - rcu_read_unlock(); e = kmalloc(sizeof(*e), GFP_NOWAIT); if (!e) - return; + return -ENOMEM; e->napi_id = napi_id; e->timeout = jiffies + NAPI_TIMEOUT; + /* + * guard(spinlock) is not used to manually unlock it before calling + * kfree() + */ spin_lock(&ctx->napi_lock); if (unlikely(io_napi_hash_find(hash_list, napi_id))) { spin_unlock(&ctx->napi_lock); kfree(e); - return; + return -EEXIST; } hlist_add_tail_rcu(&e->node, hash_list); - list_add_tail(&e->list, &ctx->napi_list); + list_add_tail_rcu(&e->list, &ctx->napi_list); spin_unlock(&ctx->napi_lock); + return 0; +} + +static int __io_napi_del_id(struct io_ring_ctx *ctx, unsigned int napi_id) +{ + struct hlist_head *hash_list; + struct io_napi_entry *e; + + /* Non-NAPI IDs can be rejected. */ + if (!napi_id_valid(napi_id)) + return -EINVAL; + + hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))]; + guard(spinlock)(&ctx->napi_lock); + e = io_napi_hash_find(hash_list, napi_id); + if (!e) + return -ENOENT; + + list_del_rcu(&e->list); + hash_del_rcu(&e->node); + kfree_rcu(e, rcu); + return 0; } static void __io_napi_remove_stale(struct io_ring_ctx *ctx) { struct io_napi_entry *e; - unsigned int i; - spin_lock(&ctx->napi_lock); - hash_for_each(ctx->napi_ht, i, e, node) { - if (time_after(jiffies, e->timeout)) { - list_del(&e->list); + guard(spinlock)(&ctx->napi_lock); + /* + * list_for_each_entry_safe() is not required as long as: + * 1. list_del_rcu() does not reset the deleted node next pointer + * 2. kfree_rcu() delays the memory freeing until the next quiescent + * state + */ + list_for_each_entry(e, &ctx->napi_list, list) { + if (time_after(jiffies, READ_ONCE(e->timeout))) { + list_del_rcu(&e->list); hash_del_rcu(&e->node); kfree_rcu(e, rcu); } } - spin_unlock(&ctx->napi_lock); } static inline void io_napi_remove_stale(struct io_ring_ctx *ctx, bool is_stale) @@ -102,14 +128,14 @@ static inline void io_napi_remove_stale(struct io_ring_ctx *ctx, bool is_stale) __io_napi_remove_stale(ctx); } -static inline bool io_napi_busy_loop_timeout(unsigned long start_time, - unsigned long bp_usec) +static inline bool io_napi_busy_loop_timeout(ktime_t start_time, + ktime_t bp) { - if (bp_usec) { - unsigned long end_time = start_time + bp_usec; - unsigned long now = busy_loop_current_time(); + if (bp) { + ktime_t end_time = ktime_add(start_time, bp); + ktime_t now = net_to_ktime(busy_loop_current_time()); - return time_after(now, end_time); + return ktime_after(now, end_time); } return true; @@ -124,51 +150,80 @@ static bool io_napi_busy_loop_should_end(void *data, return true; if (io_should_wake(iowq) || io_has_work(iowq->ctx)) return true; - if (io_napi_busy_loop_timeout(start_time, iowq->napi_busy_poll_to)) + if (io_napi_busy_loop_timeout(net_to_ktime(start_time), + iowq->napi_busy_poll_dt)) return true; return false; } -static bool __io_napi_do_busy_loop(struct io_ring_ctx *ctx, - void *loop_end_arg) +/* + * never report stale entries + */ +static bool static_tracking_do_busy_loop(struct io_ring_ctx *ctx, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg) { struct io_napi_entry *e; - bool (*loop_end)(void *, unsigned long) = NULL; - bool is_stale = false; - if (loop_end_arg) - loop_end = io_napi_busy_loop_should_end; + list_for_each_entry_rcu(e, &ctx->napi_list, list) + napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg, + ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET); + return false; +} + +static bool +dynamic_tracking_do_busy_loop(struct io_ring_ctx *ctx, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg) +{ + struct io_napi_entry *e; + bool is_stale = false; list_for_each_entry_rcu(e, &ctx->napi_list, list) { napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg, ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET); - if (time_after(jiffies, e->timeout)) + if (time_after(jiffies, READ_ONCE(e->timeout))) is_stale = true; } return is_stale; } +static inline bool +__io_napi_do_busy_loop(struct io_ring_ctx *ctx, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg) +{ + if (READ_ONCE(ctx->napi_track_mode) == IO_URING_NAPI_TRACKING_STATIC) + return static_tracking_do_busy_loop(ctx, loop_end, loop_end_arg); + return dynamic_tracking_do_busy_loop(ctx, loop_end, loop_end_arg); +} + static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) { unsigned long start_time = busy_loop_current_time(); + bool (*loop_end)(void *, unsigned long) = NULL; void *loop_end_arg = NULL; bool is_stale = false; /* Singular lists use a different napi loop end check function and are * only executed once. */ - if (list_is_singular(&ctx->napi_list)) + if (list_is_singular(&ctx->napi_list)) { + loop_end = io_napi_busy_loop_should_end; loop_end_arg = iowq; + } - rcu_read_lock(); - do { - is_stale = __io_napi_do_busy_loop(ctx, loop_end_arg); - } while (!io_napi_busy_loop_should_end(iowq, start_time) && !loop_end_arg); - rcu_read_unlock(); + scoped_guard(rcu) { + do { + is_stale = __io_napi_do_busy_loop(ctx, loop_end, + loop_end_arg); + } while (!io_napi_busy_loop_should_end(iowq, start_time) && + !loop_end_arg); + } io_napi_remove_stale(ctx, is_stale); } @@ -181,10 +236,13 @@ static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx, */ void io_napi_init(struct io_ring_ctx *ctx) { + u64 sys_dt = READ_ONCE(sysctl_net_busy_poll) * NSEC_PER_USEC; + INIT_LIST_HEAD(&ctx->napi_list); spin_lock_init(&ctx->napi_lock); ctx->napi_prefer_busy_poll = false; - ctx->napi_busy_poll_to = READ_ONCE(sysctl_net_busy_poll); + ctx->napi_busy_poll_dt = ns_to_ktime(sys_dt); + ctx->napi_track_mode = IO_URING_NAPI_TRACKING_INACTIVE; } /* @@ -196,15 +254,31 @@ void io_napi_init(struct io_ring_ctx *ctx) void io_napi_free(struct io_ring_ctx *ctx) { struct io_napi_entry *e; - LIST_HEAD(napi_list); - unsigned int i; - spin_lock(&ctx->napi_lock); - hash_for_each(ctx->napi_ht, i, e, node) { + guard(spinlock)(&ctx->napi_lock); + list_for_each_entry(e, &ctx->napi_list, list) { hash_del_rcu(&e->node); kfree_rcu(e, rcu); } - spin_unlock(&ctx->napi_lock); + INIT_LIST_HEAD_RCU(&ctx->napi_list); +} + +static int io_napi_register_napi(struct io_ring_ctx *ctx, + struct io_uring_napi *napi) +{ + switch (napi->op_param) { + case IO_URING_NAPI_TRACKING_DYNAMIC: + case IO_URING_NAPI_TRACKING_STATIC: + break; + default: + return -EINVAL; + } + /* clean the napi list for new settings */ + io_napi_free(ctx); + WRITE_ONCE(ctx->napi_track_mode, napi->op_param); + WRITE_ONCE(ctx->napi_busy_poll_dt, napi->busy_poll_to * NSEC_PER_USEC); + WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi->prefer_busy_poll); + return 0; } /* @@ -217,23 +291,36 @@ void io_napi_free(struct io_ring_ctx *ctx) int io_register_napi(struct io_ring_ctx *ctx, void __user *arg) { const struct io_uring_napi curr = { - .busy_poll_to = ctx->napi_busy_poll_to, - .prefer_busy_poll = ctx->napi_prefer_busy_poll + .busy_poll_to = ktime_to_us(ctx->napi_busy_poll_dt), + .prefer_busy_poll = ctx->napi_prefer_busy_poll, + .op_param = ctx->napi_track_mode }; struct io_uring_napi napi; + if (ctx->flags & IORING_SETUP_IOPOLL) + return -EINVAL; if (copy_from_user(&napi, arg, sizeof(napi))) return -EFAULT; - if (napi.pad[0] || napi.pad[1] || napi.pad[2] || napi.resv) + if (napi.pad[0] || napi.pad[1] || napi.resv) return -EINVAL; if (copy_to_user(arg, &curr, sizeof(curr))) return -EFAULT; - WRITE_ONCE(ctx->napi_busy_poll_to, napi.busy_poll_to); - WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi.prefer_busy_poll); - WRITE_ONCE(ctx->napi_enabled, true); - return 0; + switch (napi.opcode) { + case IO_URING_NAPI_REGISTER_OP: + return io_napi_register_napi(ctx, &napi); + case IO_URING_NAPI_STATIC_ADD_ID: + if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC) + return -EINVAL; + return __io_napi_add_id(ctx, napi.op_param); + case IO_URING_NAPI_STATIC_DEL_ID: + if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC) + return -EINVAL; + return __io_napi_del_id(ctx, napi.op_param); + default: + return -EINVAL; + } } /* @@ -247,52 +334,20 @@ int io_register_napi(struct io_ring_ctx *ctx, void __user *arg) int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg) { const struct io_uring_napi curr = { - .busy_poll_to = ctx->napi_busy_poll_to, + .busy_poll_to = ktime_to_us(ctx->napi_busy_poll_dt), .prefer_busy_poll = ctx->napi_prefer_busy_poll }; if (arg && copy_to_user(arg, &curr, sizeof(curr))) return -EFAULT; - WRITE_ONCE(ctx->napi_busy_poll_to, 0); + WRITE_ONCE(ctx->napi_busy_poll_dt, 0); WRITE_ONCE(ctx->napi_prefer_busy_poll, false); - WRITE_ONCE(ctx->napi_enabled, false); + WRITE_ONCE(ctx->napi_track_mode, IO_URING_NAPI_TRACKING_INACTIVE); return 0; } /* - * __io_napi_adjust_timeout() - adjust busy loop timeout - * @ctx: pointer to io-uring context structure - * @iowq: pointer to io wait queue - * @ts: pointer to timespec or NULL - * - * Adjust the busy loop timeout according to timespec and busy poll timeout. - * If the specified NAPI timeout is bigger than the wait timeout, then adjust - * the NAPI timeout accordingly. - */ -void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq, - struct timespec64 *ts) -{ - unsigned int poll_to = READ_ONCE(ctx->napi_busy_poll_to); - - if (ts) { - struct timespec64 poll_to_ts; - - poll_to_ts = ns_to_timespec64(1000 * (s64)poll_to); - if (timespec64_compare(ts, &poll_to_ts) < 0) { - s64 poll_to_ns = timespec64_to_ns(ts); - if (poll_to_ns > 0) { - u64 val = poll_to_ns + 999; - do_div(val, (s64) 1000); - poll_to = val; - } - } - } - - iowq->napi_busy_poll_to = poll_to; -} - -/* * __io_napi_busy_loop() - execute busy poll loop * @ctx: pointer to io-uring context structure * @iowq: pointer to io wait queue @@ -301,10 +356,18 @@ void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iow */ void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) { - iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll); + if (ctx->flags & IORING_SETUP_SQPOLL) + return; - if (!(ctx->flags & IORING_SETUP_SQPOLL) && ctx->napi_enabled) - io_napi_blocking_busy_loop(ctx, iowq); + iowq->napi_busy_poll_dt = READ_ONCE(ctx->napi_busy_poll_dt); + if (iowq->timeout != KTIME_MAX) { + ktime_t dt = ktime_sub(iowq->timeout, io_get_time(ctx)); + + iowq->napi_busy_poll_dt = min_t(u64, iowq->napi_busy_poll_dt, dt); + } + + iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll); + io_napi_blocking_busy_loop(ctx, iowq); } /* @@ -315,17 +378,16 @@ void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) */ int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx) { - LIST_HEAD(napi_list); bool is_stale = false; - if (!READ_ONCE(ctx->napi_busy_poll_to)) + if (!READ_ONCE(ctx->napi_busy_poll_dt)) return 0; if (list_empty_careful(&ctx->napi_list)) return 0; - rcu_read_lock(); - is_stale = __io_napi_do_busy_loop(ctx, NULL); - rcu_read_unlock(); + scoped_guard(rcu) { + is_stale = __io_napi_do_busy_loop(ctx, NULL, NULL); + } io_napi_remove_stale(ctx, is_stale); return 1; |
