diff --git a/io_uring/napi.c b/io_uring/napi.c
new file mode 100644
index 000000000000..b1ade3fda30f
--- /dev/null
+++ b/io_uring/napi.c
@@ -0,0 +1,396 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "io_uring.h"
+#include "napi.h"
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+
+/* Timeout for cleanup of stale entries. */
+#define NAPI_TIMEOUT (60 * SEC_CONVERSION)
+
+struct io_napi_entry {
+ unsigned int napi_id;
+ struct list_head list;
+
+ unsigned long timeout;
+ struct hlist_node node;
+
+ struct rcu_head rcu;
+};
+
+static struct io_napi_entry *io_napi_hash_find(struct hlist_head *hash_list,
+ unsigned int napi_id)
+{
+ struct io_napi_entry *e;
+
+ hlist_for_each_entry_rcu(e, hash_list, node) {
+ if (e->napi_id != napi_id)
+ continue;
+ return e;
+ }
+
+ return NULL;
+}
+
+static inline ktime_t net_to_ktime(unsigned long t)
+{
+ /* busy_loop_current_time() approximates usecs as ns >> 10; reverse that */
+ return ns_to_ktime(t << 10);
+}
+
+int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id)
+{
+ struct hlist_head *hash_list;
+ struct io_napi_entry *e;
+
+ /* Non-NAPI IDs are rejected. */
+ if (napi_id < MIN_NAPI_ID)
+ return -EINVAL;
+
+ hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))];
+
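+ /*
+ * If the id is already tracked, just refresh its timeout under RCU
+ * instead of allocating a new entry.
+ */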
+ scoped_guard(rcu) {
+ e = io_napi_hash_find(hash_list, napi_id);
+ if (e) {
+ WRITE_ONCE(e->timeout, jiffies + NAPI_TIMEOUT);
+ return -EEXIST;
+ }
+ }
+
+ e = kmalloc(sizeof(*e), GFP_NOWAIT);
+ if (!e)
+ return -ENOMEM;
+
+ e->napi_id = napi_id;
+ e->timeout = jiffies + NAPI_TIMEOUT;
+
+ /*
+ * guard(spinlock) is not used here so the lock can be dropped
+ * manually before calling kfree().
+ */
+ spin_lock(&ctx->napi_lock);
+ if (unlikely(io_napi_hash_find(hash_list, napi_id))) {
+ spin_unlock(&ctx->napi_lock);
+ kfree(e);
+ return -EEXIST;
+ }
+
+ hlist_add_tail_rcu(&e->node, hash_list);
+ list_add_tail_rcu(&e->list, &ctx->napi_list);
+ spin_unlock(&ctx->napi_lock);
+ return 0;
+}
+
+static int __io_napi_del_id(struct io_ring_ctx *ctx, unsigned int napi_id)
+{
+ struct hlist_head *hash_list;
+ struct io_napi_entry *e;
+
+ /* Non-NAPI IDs are rejected. */
+ if (napi_id < MIN_NAPI_ID)
+ return -EINVAL;
+
+ hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))];
+ guard(spinlock)(&ctx->napi_lock);
+ e = io_napi_hash_find(hash_list, napi_id);
+ if (!e)
+ return -ENOENT;
+
+ list_del_rcu(&e->list);
+ hash_del_rcu(&e->node);
+ kfree_rcu(e, rcu);
+ return 0;
+}
+
+static void __io_napi_remove_stale(struct io_ring_ctx *ctx)
+{
+ struct io_napi_entry *e;
+
+ guard(spinlock)(&ctx->napi_lock);
+ /*
+ * list_for_each_entry_safe() is not required as long as:
+ * 1. list_del_rcu() does not reset the deleted node's next pointer
+ * 2. kfree_rcu() delays freeing the memory until the next quiescent
+ * state
+ */
+ list_for_each_entry(e, &ctx->napi_list, list) {
+ if (time_after(jiffies, READ_ONCE(e->timeout))) {
+ list_del_rcu(&e->list);
+ hash_del_rcu(&e->node);
+ kfree_rcu(e, rcu);
+ }
+ }
+}
+
+static inline void io_napi_remove_stale(struct io_ring_ctx *ctx, bool is_stale)
+{
+ if (is_stale)
+ __io_napi_remove_stale(ctx);
+}
+
+static inline bool io_napi_busy_loop_timeout(ktime_t start_time,
+ ktime_t bp)
+{
+ if (bp) {
+ ktime_t end_time = ktime_add(start_time, bp);
+ ktime_t now = net_to_ktime(busy_loop_current_time());
+
+ return ktime_after(now, end_time);
+ }
+
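+ /* No busy poll timeout configured: end the loop right away. */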
+ return true;
+}
+
+static bool io_napi_busy_loop_should_end(void *data,
+ unsigned long start_time)
+{
+ struct io_wait_queue *iowq = data;
+
+ if (signal_pending(current))
+ return true;
+ if (io_should_wake(iowq) || io_has_work(iowq->ctx))
+ return true;
+ if (io_napi_busy_loop_timeout(net_to_ktime(start_time),
+ iowq->napi_busy_poll_dt))
+ return true;
+
+ return false;
+}
+
+/*
+ * Static tracking never reports stale entries.
+ */
+static bool static_tracking_do_busy_loop(struct io_ring_ctx *ctx,
+ bool (*loop_end)(void *, unsigned long),
+ void *loop_end_arg)
+{
+ struct io_napi_entry *e;
+
+ list_for_each_entry_rcu(e, &ctx->napi_list, list)
+ napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg,
+ ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET);
+ return false;
+}
+
+static bool
+dynamic_tracking_do_busy_loop(struct io_ring_ctx *ctx,
+ bool (*loop_end)(void *, unsigned long),
+ void *loop_end_arg)
+{
+ struct io_napi_entry *e;
+ bool is_stale = false;
+
+ list_for_each_entry_rcu(e, &ctx->napi_list, list) {
+ napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg,
+ ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET);
+
+ if (time_after(jiffies, READ_ONCE(e->timeout)))
+ is_stale = true;
+ }
+
+ return is_stale;
+}
+
+static inline bool
+__io_napi_do_busy_loop(struct io_ring_ctx *ctx,
+ bool (*loop_end)(void *, unsigned long),
+ void *loop_end_arg)
+{
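+ /*
+ * Static tracking polls the ids registered by userspace and never
+ * reports them as stale; dynamic tracking also flags timed out
+ * entries so the caller can prune them.
+ */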
+ if (READ_ONCE(ctx->napi_track_mode) == IO_URING_NAPI_TRACKING_STATIC)
+ return static_tracking_do_busy_loop(ctx, loop_end, loop_end_arg);
+ return dynamic_tracking_do_busy_loop(ctx, loop_end, loop_end_arg);
+}
+
+static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx,
+ struct io_wait_queue *iowq)
+{
+ unsigned long start_time = busy_loop_current_time();
+ bool (*loop_end)(void *, unsigned long) = NULL;
+ void *loop_end_arg = NULL;
+ bool is_stale = false;
+
+ /*
+ * Singular lists use a different napi loop end check function and
+ * the loop is only executed once.
+ */
+ if (list_is_singular(&ctx->napi_list)) {
+ loop_end = io_napi_busy_loop_should_end;
+ loop_end_arg = iowq;
+ }
+
+ scoped_guard(rcu) {
+ do {
+ is_stale = __io_napi_do_busy_loop(ctx, loop_end,
+ loop_end_arg);
+ } while (!io_napi_busy_loop_should_end(iowq, start_time) &&
+ !loop_end_arg);
+ }
+
+ io_napi_remove_stale(ctx, is_stale);
+}
+
+/*
+ * io_napi_init() - Init napi settings
+ * @ctx: pointer to io-uring context structure
+ *
+ * Init napi settings in the io-uring context.
+ */
+void io_napi_init(struct io_ring_ctx *ctx)
+{
+ u64 sys_dt = READ_ONCE(sysctl_net_busy_poll) * NSEC_PER_USEC;
+
+ INIT_LIST_HEAD(&ctx->napi_list);
+ spin_lock_init(&ctx->napi_lock);
+ ctx->napi_prefer_busy_poll = false;
+ ctx->napi_busy_poll_dt = ns_to_ktime(sys_dt);
+ ctx->napi_track_mode = IO_URING_NAPI_TRACKING_INACTIVE;
+}
+
+/*
+ * io_napi_free() - Deallocate napi
+ * @ctx: pointer to io-uring context structure
+ *
+ * Free the napi list and the hash table in the io-uring context.
+ */
+void io_napi_free(struct io_ring_ctx *ctx)
+{
+ struct io_napi_entry *e;
+
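+ /*
+ * Also called from io_napi_register_napi() when the tracking mode is
+ * reconfigured, not just at ring teardown.
+ */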
+ guard(spinlock)(&ctx->napi_lock);
+ list_for_each_entry(e, &ctx->napi_list, list) {
+ hash_del_rcu(&e->node);
+ kfree_rcu(e, rcu);
+ }
+ INIT_LIST_HEAD_RCU(&ctx->napi_list);
+}
+
+static int io_napi_register_napi(struct io_ring_ctx *ctx,
+ struct io_uring_napi *napi)
+{
+ switch (napi->op_param) {
+ case IO_URING_NAPI_TRACKING_DYNAMIC:
+ case IO_URING_NAPI_TRACKING_STATIC:
+ break;
+ default:
+ return -EINVAL;
+ }
+ /* clean the napi list for new settings */
+ io_napi_free(ctx);
+ WRITE_ONCE(ctx->napi_track_mode, napi->op_param);
+ WRITE_ONCE(ctx->napi_busy_poll_dt, napi->busy_poll_to * NSEC_PER_USEC);
+ WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi->prefer_busy_poll);
+ return 0;
+}
+
+/*
+ * io_register_napi() - Register napi with io-uring
+ * @ctx: pointer to io-uring context structure
+ * @arg: pointer to io_uring_napi structure
+ *
+ * Register napi in the io-uring context.
+ */
+int io_register_napi(struct io_ring_ctx *ctx, void __user *arg)
+{
+ const struct io_uring_napi curr = {
+ .busy_poll_to = ktime_to_us(ctx->napi_busy_poll_dt),
+ .prefer_busy_poll = ctx->napi_prefer_busy_poll,
+ .op_param = ctx->napi_track_mode
+ };
+ struct io_uring_napi napi;
+
+ if (ctx->flags & IORING_SETUP_IOPOLL)
+ return -EINVAL;
+ if (copy_from_user(&napi, arg, sizeof(napi)))
+ return -EFAULT;
+ if (napi.pad[0] || napi.pad[1] || napi.resv)
+ return -EINVAL;
+
+ if (copy_to_user(arg, &curr, sizeof(curr)))
+ return -EFAULT;
+
+ switch (napi.opcode) {
+ case IO_URING_NAPI_REGISTER_OP:
+ return io_napi_register_napi(ctx, &napi);
+ case IO_URING_NAPI_STATIC_ADD_ID:
+ if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC)
+ return -EINVAL;
+ return __io_napi_add_id(ctx, napi.op_param);
+ case IO_URING_NAPI_STATIC_DEL_ID:
+ if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC)
+ return -EINVAL;
+ return __io_napi_del_id(ctx, napi.op_param);
+ default:
+ return -EINVAL;
+ }
+}
+
+/*
+ * io_unregister_napi() - Unregister napi with io-uring
+ * @ctx: pointer to io-uring context structure
+ * @arg: pointer to io_uring_napi structure
+ *
+ * Unregister napi. If arg has been specified, copy the busy poll timeout and
+ * prefer busy poll setting to the passed-in structure.
+ */
+int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)
+{
+ const struct io_uring_napi curr = {
+ .busy_poll_to = ktime_to_us(ctx->napi_busy_poll_dt),
+ .prefer_busy_poll = ctx->napi_prefer_busy_poll
+ };
+
+ if (arg && copy_to_user(arg, &curr, sizeof(curr)))
+ return -EFAULT;
+
+ WRITE_ONCE(ctx->napi_busy_poll_dt, 0);
+ WRITE_ONCE(ctx->napi_prefer_busy_poll, false);
+ WRITE_ONCE(ctx->napi_track_mode, IO_URING_NAPI_TRACKING_INACTIVE);
+ return 0;
+}
+
+/*
+ * __io_napi_busy_loop() - execute busy poll loop
+ * @ctx: pointer to io-uring context structure
+ * @iowq: pointer to io wait queue
+ *
+ * Execute the napi busy poll loop.
+ */
+void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq)
+{
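+ /* SQPOLL rings busy poll from the sqpoll thread via io_napi_sqpoll_busy_poll(). */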
+ if (ctx->flags & IORING_SETUP_SQPOLL)
+ return;
+
+ iowq->napi_busy_poll_dt = READ_ONCE(ctx->napi_busy_poll_dt);
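+ /* Never busy poll longer than the remaining wait timeout. */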
+ if (iowq->timeout != KTIME_MAX) {
+ ktime_t dt = ktime_sub(iowq->timeout, io_get_time(ctx));
+
+ iowq->napi_busy_poll_dt = min_t(u64, iowq->napi_busy_poll_dt, dt);
+ }
+
+ iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll);
+ io_napi_blocking_busy_loop(ctx, iowq);
+}
+
+/*
+ * io_napi_sqpoll_busy_poll() - busy poll loop for sqpoll
+ * @ctx: pointer to io-uring context structure
+ *
+ * Execute the napi busy poll loop for the sqpoll thread.
+ */
+int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx)
+{
+ bool is_stale = false;
+
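+ /* Nothing to do if busy polling is disabled or no ids are tracked. */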
+ if (!READ_ONCE(ctx->napi_busy_poll_dt))
+ return 0;
+ if (list_empty_careful(&ctx->napi_list))
+ return 0;
+
+ scoped_guard(rcu) {
+ is_stale = __io_napi_do_busy_loop(ctx, NULL, NULL);
+ }
+
+ io_napi_remove_stale(ctx, is_stale);
+ return 1;
+}
+
+#endif