summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-04-28 14:56:09 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2021-04-28 14:56:09 -0700
commit625434dafdd97372d15de21972be4b682709e854 (patch)
tree4cdc02f8732525e17024877ff9fbd9c67d9faf7f
parentc05a182bf45681c5529a58c71ce5647535b3ae7a (diff)
parent7b289c38335ec7bebe45ed31137d596c808e23ac (diff)
Merge tag 'for-5.13/io_uring-2021-04-27' of git://git.kernel.dk/linux-block
Pull io_uring updates from Jens Axboe: - Support for multi-shot mode for POLL requests - More efficient reference counting. This is shamelessly stolen from the mm side. Even though referencing is mostly single/dual user, the 128 count was retained to keep the code the same. Maybe this should/could be made generic at some point. - Removal of the need to have a manager thread for each ring. The manager threads only job was checking and creating new io-threads as needed, instead we handle this from the queue path. - Allow SQPOLL without CAP_SYS_ADMIN or CAP_SYS_NICE. Since 5.12, this thread is "just" a regular application thread, so no need to restrict use of it anymore. - Cleanup of how internal async poll data lifetime is managed. - Fix for syzbot reported crash on SQPOLL cancelation. - Make buffer registration more like file registrations, which includes flexibility in avoiding full set unregistration and re-registration. - Fix for io-wq affinity setting. - Be a bit more defensive in task->pf_io_worker setup. - Various SQPOLL fixes. - Cleanup of SQPOLL creds handling. - Improvements to in-flight request tracking. - File registration cleanups. - Tons of cleanups and little fixes * tag 'for-5.13/io_uring-2021-04-27' of git://git.kernel.dk/linux-block: (156 commits) io_uring: maintain drain logic for multishot poll requests io_uring: Check current->io_uring in io_uring_cancel_sqpoll io_uring: fix NULL reg-buffer io_uring: simplify SQPOLL cancellations io_uring: fix work_exit sqpoll cancellations io_uring: Fix uninitialized variable up.resv io_uring: fix invalid error check after malloc io_uring: io_sq_thread() no longer needs to reset current->pf_io_worker kernel: always initialize task->pf_io_worker to NULL io_uring: update sq_thread_idle after ctx deleted io_uring: add full-fledged dynamic buffers support io_uring: implement fixed buffers registration similar to fixed files io_uring: prepare fixed rw for dynanic buffers io_uring: keep table of pointers to ubufs io_uring: add generic rsrc update with tags io_uring: add IORING_REGISTER_RSRC io_uring: enumerate dynamic resources io_uring: add generic path for rsrc update io_uring: preparation for rsrc tagging io_uring: decouple CQE filling from requests ...
-rw-r--r--fs/io-wq.c336
-rw-r--r--fs/io-wq.h1
-rw-r--r--fs/io_uring.c2602
-rw-r--r--include/linux/io_uring.h12
-rw-r--r--include/linux/task_work.h2
-rw-r--r--include/trace/events/io_uring.h11
-rw-r--r--include/uapi/linux/io_uring.h40
-rw-r--r--kernel/fork.c3
-rw-r--r--kernel/task_work.c35
9 files changed, 1594 insertions, 1448 deletions
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 4eba531bea5a..5361a9b4b47b 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -17,7 +17,6 @@
#include <linux/cpu.h>
#include <linux/tracehook.h>
-#include "../kernel/sched/sched.h"
#include "io-wq.h"
#define WORKER_IDLE_TIMEOUT (5 * HZ)
@@ -68,6 +67,7 @@ struct io_worker {
struct io_wqe_acct {
unsigned nr_workers;
unsigned max_workers;
+ int index;
atomic_t nr_running;
};
@@ -108,19 +108,16 @@ struct io_wq {
free_work_fn *free_work;
io_wq_work_fn *do_work;
- struct task_struct *manager;
-
struct io_wq_hash *hash;
refcount_t refs;
- struct completion exited;
atomic_t worker_refs;
struct completion worker_done;
struct hlist_node cpuhp_node;
- pid_t task_pid;
+ struct task_struct *task;
};
static enum cpuhp_state io_wq_online;
@@ -133,8 +130,7 @@ struct io_cb_cancel_data {
bool cancel_all;
};
-static void io_wqe_cancel_pending_work(struct io_wqe *wqe,
- struct io_cb_cancel_data *match);
+static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index);
static bool io_worker_get(struct io_worker *worker)
{
@@ -147,23 +143,26 @@ static void io_worker_release(struct io_worker *worker)
complete(&worker->ref_done);
}
+static inline struct io_wqe_acct *io_get_acct(struct io_wqe *wqe, bool bound)
+{
+ return &wqe->acct[bound ? IO_WQ_ACCT_BOUND : IO_WQ_ACCT_UNBOUND];
+}
+
static inline struct io_wqe_acct *io_work_get_acct(struct io_wqe *wqe,
struct io_wq_work *work)
{
- if (work->flags & IO_WQ_WORK_UNBOUND)
- return &wqe->acct[IO_WQ_ACCT_UNBOUND];
-
- return &wqe->acct[IO_WQ_ACCT_BOUND];
+ return io_get_acct(wqe, !(work->flags & IO_WQ_WORK_UNBOUND));
}
static inline struct io_wqe_acct *io_wqe_get_acct(struct io_worker *worker)
{
- struct io_wqe *wqe = worker->wqe;
-
- if (worker->flags & IO_WORKER_F_BOUND)
- return &wqe->acct[IO_WQ_ACCT_BOUND];
+ return io_get_acct(worker->wqe, worker->flags & IO_WORKER_F_BOUND);
+}
- return &wqe->acct[IO_WQ_ACCT_UNBOUND];
+static void io_worker_ref_put(struct io_wq *wq)
+{
+ if (atomic_dec_and_test(&wq->worker_refs))
+ complete(&wq->worker_done);
}
static void io_worker_exit(struct io_worker *worker)
@@ -193,8 +192,7 @@ static void io_worker_exit(struct io_worker *worker)
raw_spin_unlock_irq(&wqe->lock);
kfree_rcu(worker, rcu);
- if (atomic_dec_and_test(&wqe->wq->worker_refs))
- complete(&wqe->wq->worker_done);
+ io_worker_ref_put(wqe->wq);
do_exit(0);
}
@@ -209,7 +207,7 @@ static inline bool io_wqe_run_queue(struct io_wqe *wqe)
/*
* Check head of free list for an available worker. If one isn't available,
- * caller must wake up the wq manager to create one.
+ * caller must create one.
*/
static bool io_wqe_activate_free_worker(struct io_wqe *wqe)
__must_hold(RCU)
@@ -233,7 +231,7 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe)
/*
* We need a worker. If we find a free one, we're good. If not, and we're
- * below the max number of workers, wake up the manager to create one.
+ * below the max number of workers, create one.
*/
static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
{
@@ -249,8 +247,11 @@ static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
ret = io_wqe_activate_free_worker(wqe);
rcu_read_unlock();
- if (!ret && acct->nr_workers < acct->max_workers)
- wake_up_process(wqe->wq->manager);
+ if (!ret && acct->nr_workers < acct->max_workers) {
+ atomic_inc(&acct->nr_running);
+ atomic_inc(&wqe->wq->worker_refs);
+ create_io_worker(wqe->wq, wqe, acct->index);
+ }
}
static void io_wqe_inc_running(struct io_worker *worker)
@@ -260,14 +261,61 @@ static void io_wqe_inc_running(struct io_worker *worker)
atomic_inc(&acct->nr_running);
}
+struct create_worker_data {
+ struct callback_head work;
+ struct io_wqe *wqe;
+ int index;
+};
+
+static void create_worker_cb(struct callback_head *cb)
+{
+ struct create_worker_data *cwd;
+ struct io_wq *wq;
+
+ cwd = container_of(cb, struct create_worker_data, work);
+ wq = cwd->wqe->wq;
+ create_io_worker(wq, cwd->wqe, cwd->index);
+ kfree(cwd);
+}
+
+static void io_queue_worker_create(struct io_wqe *wqe, struct io_wqe_acct *acct)
+{
+ struct create_worker_data *cwd;
+ struct io_wq *wq = wqe->wq;
+
+ /* raced with exit, just ignore create call */
+ if (test_bit(IO_WQ_BIT_EXIT, &wq->state))
+ goto fail;
+
+ cwd = kmalloc(sizeof(*cwd), GFP_ATOMIC);
+ if (cwd) {
+ init_task_work(&cwd->work, create_worker_cb);
+ cwd->wqe = wqe;
+ cwd->index = acct->index;
+ if (!task_work_add(wq->task, &cwd->work, TWA_SIGNAL))
+ return;
+
+ kfree(cwd);
+ }
+fail:
+ atomic_dec(&acct->nr_running);
+ io_worker_ref_put(wq);
+}
+
static void io_wqe_dec_running(struct io_worker *worker)
__must_hold(wqe->lock)
{
struct io_wqe_acct *acct = io_wqe_get_acct(worker);
struct io_wqe *wqe = worker->wqe;
- if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe))
- io_wqe_wake_worker(wqe, acct);
+ if (!(worker->flags & IO_WORKER_F_UP))
+ return;
+
+ if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe)) {
+ atomic_inc(&acct->nr_running);
+ atomic_inc(&wqe->wq->worker_refs);
+ io_queue_worker_create(wqe, acct);
+ }
}
/*
@@ -280,6 +328,8 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
{
bool worker_bound, work_bound;
+ BUILD_BUG_ON((IO_WQ_ACCT_UNBOUND ^ IO_WQ_ACCT_BOUND) != 1);
+
if (worker->flags & IO_WORKER_F_FREE) {
worker->flags &= ~IO_WORKER_F_FREE;
hlist_nulls_del_init_rcu(&worker->nulls_node);
@@ -292,16 +342,11 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
worker_bound = (worker->flags & IO_WORKER_F_BOUND) != 0;
work_bound = (work->flags & IO_WQ_WORK_UNBOUND) == 0;
if (worker_bound != work_bound) {
+ int index = work_bound ? IO_WQ_ACCT_UNBOUND : IO_WQ_ACCT_BOUND;
io_wqe_dec_running(worker);
- if (work_bound) {
- worker->flags |= IO_WORKER_F_BOUND;
- wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers--;
- wqe->acct[IO_WQ_ACCT_BOUND].nr_workers++;
- } else {
- worker->flags &= ~IO_WORKER_F_BOUND;
- wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers++;
- wqe->acct[IO_WQ_ACCT_BOUND].nr_workers--;
- }
+ worker->flags ^= IO_WORKER_F_BOUND;
+ wqe->acct[index].nr_workers--;
+ wqe->acct[index ^ 1].nr_workers++;
io_wqe_inc_running(worker);
}
}
@@ -486,9 +531,8 @@ static int io_wqe_worker(void *data)
char buf[TASK_COMM_LEN];
worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
- io_wqe_inc_running(worker);
- snprintf(buf, sizeof(buf), "iou-wrk-%d", wq->task_pid);
+ snprintf(buf, sizeof(buf), "iou-wrk-%d", wq->task->pid);
set_task_comm(current, buf);
while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
@@ -552,8 +596,7 @@ void io_wq_worker_running(struct task_struct *tsk)
/*
* Called when worker is going to sleep. If there are no workers currently
- * running and we have work pending, wake up a free one or have the manager
- * set one up.
+ * running and we have work pending, wake up a free one or create a new one.
*/
void io_wq_worker_sleeping(struct task_struct *tsk)
{
@@ -573,7 +616,7 @@ void io_wq_worker_sleeping(struct task_struct *tsk)
raw_spin_unlock_irq(&worker->wqe->lock);
}
-static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
+static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
{
struct io_wqe_acct *acct = &wqe->acct[index];
struct io_worker *worker;
@@ -583,7 +626,7 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node);
if (!worker)
- return false;
+ goto fail;
refcount_set(&worker->ref, 1);
worker->nulls_node.pprev = NULL;
@@ -591,14 +634,13 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
spin_lock_init(&worker->lock);
init_completion(&worker->ref_done);
- atomic_inc(&wq->worker_refs);
-
tsk = create_io_thread(io_wqe_worker, worker, wqe->node);
if (IS_ERR(tsk)) {
- if (atomic_dec_and_test(&wq->worker_refs))
- complete(&wq->worker_done);
kfree(worker);
- return false;
+fail:
+ atomic_dec(&acct->nr_running);
+ io_worker_ref_put(wq);
+ return;
}
tsk->pf_io_worker = worker;
@@ -617,20 +659,6 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
acct->nr_workers++;
raw_spin_unlock_irq(&wqe->lock);
wake_up_new_task(tsk);
- return true;
-}
-
-static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index)
- __must_hold(wqe->lock)
-{
- struct io_wqe_acct *acct = &wqe->acct[index];
-
- if (acct->nr_workers && test_bit(IO_WQ_BIT_EXIT, &wqe->wq->state))
- return false;
- /* if we have available workers or no work, no need */
- if (!hlist_nulls_empty(&wqe->free_list) || !io_wqe_run_queue(wqe))
- return false;
- return acct->nr_workers < acct->max_workers;
}
/*
@@ -665,93 +693,11 @@ static bool io_wq_worker_wake(struct io_worker *worker, void *data)
return false;
}
-static void io_wq_check_workers(struct io_wq *wq)
-{
- int node;
-
- for_each_node(node) {
- struct io_wqe *wqe = wq->wqes[node];
- bool fork_worker[2] = { false, false };
-
- if (!node_online(node))
- continue;
-
- raw_spin_lock_irq(&wqe->lock);
- if (io_wqe_need_worker(wqe, IO_WQ_ACCT_BOUND))
- fork_worker[IO_WQ_ACCT_BOUND] = true;
- if (io_wqe_need_worker(wqe, IO_WQ_ACCT_UNBOUND))
- fork_worker[IO_WQ_ACCT_UNBOUND] = true;
- raw_spin_unlock_irq(&wqe->lock);
- if (fork_worker[IO_WQ_ACCT_BOUND])
- create_io_worker(wq, wqe, IO_WQ_ACCT_BOUND);
- if (fork_worker[IO_WQ_ACCT_UNBOUND])
- create_io_worker(wq, wqe, IO_WQ_ACCT_UNBOUND);
- }
-}
-
static bool io_wq_work_match_all(struct io_wq_work *work, void *data)
{
return true;
}
-static void io_wq_cancel_pending(struct io_wq *wq)
-{
- struct io_cb_cancel_data match = {
- .fn = io_wq_work_match_all,
- .cancel_all = true,
- };
- int node;
-
- for_each_node(node)
- io_wqe_cancel_pending_work(wq->wqes[node], &match);
-}
-
-/*
- * Manager thread. Tasked with creating new workers, if we need them.
- */
-static int io_wq_manager(void *data)
-{
- struct io_wq *wq = data;
- char buf[TASK_COMM_LEN];
- int node;
-
- snprintf(buf, sizeof(buf), "iou-mgr-%d", wq->task_pid);
- set_task_comm(current, buf);
-
- do {
- set_current_state(TASK_INTERRUPTIBLE);
- io_wq_check_workers(wq);
- schedule_timeout(HZ);
- if (signal_pending(current)) {
- struct ksignal ksig;
-
- if (!get_signal(&ksig))
- continue;
- set_bit(IO_WQ_BIT_EXIT, &wq->state);
- }
- } while (!test_bit(IO_WQ_BIT_EXIT, &wq->state));
-
- io_wq_check_workers(wq);
-
- rcu_read_lock();
- for_each_node(node)
- io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL);
- rcu_read_unlock();
-
- if (atomic_dec_and_test(&wq->worker_refs))
- complete(&wq->worker_done);
- wait_for_completion(&wq->worker_done);
-
- spin_lock_irq(&wq->hash->wait.lock);
- for_each_node(node)
- list_del_init(&wq->wqes[node]->wait.entry);
- spin_unlock_irq(&wq->hash->wait.lock);
-
- io_wq_cancel_pending(wq);
- complete(&wq->exited);
- do_exit(0);
-}
-
static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
{
struct io_wq *wq = wqe->wq;
@@ -783,39 +729,13 @@ append:
wq_list_add_after(&work->list, &tail->list, &wqe->work_list);
}
-static int io_wq_fork_manager(struct io_wq *wq)
-{
- struct task_struct *tsk;
-
- if (wq->manager)
- return 0;
-
- WARN_ON_ONCE(test_bit(IO_WQ_BIT_EXIT, &wq->state));
-
- init_completion(&wq->worker_done);
- atomic_set(&wq->worker_refs, 1);
- tsk = create_io_thread(io_wq_manager, wq, NUMA_NO_NODE);
- if (!IS_ERR(tsk)) {
- wq->manager = get_task_struct(tsk);
- wake_up_new_task(tsk);
- return 0;
- }
-
- if (atomic_dec_and_test(&wq->worker_refs))
- complete(&wq->worker_done);
-
- return PTR_ERR(tsk);
-}
-
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
{
struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
int work_flags;
unsigned long flags;
- /* Can only happen if manager creation fails after exec */
- if (io_wq_fork_manager(wqe->wq) ||
- test_bit(IO_WQ_BIT_EXIT, &wqe->wq->state)) {
+ if (test_bit(IO_WQ_BIT_EXIT, &wqe->wq->state)) {
io_run_cancel(work, wqe);
return;
}
@@ -970,17 +890,12 @@ static int io_wqe_hash_wake(struct wait_queue_entry *wait, unsigned mode,
int sync, void *key)
{
struct io_wqe *wqe = container_of(wait, struct io_wqe, wait);
- int ret;
list_del_init(&wait->entry);
rcu_read_lock();
- ret = io_wqe_activate_free_worker(wqe);
+ io_wqe_activate_free_worker(wqe);
rcu_read_unlock();
-
- if (!ret)
- wake_up_process(wqe->wq->manager);
-
return 1;
}
@@ -1021,6 +936,8 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
goto err;
wq->wqes[node] = wqe;
wqe->node = alloc_node;
+ wqe->acct[IO_WQ_ACCT_BOUND].index = IO_WQ_ACCT_BOUND;
+ wqe->acct[IO_WQ_ACCT_UNBOUND].index = IO_WQ_ACCT_UNBOUND;
wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded;
atomic_set(&wqe->acct[IO_WQ_ACCT_BOUND].nr_running, 0);
wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers =
@@ -1035,13 +952,11 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
INIT_LIST_HEAD(&wqe->all_list);
}
- wq->task_pid = current->pid;
- init_completion(&wq->exited);
+ wq->task = get_task_struct(data->task);
refcount_set(&wq->refs, 1);
-
- ret = io_wq_fork_manager(wq);
- if (!ret)
- return wq;
+ atomic_set(&wq->worker_refs, 1);
+ init_completion(&wq->worker_done);
+ return wq;
err:
io_wq_put_hash(data->hash);
cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
@@ -1054,14 +969,49 @@ err_wq:
return ERR_PTR(ret);
}
-static void io_wq_destroy_manager(struct io_wq *wq)
+static bool io_task_work_match(struct callback_head *cb, void *data)
{
- if (wq->manager) {
- wake_up_process(wq->manager);
- wait_for_completion(&wq->exited);
- put_task_struct(wq->manager);
- wq->manager = NULL;
+ struct create_worker_data *cwd;
+
+ if (cb->func != create_worker_cb)
+ return false;
+ cwd = container_of(cb, struct create_worker_data, work);
+ return cwd->wqe->wq == data;
+}
+
+static void io_wq_exit_workers(struct io_wq *wq)
+{
+ struct callback_head *cb;
+ int node;
+
+ set_bit(IO_WQ_BIT_EXIT, &wq->state);
+
+ if (!wq->task)
+ return;
+
+ while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) {
+ struct create_worker_data *cwd;
+
+ cwd = container_of(cb, struct create_worker_data, work);
+ atomic_dec(&cwd->wqe->acct[cwd->index].nr_running);
+ io_worker_ref_put(wq);
+ kfree(cwd);
+ }
+
+ rcu_read_lock();
+ for_each_node(node) {
+ struct io_wqe *wqe = wq->wqes[node];
+
+ io_wq_for_each_worker(wqe, io_wq_worker_wake, NULL);
+ spin_lock_irq(&wq->hash->wait.lock);
+ list_del_init(&wq->wqes[node]->wait.entry);
+ spin_unlock_irq(&wq->hash->wait.lock);
}
+ rcu_read_unlock();
+ io_worker_ref_put(wq);
+ wait_for_completion(&wq->worker_done);
+ put_task_struct(wq->task);
+ wq->task = NULL;
}
static void io_wq_destroy(struct io_wq *wq)
@@ -1070,8 +1020,7 @@ static void io_wq_destroy(struct io_wq *wq)
cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
- set_bit(IO_WQ_BIT_EXIT, &wq->state);
- io_wq_destroy_manager(wq);
+ io_wq_exit_workers(wq);
for_each_node(node) {
struct io_wqe *wqe = wq->wqes[node];
@@ -1095,21 +1044,14 @@ void io_wq_put(struct io_wq *wq)
void io_wq_put_and_exit(struct io_wq *wq)
{
- set_bit(IO_WQ_BIT_EXIT, &wq->state);
- io_wq_destroy_manager(wq);
+ io_wq_exit_workers(wq);
io_wq_put(wq);
}
static bool io_wq_worker_affinity(struct io_worker *worker, void *data)
{
- struct task_struct *task = worker->task;
- struct rq_flags rf;
- struct rq *rq;
-
- rq = task_rq_lock(task, &rf);
- do_set_cpus_allowed(task, cpumask_of_node(worker->wqe->node));
- task->flags |= PF_NO_SETAFFINITY;
- task_rq_unlock(rq, task, &rf);
+ set_cpus_allowed_ptr(worker->task, cpumask_of_node(worker->wqe->node));
+
return false;
}
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 80d590564ff9..0e6d310999e8 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -116,6 +116,7 @@ static inline void io_wq_put_hash(struct io_wq_hash *hash)
struct io_wq_data {
struct io_wq_hash *hash;
+ struct task_struct *task;
io_wq_work_fn *do_work;
free_work_fn *free_work;
};
diff --git a/fs/io_uring.c b/fs/io_uring.c
index dff34975d86b..360f81395d81 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -194,43 +194,56 @@ enum io_uring_cmd_flags {
struct io_mapped_ubuf {
u64 ubuf;
- size_t len;
- struct bio_vec *bvec;
+ u64 ubuf_end;
unsigned int nr_bvecs;
unsigned long acct_pages;
+ struct bio_vec bvec[];
};
struct io_ring_ctx;
+struct io_overflow_cqe {
+ struct io_uring_cqe cqe;
+ struct list_head list;
+};
+
+struct io_fixed_file {
+ /* file * with additional FFS_* flags */
+ unsigned long file_ptr;
+};
+
struct io_rsrc_put {
struct list_head list;
+ u64 tag;
union {
void *rsrc;
struct file *file;
+ struct io_mapped_ubuf *buf;
};
};
-struct fixed_rsrc_table {
- struct file **files;
+struct io_file_table {
+ /* two level table */
+ struct io_fixed_file **files;
};
-struct fixed_rsrc_ref_node {
+struct io_rsrc_node {
struct percpu_ref refs;
struct list_head node;
struct list_head rsrc_list;
- struct fixed_rsrc_data *rsrc_data;
- void (*rsrc_put)(struct io_ring_ctx *ctx,
- struct io_rsrc_put *prsrc);
+ struct io_rsrc_data *rsrc_data;
struct llist_node llist;
bool done;
};
-struct fixed_rsrc_data {
- struct fixed_rsrc_table *table;
+typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
+
+struct io_rsrc_data {
struct io_ring_ctx *ctx;
- struct fixed_rsrc_ref_node *node;
- struct percpu_ref refs;
+ u64 *tags;
+ rsrc_put_fn *do_put;
+ atomic_t refs;
struct completion done;
bool quiesce;
};
@@ -330,7 +343,6 @@ struct io_ring_ctx {
struct {
unsigned int flags;
unsigned int compat: 1;
- unsigned int cq_overflow_flushed: 1;
unsigned int drain_next: 1;
unsigned int eventfd_async: 1;
unsigned int restricted: 1;
@@ -388,12 +400,14 @@ struct io_ring_ctx {
* readers must ensure that ->refs is alive as long as the file* is
* used. Only updated through io_uring_register(2).
*/
- struct fixed_rsrc_data *file_data;
+ struct io_rsrc_data *file_data;
+ struct io_file_table file_table;
unsigned nr_user_files;
/* if used, fixed mapped user buffers */
+ struct io_rsrc_data *buf_data;
unsigned nr_user_bufs;
- struct io_mapped_ubuf *user_bufs;
+ struct io_mapped_ubuf **user_bufs;
struct user_struct *user;
@@ -414,6 +428,7 @@ struct io_ring_ctx {
unsigned cq_mask;
atomic_t cq_timeouts;
unsigned cq_last_tm_flush;
+ unsigned cq_extra;
unsigned long cq_check_overflow;
struct wait_queue_head cq_wait;
struct fasync_struct *cq_fasync;
@@ -433,23 +448,20 @@ struct io_ring_ctx {
struct hlist_head *cancel_hash;
unsigned cancel_hash_bits;
bool poll_multi_file;
-
- spinlock_t inflight_lock;
- struct list_head inflight_list;
} ____cacheline_aligned_in_smp;
struct delayed_work rsrc_put_work;
struct llist_head rsrc_put_llist;
struct list_head rsrc_ref_list;
spinlock_t rsrc_ref_lock;
+ struct io_rsrc_node *rsrc_node;
+ struct io_rsrc_node *rsrc_backup_node;
struct io_restriction restrictions;
/* exit task_work */
struct callback_head *exit_task_work;
- struct wait_queue_head hash_wait;
-
/* Keep this last, we don't need it for the fast path */
struct work_struct exit_work;
struct list_head tctx_list;
@@ -462,8 +474,8 @@ struct io_uring_task {
const struct io_ring_ctx *last;
struct io_wq *io_wq;
struct percpu_counter inflight;
+ atomic_t inflight_tracked;
atomic_t in_idle;
- bool sqpoll;
spinlock_t task_lock;
struct io_wq_work_list task_list;
@@ -484,9 +496,13 @@ struct io_poll_iocb {
struct wait_queue_entry wait;
};
-struct io_poll_remove {
+struct io_poll_update {
struct file *file;
- u64 addr;
+ u64 old_user_data;
+ u64 new_user_data;
+ __poll_t events;
+ bool update_events;
+ bool update_user_data;
};
struct io_close {
@@ -556,8 +572,9 @@ struct io_connect {
struct io_sr_msg {
struct file *file;
union {
- struct user_msghdr __user *umsg;
- void __user *buf;
+ struct compat_msghdr __user *umsg_compat;
+ struct user_msghdr __user *umsg;
+ void __user *buf;
};
int msg_flags;
int bgid;
@@ -614,7 +631,7 @@ struct io_splice {
struct io_provide_buf {
struct file *file;
__u64 addr;
- __s32 len;
+ __u32 len;
__u32 bgid;
__u16 nbufs;
__u16 bid;
@@ -653,7 +670,7 @@ struct io_unlink {
struct io_completion {
struct file *file;
struct list_head list;
- int cflags;
+ u32 cflags;
};
struct io_async_connect {
@@ -690,14 +707,17 @@ enum {
REQ_F_CUR_POS_BIT,
REQ_F_NOWAIT_BIT,
REQ_F_LINK_TIMEOUT_BIT,
- REQ_F_ISREG_BIT,
REQ_F_NEED_CLEANUP_BIT,
REQ_F_POLLED_BIT,
REQ_F_BUFFER_SELECTED_BIT,
- REQ_F_NO_FILE_TABLE_BIT,
REQ_F_LTIMEOUT_ACTIVE_BIT,
REQ_F_COMPLETE_INLINE_BIT,
REQ_F_REISSUE_BIT,
+ REQ_F_DONT_REISSUE_BIT,
+ /* keep async read/write and isreg together and in order */
+ REQ_F_ASYNC_READ_BIT,
+ REQ_F_ASYNC_WRITE_BIT,
+ REQ_F_ISREG_BIT,
/* not a real bit, just to check we're not overflowing the space */
__REQ_F_LAST_BIT,
@@ -727,22 +747,26 @@ enum {
REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
/* has or had linked timeout */
REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
- /* regular file */
- REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
/* needs cleanup */
REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT),
/* already went through poll handler */
REQ_F_POLLED = BIT(REQ_F_POLLED_BIT),
/* buffer already selected */
REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
- /* doesn't need file table for this request */
- REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT),
/* linked timeout is active, i.e. prepared by link's head */
REQ_F_LTIMEOUT_ACTIVE = BIT(REQ_F_LTIMEOUT_ACTIVE_BIT),
/* completion is deferred through io_comp_state */
REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT),
/* caller should reissue async */
REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT),
+ /* don't attempt request reissue, see io_rw_reissue() */
+ REQ_F_DONT_REISSUE = BIT(REQ_F_DONT_REISSUE_BIT),
+ /* supports async reads */
+ REQ_F_ASYNC_READ = BIT(REQ_F_ASYNC_READ_BIT),
+ /* supports async writes */
+ REQ_F_ASYNC_WRITE = BIT(REQ_F_ASYNC_WRITE_BIT),
+ /* regular file */
+ REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
};
struct async_poll {
@@ -766,7 +790,7 @@ struct io_kiocb {
struct file *file;
struct io_rw rw;
struct io_poll_iocb poll;
- struct io_poll_remove poll_remove;
+ struct io_poll_update poll_update;
struct io_accept accept;
struct io_sync sync;
struct io_cancel cancel;
@@ -801,17 +825,14 @@ struct io_kiocb {
struct io_ring_ctx *ctx;
unsigned int flags;
- refcount_t refs;
+ atomic_t refs;
struct task_struct *task;
u64 user_data;
struct io_kiocb *link;
struct percpu_ref *fixed_rsrc_refs;
- /*
- * 1. used with ctx->iopoll_list with reads/writes
- * 2. to track reqs with ->files (see io_op_def::file_table)
- */
+ /* used with ctx->iopoll_list with reads/writes */
struct list_head inflight_entry;
union {
struct io_task_work io_task_work;
@@ -821,6 +842,8 @@ struct io_kiocb {
struct hlist_node hash_node;
struct async_poll *apoll;
struct io_wq_work work;
+ /* store used ubuf, so we can prevent reloading */
+ struct io_mapped_ubuf *imu;
};
struct io_tctx_node {
@@ -849,8 +872,8 @@ struct io_op_def {
unsigned pollout : 1;
/* op supports buffer selection */
unsigned buffer_select : 1;
- /* must always have async data allocated */
- unsigned needs_async_data : 1;
+ /* do prep async if is going to be punted */
+ unsigned needs_async_setup : 1;
/* should block plug */
unsigned plug : 1;
/* size of async data needed, if any */
@@ -864,7 +887,7 @@ static const struct io_op_def io_op_defs[] = {
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
- .needs_async_data = 1,
+ .needs_async_setup = 1,
.plug = 1,
.async_size = sizeof(struct io_async_rw),
},
@@ -873,7 +896,7 @@ static const struct io_op_def io_op_defs[] = {
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
- .needs_async_data = 1,
+ .needs_async_setup = 1,
.plug = 1,
.async_size = sizeof(struct io_async_rw),
},
@@ -907,7 +930,7 @@ static const struct io_op_def io_op_defs[] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
- .needs_async_data = 1,
+ .needs_async_setup = 1,
.async_size = sizeof(struct io_async_msghdr),
},
[IORING_OP_RECVMSG] = {
@@ -915,11 +938,10 @@ static const struct io_op_def io_op_defs[] = {
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
- .needs_async_data = 1,
+ .needs_async_setup = 1,
.async_size = sizeof(struct io_async_msghdr),
},
[IORING_OP_TIMEOUT] = {
- .needs_async_data = 1,
.async_size = sizeof(struct io_timeout_data),
},
[IORING_OP_TIMEOUT_REMOVE] = {
@@ -932,14 +954,13 @@ static const struct io_op_def io_op_defs[] = {
},
[IORING_OP_ASYNC_CANCEL] = {},
[IORING_OP_LINK_TIMEOUT] = {
- .needs_async_data = 1,
.async_size = sizeof(struct io_timeout_data),
},
[IORING_OP_CONNECT] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
- .needs_async_data = 1,
+ .needs_async_setup = 1,
.async_size = sizeof(struct io_async_connect),
},
[IORING_OP_FALLOCATE] = {
@@ -1008,40 +1029,31 @@ static void io_uring_del_task_file(unsigned long index);
static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
struct task_struct *task,
struct files_struct *files);
-static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx);
-static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node);
-static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node(
- struct io_ring_ctx *ctx);
-static void io_ring_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
-
-static bool io_rw_reissue(struct io_kiocb *req);
-static void io_cqring_fill_event(struct io_kiocb *req, long res);
+static void io_uring_cancel_sqpoll(struct io_sq_data *sqd);
+static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx);
+
+static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
+ long res, unsigned int cflags);
static void io_put_req(struct io_kiocb *req);
static void io_put_req_deferred(struct io_kiocb *req, int nr);
-static void io_double_put_req(struct io_kiocb *req);
static void io_dismantle_req(struct io_kiocb *req);
static void io_put_task(struct task_struct *task, int nr);
-static void io_queue_next(struct io_kiocb *req);
static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
-static void __io_queue_linked_timeout(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
-static int __io_sqe_files_update(struct io_ring_ctx *ctx,
- struct io_uring_rsrc_update *ip,
- unsigned nr_args);
-static void __io_clean_op(struct io_kiocb *req);
+static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
+ struct io_uring_rsrc_update2 *up,
+ unsigned nr_args);
+static void io_clean_op(struct io_kiocb *req);
static struct file *io_file_get(struct io_submit_state *state,
struct io_kiocb *req, int fd, bool fixed);
static void __io_queue_sqe(struct io_kiocb *req);
static void io_rsrc_put_work(struct work_struct *work);
-static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
- struct iov_iter *iter, bool needs_lock);
-static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
- const struct iovec *fast_iov,
- struct iov_iter *iter, bool force);
static void io_req_task_queue(struct io_kiocb *req);
static void io_submit_flush_completions(struct io_comp_state *cs,
struct io_ring_ctx *ctx);
+static bool io_poll_remove_waitqs(struct io_kiocb *req);
+static int io_req_prep_async(struct io_kiocb *req);
static struct kmem_cache *req_cachep;
@@ -1063,34 +1075,36 @@ EXPORT_SYMBOL(io_uring_get_socket);
#define io_for_each_link(pos, head) \
for (pos = (head); pos; pos = pos->link)
-static inline void io_clean_op(struct io_kiocb *req)
-{
- if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED))
- __io_clean_op(req);
-}
-
-static inline void io_set_resource_node(struct io_kiocb *req)
+static inline void io_req_set_rsrc_node(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
if (!req->fixed_rsrc_refs) {
- req->fixed_rsrc_refs = &ctx->file_data->node->refs;
+ req->fixed_rsrc_refs = &ctx->rsrc_node->refs;
percpu_ref_get(req->fixed_rsrc_refs);
}
}
+static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl)
+{
+ bool got = percpu_ref_tryget(ref);
+
+ /* already at zero, wait for ->release() */
+ if (!got)
+ wait_for_completion(compl);
+ percpu_ref_resurrect(ref);
+ if (got)
+ percpu_ref_put(ref);
+}
+
static bool io_match_task(struct io_kiocb *head,
struct task_struct *task,
struct files_struct *files)
{
struct io_kiocb *req;
- if (task && head->task != task) {
- /* in terms of cancelation, always match if req task is dead */
- if (head->task->flags & PF_EXITING)
- return true;
+ if (task && head->task != task)
return false;
- }
if (!files)
return true;
@@ -1103,7 +1117,7 @@ static bool io_match_task(struct io_kiocb *head,
static inline void req_set_fail_links(struct io_kiocb *req)
{
- if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
+ if (req->flags & REQ_F_LINK)
req->flags |= REQ_F_FAIL_LINK;
}
@@ -1161,8 +1175,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->iopoll_list);
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
- spin_lock_init(&ctx->inflight_lock);
- INIT_LIST_HEAD(&ctx->inflight_list);
spin_lock_init(&ctx->rsrc_ref_lock);
INIT_LIST_HEAD(&ctx->rsrc_ref_list);
INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
@@ -1182,7 +1194,7 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq)
if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
struct io_ring_ctx *ctx = req->ctx;
- return seq != ctx->cached_cq_tail
+ return seq + ctx->cq_extra != ctx->cached_cq_tail
+ READ_ONCE(ctx->cached_cq_overflow);
}
@@ -1191,14 +1203,9 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq)
static void io_req_track_inflight(struct io_kiocb *req)
{
- struct io_ring_ctx *ctx = req->ctx;
-
if (!(req->flags & REQ_F_INFLIGHT)) {
req->flags |= REQ_F_INFLIGHT;
-
- spin_lock_irq(&ctx->inflight_lock);
- list_add(&req->inflight_entry, &ctx->inflight_list);
- spin_unlock_irq(&ctx->inflight_lock);
+ atomic_inc(&current->io_uring->inflight_tracked);
}
}
@@ -1210,6 +1217,8 @@ static void io_prep_async_work(struct io_kiocb *req)
if (!req->work.creds)
req->work.creds = get_current_cred();
+ req->work.list.next = NULL;
+ req->work.flags = 0;
if (req->flags & REQ_F_FORCE_ASYNC)
req->work.flags |= IO_WQ_WORK_CONCURRENT;
@@ -1220,6 +1229,14 @@ static void io_prep_async_work(struct io_kiocb *req)
if (def->unbound_nonreg_file)
req->work.flags |= IO_WQ_WORK_UNBOUND;
}
+
+ switch (req->opcode) {
+ case IORING_OP_SPLICE:
+ case IORING_OP_TEE:
+ if (!S_ISREG(file_inode(req->splice.file_in)->i_mode))
+ req->work.flags |= IO_WQ_WORK_UNBOUND;
+ break;
+ }
}
static void io_prep_async_link(struct io_kiocb *req)
@@ -1249,16 +1266,15 @@ static void io_queue_async_work(struct io_kiocb *req)
}
static void io_kill_timeout(struct io_kiocb *req, int status)
+ __must_hold(&req->ctx->completion_lock)
{
struct io_timeout_data *io = req->async_data;
- int ret;
- ret = hrtimer_try_to_cancel(&io->timer);
- if (ret != -1) {
+ if (hrtimer_try_to_cancel(&io->timer) != -1) {
atomic_set(&req->ctx->cq_timeouts,
atomic_read(&req->ctx->cq_timeouts) + 1);
list_del_init(&req->timeout.list);
- io_cqring_fill_event(req, status);
+ io_cqring_fill_event(req->ctx, req->user_data, status, 0);
io_put_req_deferred(req, 1);
}
}
@@ -1336,7 +1352,7 @@ static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
}
-static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
+static inline struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
{
struct io_rings *rings = ctx->rings;
unsigned tail;
@@ -1355,13 +1371,11 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
{
- if (!ctx->cq_ev_fd)
+ if (likely(!ctx->cq_ev_fd))
return false;
if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
return false;
- if (!ctx->eventfd_async)
- return true;
- return io_wq_current_is_worker();
+ return !ctx->eventfd_async || io_wq_current_is_worker();
}
static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
@@ -1399,41 +1413,33 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
}
/* Returns true if there are no backlogged entries after the flush */
-static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
- struct task_struct *tsk,
- struct files_struct *files)
+static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
{
struct io_rings *rings = ctx->rings;
- struct io_kiocb *req, *tmp;
- struct io_uring_cqe *cqe;
unsigned long flags;
bool all_flushed, posted;
- LIST_HEAD(list);
if (!force && __io_cqring_events(ctx) == rings->cq_ring_entries)
return false;
posted = false;
spin_lock_irqsave(&ctx->completion_lock, flags);
- list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) {
- if (!io_match_task(req, tsk, files))
- continue;
+ while (!list_empty(&ctx->cq_overflow_list)) {
+ struct io_uring_cqe *cqe = io_get_cqring(ctx);
+ struct io_overflow_cqe *ocqe;
- cqe = io_get_cqring(ctx);
if (!cqe && !force)
break;
-
- list_move(&req->compl.list, &list);
- if (cqe) {
- WRITE_ONCE(cqe->user_data, req->user_data);
- WRITE_ONCE(cqe->res, req->result);
- WRITE_ONCE(cqe->flags, req->compl.cflags);
- } else {
- ctx->cached_cq_overflow++;
+ ocqe = list_first_entry(&ctx->cq_overflow_list,
+ struct io_overflow_cqe, list);
+ if (cqe)
+ memcpy(cqe, &ocqe->cqe, sizeof(*cqe));
+ else
WRITE_ONCE(ctx->rings->cq_overflow,
- ctx->cached_cq_overflow);
- }
+ ++ctx->cached_cq_overflow);
posted = true;
+ list_del(&ocqe->list);
+ kfree(ocqe);
}
all_flushed = list_empty(&ctx->cq_overflow_list);
@@ -1448,19 +1454,10 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
spin_unlock_irqrestore(&ctx->completion_lock, flags);
if (posted)
io_cqring_ev_posted(ctx);
-
- while (!list_empty(&list)) {
- req = list_first_entry(&list, struct io_kiocb, compl.list);
- list_del(&req->compl.list);
- io_put_req(req);
- }
-
return all_flushed;
}
-static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
- struct task_struct *tsk,
- struct files_struct *files)
+static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
{
bool ret = true;
@@ -1468,7 +1465,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
/* iopoll syncs against uring_lock, not completion_lock */
if (ctx->flags & IORING_SETUP_IOPOLL)
mutex_lock(&ctx->uring_lock);
- ret = __io_cqring_overflow_flush(ctx, force, tsk, files);
+ ret = __io_cqring_overflow_flush(ctx, force);
if (ctx->flags & IORING_SETUP_IOPOLL)
mutex_unlock(&ctx->uring_lock);
}
@@ -1476,12 +1473,74 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
return ret;
}
-static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
+/*
+ * Shamelessly stolen from the mm implementation of page reference checking,
+ * see commit f958d7b528b1 for details.
+ */
+#define req_ref_zero_or_close_to_overflow(req) \
+ ((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u)
+
+static inline bool req_ref_inc_not_zero(struct io_kiocb *req)
+{
+ return atomic_inc_not_zero(&req->refs);
+}
+
+static inline bool req_ref_sub_and_test(struct io_kiocb *req, int refs)
+{
+ WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
+ return atomic_sub_and_test(refs, &req->refs);
+}
+
+static inline bool req_ref_put_and_test(struct io_kiocb *req)
+{
+ WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
+ return atomic_dec_and_test(&req->refs);
+}
+
+static inline void req_ref_put(struct io_kiocb *req)
+{
+ WARN_ON_ONCE(req_ref_put_and_test(req));
+}
+
+static inline void req_ref_get(struct io_kiocb *req)
+{
+ WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
+ atomic_inc(&req->refs);
+}
+
+static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
+ long res, unsigned int cflags)
+{
+ struct io_overflow_cqe *ocqe;
+
+ ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT);
+ if (!ocqe) {
+ /*
+ * If we're in ring overflow flush mode, or in task cancel mode,
+ * or cannot allocate an overflow entry, then we need to drop it
+ * on the floor.
+ */
+ WRITE_ONCE(ctx->rings->cq_overflow, ++ctx->cached_cq_overflow);
+ return false;
+ }
+ if (list_empty(&ctx->cq_overflow_list)) {
+ set_bit(0, &ctx->sq_check_overflow);
+ set_bit(0, &ctx->cq_check_overflow);
+ ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
+ }
+ ocqe->cqe.user_data = user_data;
+ ocqe->cqe.res = res;
+ ocqe->cqe.flags = cflags;
+ list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
+ return true;
+}
+
+static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
+ long res, unsigned int cflags)
{
- struct io_ring_ctx *ctx = req->ctx;
struct io_uring_cqe *cqe;
- trace_io_uring_complete(ctx, req->user_data, res);
+ trace_io_uring_complete(ctx, user_data, res, cflags);
/*
* If we can't get a cq entry, userspace overflowed the
@@ -1490,35 +1549,19 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
*/
cqe = io_get_cqring(ctx);
if (likely(cqe)) {
- WRITE_ONCE(cqe->user_data, req->user_data);
+ WRITE_ONCE(cqe->user_data, user_data);
WRITE_ONCE(cqe->res, res);
WRITE_ONCE(cqe->flags, cflags);
- } else if (ctx->cq_overflow_flushed ||
- atomic_read(&req->task->io_uring->in_idle)) {
- /*
- * If we're in ring overflow flush mode, or in task cancel mode,
- * then we cannot store the request for later flushing, we need
- * to drop it on the floor.
- */
- ctx->cached_cq_overflow++;
- WRITE_ONCE(ctx->rings->cq_overflow, ctx->cached_cq_overflow);
- } else {
- if (list_empty(&ctx->cq_overflow_list)) {
- set_bit(0, &ctx->sq_check_overflow);
- set_bit(0, &ctx->cq_check_overflow);
- ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
- }
- io_clean_op(req);
- req->result = res;
- req->compl.cflags = cflags;
- refcount_inc(&req->refs);
- list_add_tail(&req->compl.list, &ctx->cq_overflow_list);
+ return true;
}
+ return io_cqring_event_overflow(ctx, user_data, res, cflags);
}
-static void io_cqring_fill_event(struct io_kiocb *req, long res)
+/* not as hot to bloat with inlining */
+static noinline bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
+ long res, unsigned int cflags)
{
- __io_cqring_fill_event(req, res, 0);
+ return __io_cqring_fill_event(ctx, user_data, res, cflags);
}
static void io_req_complete_post(struct io_kiocb *req, long res,
@@ -1528,12 +1571,12 @@ static void io_req_complete_post(struct io_kiocb *req, long res,
unsigned long flags;
spin_lock_irqsave(&ctx->completion_lock, flags);
- __io_cqring_fill_event(req, res, cflags);
+ __io_cqring_fill_event(ctx, req->user_data, res, cflags);
/*
* If we're the last reference to this request, add to our locked
* free_list cache.
*/
- if (refcount_dec_and_test(&req->refs)) {
+ if (req_ref_put_and_test(req)) {
struct io_comp_state *cs = &ctx->submit_state.comp;
if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
@@ -1561,10 +1604,17 @@ static void io_req_complete_post(struct io_kiocb *req, long res,
}
}
+static inline bool io_req_needs_clean(struct io_kiocb *req)
+{
+ return req->flags & (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP |
+ REQ_F_POLLED | REQ_F_INFLIGHT);
+}
+
static void io_req_complete_state(struct io_kiocb *req, long res,
unsigned int cflags)
{
- io_clean_op(req);
+ if (io_req_needs_clean(req))
+ io_clean_op(req);
req->result = res;
req->compl.cflags = cflags;
req->flags |= REQ_F_COMPLETE_INLINE;
@@ -1584,34 +1634,50 @@ static inline void io_req_complete(struct io_kiocb *req, long res)
__io_req_complete(req, 0, res, 0);
}
+static void io_req_complete_failed(struct io_kiocb *req, long res)
+{
+ req_set_fail_links(req);
+ io_put_req(req);
+ io_req_complete_post(req, res, 0);
+}
+
+static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
+ struct io_comp_state *cs)
+{
+ spin_lock_irq(&ctx->completion_lock);
+ list_splice_init(&cs->locked_free_list, &cs->free_list);
+ cs->locked_free_nr = 0;
+ spin_unlock_irq(&ctx->completion_lock);
+}
+
+/* Returns true IFF there are requests in the cache */
static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
{
struct io_submit_state *state = &ctx->submit_state;
struct io_comp_state *cs = &state->comp;
- struct io_kiocb *req = NULL;
+ int nr;
/*
* If we have more than a batch's worth of requests in our IRQ side
* locked cache, grab the lock and move them over to our submission
* side cache.
*/
- if (READ_ONCE(cs->locked_free_nr) > IO_COMPL_BATCH) {
- spin_lock_irq(&ctx->completion_lock);
- list_splice_init(&cs->locked_free_list, &cs->free_list);
- cs->locked_free_nr = 0;
- spin_unlock_irq(&ctx->completion_lock);
- }
+ if (READ_ONCE(cs->locked_free_nr) > IO_COMPL_BATCH)
+ io_flush_cached_locked_reqs(ctx, cs);
+ nr = state->free_reqs;
while (!list_empty(&cs->free_list)) {
- req = list_first_entry(&cs->free_list, struct io_kiocb,
- compl.list);
+ struct io_kiocb *req = list_first_entry(&cs->free_list,
+ struct io_kiocb, compl.list);
+
list_del(&req->compl.list);
- state->reqs[state->free_reqs++] = req;
- if (state->free_reqs == ARRAY_SIZE(state->reqs))
+ state->reqs[nr++] = req;
+ if (nr == ARRAY_SIZE(state->reqs))
break;
}
- return req != NULL;
+ state->free_reqs = nr;
+ return nr != 0;
}
static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
@@ -1647,37 +1713,28 @@ got_req:
return state->reqs[state->free_reqs];
}
-static inline void io_put_file(struct io_kiocb *req, struct file *file,
- bool fixed)
+static inline void io_put_file(struct file *file)
{
- if (!fixed)
+ if (file)
fput(file);
}
static void io_dismantle_req(struct io_kiocb *req)
{
- io_clean_op(req);
+ unsigned int flags = req->flags;
- if (req->async_data)
- kfree(req->async_data);
- if (req->file)
- io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
+ if (io_req_needs_clean(req))
+ io_clean_op(req);
+ if (!(flags & REQ_F_FIXED_FILE))
+ io_put_file(req->file);
if (req->fixed_rsrc_refs)
percpu_ref_put(req->fixed_rsrc_refs);
+ if (req->async_data)
+ kfree(req->async_data);
if (req->work.creds) {
put_cred(req->work.creds);
req->work.creds = NULL;
}
-
- if (req->flags & REQ_F_INFLIGHT) {
- struct io_ring_ctx *ctx = req->ctx;
- unsigned long flags;
-
- spin_lock_irqsave(&ctx->inflight_lock, flags);
- list_del(&req->inflight_entry);
- spin_unlock_irqrestore(&ctx->inflight_lock, flags);
- req->flags &= ~REQ_F_INFLIGHT;
- }
}
/* must to be called somewhat shortly after putting a request */
@@ -1714,7 +1771,6 @@ static bool io_kill_linked_timeout(struct io_kiocb *req)
__must_hold(&req->ctx->completion_lock)
{
struct io_kiocb *link = req->link;
- bool cancelled = false;
/*
* Can happen if a linked timeout fired and link had been like
@@ -1722,19 +1778,17 @@ static bool io_kill_linked_timeout(struct io_kiocb *req)
*/
if (link && (link->flags & REQ_F_LTIMEOUT_ACTIVE)) {
struct io_timeout_data *io = link->async_data;
- int ret;
io_remove_next_linked(req);
link->timeout.head = NULL;
- ret = hrtimer_try_to_cancel(&io->timer);
- if (ret != -1) {
- io_cqring_fill_event(link, -ECANCELED);
+ if (hrtimer_try_to_cancel(&io->timer) != -1) {
+ io_cqring_fill_event(link->ctx, link->user_data,
+ -ECANCELED, 0);
io_put_req_deferred(link, 1);
- cancelled = true;
+ return true;
}
}
- req->flags &= ~REQ_F_LINK_TIMEOUT;
- return cancelled;
+ return false;
}
static void io_fail_links(struct io_kiocb *req)
@@ -1748,7 +1802,7 @@ static void io_fail_links(struct io_kiocb *req)
link->link = NULL;
trace_io_uring_fail_link(req, link);
- io_cqring_fill_event(link, -ECANCELED);
+ io_cqring_fill_event(link->ctx, link->user_data, -ECANCELED, 0);
io_put_req_deferred(link, 2);
link = nxt;
}
@@ -1761,7 +1815,8 @@ static bool io_disarm_next(struct io_kiocb *req)
if (likely(req->flags & REQ_F_LINK_TIMEOUT))
posted = io_kill_linked_timeout(req);
- if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
+ if (unlikely((req->flags & REQ_F_FAIL_LINK) &&
+ !(req->flags & REQ_F_HARDLINK))) {
posted |= (req->link != NULL);
io_fail_links(req);
}
@@ -1859,13 +1914,17 @@ static void tctx_task_work(struct callback_head *cb)
cond_resched();
}
-static int io_task_work_add(struct task_struct *tsk, struct io_kiocb *req,
- enum task_work_notify_mode notify)
+static int io_req_task_work_add(struct io_kiocb *req)
{
+ struct task_struct *tsk = req->task;
struct io_uring_task *tctx = tsk->io_uring;
+ enum task_work_notify_mode notify;
struct io_wq_work_node *node, *prev;
unsigned long flags;
- int ret;
+ int ret = 0;
+
+ if (unlikely(tsk->flags & PF_EXITING))
+ return -ESRCH;
WARN_ON_ONCE(!tctx);
@@ -1878,14 +1937,23 @@ static int io_task_work_add(struct task_struct *tsk, struct io_kiocb *req,
test_and_set_bit(0, &tctx->task_state))
return 0;
- if (!task_work_add(tsk, &tctx->task_work, notify))
+ /*
+ * SQPOLL kernel thread doesn't need notification, just a wakeup. For
+ * all other cases, use TWA_SIGNAL unconditionally to ensure we're
+ * processing task_work. There's no reliable way to tell if TWA_RESUME
+ * will do the job.
+ */
+ notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL;
+
+ if (!task_work_add(tsk, &tctx->task_work, notify)) {
+ wake_up_process(tsk);
return 0;
+ }
/*
* Slow path - we failed, find and delete work. if the work is not
* in the list, it got run and we're fine.
*/
- ret = 0;
spin_lock_irqsave(&tctx->task_lock, flags);
wq_list_for_each(node, prev, &tctx->task_list) {
if (&req->io_task_work.node == node) {
@@ -1899,33 +1967,6 @@ static int io_task_work_add(struct task_struct *tsk, struct io_kiocb *req,
return ret;
}
-static int io_req_task_work_add(struct io_kiocb *req)
-{
- struct task_struct *tsk = req->task;
- struct io_ring_ctx *ctx = req->ctx;
- enum task_work_notify_mode notify;
- int ret;
-
- if (tsk->flags & PF_EXITING)
- return -ESRCH;
-
- /*
- * SQPOLL kernel thread doesn't need notification, just a wakeup. For
- * all other cases, use TWA_SIGNAL unconditionally to ensure we're
- * processing task_work. There's no reliable way to tell if TWA_RESUME
- * will do the job.
- */
- notify = TWA_NONE;
- if (!(ctx->flags & IORING_SETUP_SQPOLL))
- notify = TWA_SIGNAL;
-
- ret = io_task_work_add(tsk, req, notify);
- if (!ret)
- wake_up_process(tsk);
-
- return ret;
-}
-
static bool io_run_task_work_head(struct callback_head **work_head)
{
struct callback_head *work, *next;
@@ -1966,29 +2007,15 @@ static void io_req_task_work_add_fallback(struct io_kiocb *req,
io_task_work_add_head(&req->ctx->exit_task_work, &req->task_work);
}
-static void __io_req_task_cancel(struct io_kiocb *req, int error)
-{
- struct io_ring_ctx *ctx = req->ctx;
-
- spin_lock_irq(&ctx->completion_lock);
- io_cqring_fill_event(req, error);
- io_commit_cqring(ctx);
- spin_unlock_irq(&ctx->completion_lock);
-
- io_cqring_ev_posted(ctx);
- req_set_fail_links(req);
- io_double_put_req(req);
-}
-
static void io_req_task_cancel(struct callback_head *cb)
{
struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
struct io_ring_ctx *ctx = req->ctx;
+ /* ctx is guaranteed to stay alive while we hold uring_lock */
mutex_lock(&ctx->uring_lock);
- __io_req_task_cancel(req, req->result);
+ io_req_complete_failed(req, req->result);
mutex_unlock(&ctx->uring_lock);
- percpu_ref_put(&ctx->refs);
}
static void __io_req_task_submit(struct io_kiocb *req)
@@ -2000,7 +2027,7 @@ static void __io_req_task_submit(struct io_kiocb *req)
if (!(current->flags & PF_EXITING) && !current->in_execve)
__io_queue_sqe(req);
else
- __io_req_task_cancel(req, -EFAULT);
+ io_req_complete_failed(req, -EFAULT);
mutex_unlock(&ctx->uring_lock);
}
@@ -2011,27 +2038,21 @@ static void io_req_task_submit(struct callback_head *cb)
__io_req_task_submit(req);
}
-static void io_req_task_queue(struct io_kiocb *req)
+static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
{
- int ret;
+ req->result = ret;
+ req->task_work.func = io_req_task_cancel;
- req->task_work.func = io_req_task_submit;
- ret = io_req_task_work_add(req);
- if (unlikely(ret)) {
- req->result = -ECANCELED;
- percpu_ref_get(&req->ctx->refs);
+ if (unlikely(io_req_task_work_add(req)))
io_req_task_work_add_fallback(req, io_req_task_cancel);
- }
}
-static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
+static void io_req_task_queue(struct io_kiocb *req)
{
- percpu_ref_get(&req->ctx->refs);
- req->result = ret;
- req->task_work.func = io_req_task_cancel;
+ req->task_work.func = io_req_task_submit;
if (unlikely(io_req_task_work_add(req)))
- io_req_task_work_add_fallback(req, io_req_task_cancel);
+ io_req_task_queue_fail(req, -ECANCELED);
}
static inline void io_queue_next(struct io_kiocb *req)
@@ -2074,6 +2095,7 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req,
struct io_submit_state *state)
{
io_queue_next(req);
+ io_dismantle_req(req);
if (req->task != rb->task) {
if (rb->task)
@@ -2084,7 +2106,6 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req,
rb->task_refs++;
rb->ctx_refs++;
- io_dismantle_req(req);
if (state->free_reqs != ARRAY_SIZE(state->reqs))
state->reqs[state->free_reqs++] = req;
else
@@ -2102,7 +2123,8 @@ static void io_submit_flush_completions(struct io_comp_state *cs,
spin_lock_irq(&ctx->completion_lock);
for (i = 0; i < nr; i++) {
req = cs->reqs[i];
- __io_cqring_fill_event(req, req->result, req->compl.cflags);
+ __io_cqring_fill_event(ctx, req->user_data, req->result,
+ req->compl.cflags);
}
io_commit_cqring(ctx);
spin_unlock_irq(&ctx->completion_lock);
@@ -2112,7 +2134,7 @@ static void io_submit_flush_completions(struct io_comp_state *cs,
req = cs->reqs[i];
/* submission and completion refs */
- if (refcount_sub_and_test(2, &req->refs))
+ if (req_ref_sub_and_test(req, 2))
io_req_free_batch(&rb, req, &ctx->submit_state);
}
@@ -2124,20 +2146,20 @@ static void io_submit_flush_completions(struct io_comp_state *cs,
* Drop reference to request, return next in chain (if there is one) if this
* was the last reference to this request.
*/
-static struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
+static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
{
struct io_kiocb *nxt = NULL;
- if (refcount_dec_and_test(&req->refs)) {
+ if (req_ref_put_and_test(req)) {
nxt = io_req_find_next(req);
__io_free_req(req);
}
return nxt;
}
-static void io_put_req(struct io_kiocb *req)
+static inline void io_put_req(struct io_kiocb *req)
{
- if (refcount_dec_and_test(&req->refs))
+ if (req_ref_put_and_test(req))
io_free_req(req);
}
@@ -2150,27 +2172,17 @@ static void io_put_req_deferred_cb(struct callback_head *cb)
static void io_free_req_deferred(struct io_kiocb *req)
{
- int ret;
-
req->task_work.func = io_put_req_deferred_cb;
- ret = io_req_task_work_add(req);
- if (unlikely(ret))
+ if (unlikely(io_req_task_work_add(req)))
io_req_task_work_add_fallback(req, io_put_req_deferred_cb);
}
static inline void io_put_req_deferred(struct io_kiocb *req, int refs)
{
- if (refcount_sub_and_test(refs, &req->refs))
+ if (req_ref_sub_and_test(req, refs))
io_free_req_deferred(req);
}
-static void io_double_put_req(struct io_kiocb *req)
-{
- /* drop both submit and complete references */
- if (refcount_sub_and_test(2, &req->refs))
- io_free_req(req);
-}
-
static unsigned io_cqring_events(struct io_ring_ctx *ctx)
{
/* See comment at the top of this file */
@@ -2241,19 +2253,21 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
req = list_first_entry(done, struct io_kiocb, inflight_entry);
list_del(&req->inflight_entry);
- if (READ_ONCE(req->result) == -EAGAIN) {
+ if (READ_ONCE(req->result) == -EAGAIN &&
+ !(req->flags & REQ_F_DONT_REISSUE)) {
req->iopoll_completed = 0;
- if (io_rw_reissue(req))
- continue;
+ req_ref_get(req);
+ io_queue_async_work(req);
+ continue;
}
if (req->flags & REQ_F_BUFFER_SELECTED)
cflags = io_put_rw_kbuf(req);
- __io_cqring_fill_event(req, req->result, cflags);
+ __io_cqring_fill_event(ctx, req->user_data, req->result, cflags);
(*nr_events)++;
- if (refcount_dec_and_test(&req->refs))
+ if (req_ref_put_and_test(req))
io_req_free_batch(&rb, req, &ctx->submit_state);
}
@@ -2312,27 +2326,6 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
}
/*
- * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
- * non-spinning poll check - we'll still enter the driver poll loop, but only
- * as a non-spinning completion check.
- */
-static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
- long min)
-{
- while (!list_empty(&ctx->iopoll_list) && !need_resched()) {
- int ret;
-
- ret = io_do_iopoll(ctx, nr_events, min);
- if (ret < 0)
- return ret;
- if (*nr_events >= min)
- return 0;
- }
-
- return 1;
-}
-
-/*
* We can't just wait for polled events to come to us, we have to actively
* find and complete them.
*/
@@ -2367,7 +2360,7 @@ static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
{
unsigned int nr_events = 0;
- int iters = 0, ret = 0;
+ int ret = 0;
/*
* We disallow the app entering submit/complete with polling, but we
@@ -2375,18 +2368,17 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
* that got punted to a workqueue.
*/
mutex_lock(&ctx->uring_lock);
+ /*
+ * Don't enter poll loop if we already have events pending.
+ * If we do, we can potentially be spinning for commands that
+ * already triggered a CQE (eg in error).
+ */
+ if (test_bit(0, &ctx->cq_check_overflow))
+ __io_cqring_overflow_flush(ctx, false);
+ if (io_cqring_events(ctx))
+ goto out;
do {
/*
- * Don't enter poll loop if we already have events pending.
- * If we do, we can potentially be spinning for commands that
- * already triggered a CQE (eg in error).
- */
- if (test_bit(0, &ctx->cq_check_overflow))
- __io_cqring_overflow_flush(ctx, false, NULL, NULL);
- if (io_cqring_events(ctx))
- break;
-
- /*
* If a submit got punted to a workqueue, we can have the
* application entering polling for a command before it gets
* issued. That app will hold the uring_lock for the duration
@@ -2396,18 +2388,17 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
* forever, while the workqueue is stuck trying to acquire the
* very same mutex.
*/
- if (!(++iters & 7)) {
+ if (list_empty(&ctx->iopoll_list)) {
mutex_unlock(&ctx->uring_lock);
io_run_task_work();
mutex_lock(&ctx->uring_lock);
- }
-
- ret = io_iopoll_getevents(ctx, &nr_events, min);
- if (ret <= 0)
- break;
- ret = 0;
- } while (min && !nr_events && !need_resched());
+ if (list_empty(&ctx->iopoll_list))
+ break;
+ }
+ ret = io_do_iopoll(ctx, &nr_events, min);
+ } while (!ret && nr_events < min && !need_resched());
+out:
mutex_unlock(&ctx->uring_lock);
return ret;
}
@@ -2419,45 +2410,23 @@ static void kiocb_end_write(struct io_kiocb *req)
* thread.
*/
if (req->flags & REQ_F_ISREG) {
- struct inode *inode = file_inode(req->file);
+ struct super_block *sb = file_inode(req->file)->i_sb;
- __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
+ __sb_writers_acquired(sb, SB_FREEZE_WRITE);
+ sb_end_write(sb);
}
- file_end_write(req->file);
}
#ifdef CONFIG_BLOCK
static bool io_resubmit_prep(struct io_kiocb *req)
{
- struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
- int rw, ret;
- struct iov_iter iter;
-
- /* already prepared */
- if (req->async_data)
- return true;
-
- switch (req->opcode) {
- case IORING_OP_READV:
- case IORING_OP_READ_FIXED:
- case IORING_OP_READ:
- rw = READ;
- break;
- case IORING_OP_WRITEV:
- case IORING_OP_WRITE_FIXED:
- case IORING_OP_WRITE:
- rw = WRITE;
- break;
- default:
- printk_once(KERN_WARNING "io_uring: bad opcode in resubmit %d\n",
- req->opcode);
- return false;
- }
+ struct io_async_rw *rw = req->async_data;
- ret = io_import_iovec(rw, req, &iovec, &iter, false);
- if (ret < 0)
- return false;
- return !io_setup_async_rw(req, iovec, inline_vecs, &iter, false);
+ if (!rw)
+ return !io_req_prep_async(req);
+ /* may have left rw->iter inconsistent on -EIOCBQUEUED */
+ iov_iter_revert(&rw->iter, req->result - iov_iter_count(&rw->iter));
+ return true;
}
static bool io_rw_should_reissue(struct io_kiocb *req)
@@ -2480,29 +2449,15 @@ static bool io_rw_should_reissue(struct io_kiocb *req)
return true;
}
#else
-static bool io_rw_should_reissue(struct io_kiocb *req)
+static bool io_resubmit_prep(struct io_kiocb *req)
{
return false;
}
-#endif
-
-static bool io_rw_reissue(struct io_kiocb *req)
+static bool io_rw_should_reissue(struct io_kiocb *req)
{
-#ifdef CONFIG_BLOCK
- if (!io_rw_should_reissue(req))
- return false;
-
- lockdep_assert_held(&req->ctx->uring_lock);
-
- if (io_resubmit_prep(req)) {
- refcount_inc(&req->refs);
- io_queue_async_work(req);
- return true;
- }
- req_set_fail_links(req);
-#endif
return false;
}
+#endif
static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
unsigned int issue_flags)
@@ -2511,12 +2466,14 @@ static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
if (req->rw.kiocb.ki_flags & IOCB_WRITE)
kiocb_end_write(req);
- if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_should_reissue(req)) {
- req->flags |= REQ_F_REISSUE;
- return;
- }
- if (res != req->result)
+ if (res != req->result) {
+ if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
+ io_rw_should_reissue(req)) {
+ req->flags |= REQ_F_REISSUE;
+ return;
+ }
req_set_fail_links(req);
+ }
if (req->flags & REQ_F_BUFFER_SELECTED)
cflags = io_put_rw_kbuf(req);
__io_req_complete(req, issue_flags, res, cflags);
@@ -2533,27 +2490,18 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
-#ifdef CONFIG_BLOCK
- /* Rewind iter, if we have one. iopoll path resubmits as usual */
- if (res == -EAGAIN && io_rw_should_reissue(req)) {
- struct io_async_rw *rw = req->async_data;
-
- if (rw)
- iov_iter_revert(&rw->iter,
- req->result - iov_iter_count(&rw->iter));
- else if (!io_resubmit_prep(req))
- res = -EIO;
- }
-#endif
-
if (kiocb->ki_flags & IOCB_WRITE)
kiocb_end_write(req);
-
- if (res != -EAGAIN && res != req->result)
- req_set_fail_links(req);
+ if (unlikely(res != req->result)) {
+ if (!(res == -EAGAIN && io_rw_should_reissue(req) &&
+ io_resubmit_prep(req))) {
+ req_set_fail_links(req);
+ req->flags |= REQ_F_DONT_REISSUE;
+ }
+ }
WRITE_ONCE(req->result, res);
- /* order with io_poll_complete() checking ->result */
+ /* order with io_iopoll_complete() checking ->result */
smp_wmb();
WRITE_ONCE(req->iopoll_completed, 1);
}
@@ -2561,7 +2509,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
/*
* After the iocb has been issued, it's safe to be found on the poll list.
* Adding the kiocb to the list AFTER submission ensures that we don't
- * find it from a io_iopoll_getevents() thread before the issuer is done
+ * find it from a io_do_iopoll() thread before the issuer is done
* accessing the kiocb cookie.
*/
static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async)
@@ -2647,7 +2595,7 @@ static bool io_bdev_nowait(struct block_device *bdev)
* any file. For now, just ensure that anything potentially problematic is done
* inline.
*/
-static bool io_file_supports_async(struct file *file, int rw)
+static bool __io_file_supports_async(struct file *file, int rw)
{
umode_t mode = file_inode(file)->i_mode;
@@ -2680,6 +2628,16 @@ static bool io_file_supports_async(struct file *file, int rw)
return file->f_op->write_iter != NULL;
}
+static bool io_file_supports_async(struct io_kiocb *req, int rw)
+{
+ if (rw == READ && (req->flags & REQ_F_ASYNC_READ))
+ return true;
+ else if (rw == WRITE && (req->flags & REQ_F_ASYNC_WRITE))
+ return true;
+
+ return __io_file_supports_async(req->file, rw);
+}
+
static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -2688,7 +2646,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
unsigned ioprio;
int ret;
- if (S_ISREG(file_inode(file)->i_mode))
+ if (!(req->flags & REQ_F_ISREG) && S_ISREG(file_inode(file)->i_mode))
req->flags |= REQ_F_ISREG;
kiocb->ki_pos = READ_ONCE(sqe->off);
@@ -2730,6 +2688,12 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
kiocb->ki_complete = io_complete_rw;
}
+ if (req->opcode == IORING_OP_READ_FIXED ||
+ req->opcode == IORING_OP_WRITE_FIXED) {
+ req->imu = NULL;
+ io_req_set_rsrc_node(req);
+ }
+
req->rw.addr = READ_ONCE(sqe->addr);
req->rw.len = READ_ONCE(sqe->len);
req->buf_index = READ_ONCE(sqe->buf_index);
@@ -2781,7 +2745,10 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
if (check_reissue && req->flags & REQ_F_REISSUE) {
req->flags &= ~REQ_F_REISSUE;
- if (!io_rw_reissue(req)) {
+ if (io_resubmit_prep(req)) {
+ req_ref_get(req);
+ io_queue_async_work(req);
+ } else {
int cflags = 0;
req_set_fail_links(req);
@@ -2792,26 +2759,17 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
}
}
-static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
+static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
+ struct io_mapped_ubuf *imu)
{
- struct io_ring_ctx *ctx = req->ctx;
size_t len = req->rw.len;
- struct io_mapped_ubuf *imu;
- u16 index, buf_index = req->buf_index;
+ u64 buf_end, buf_addr = req->rw.addr;
size_t offset;
- u64 buf_addr;
-
- if (unlikely(buf_index >= ctx->nr_user_bufs))
- return -EFAULT;
- index = array_index_nospec(buf_index, ctx->nr_user_bufs);
- imu = &ctx->user_bufs[index];
- buf_addr = req->rw.addr;
- /* overflow */
- if (buf_addr + len < buf_addr)
+ if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
return -EFAULT;
/* not inside the mapped region */
- if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
+ if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
return -EFAULT;
/*
@@ -2859,6 +2817,22 @@ static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
return 0;
}
+static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_mapped_ubuf *imu = req->imu;
+ u16 index, buf_index = req->buf_index;
+
+ if (likely(!imu)) {
+ if (unlikely(buf_index >= ctx->nr_user_bufs))
+ return -EFAULT;
+ index = array_index_nospec(buf_index, ctx->nr_user_bufs);
+ imu = READ_ONCE(ctx->user_bufs[index]);
+ req->imu = imu;
+ }
+ return __io_import_fixed(req, rw, iter, imu);
+}
+
static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
{
if (needs_lock)
@@ -3126,29 +3100,21 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
}
}
-static inline int __io_alloc_async_data(struct io_kiocb *req)
+static inline int io_alloc_async_data(struct io_kiocb *req)
{
WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
return req->async_data == NULL;
}
-static int io_alloc_async_data(struct io_kiocb *req)
-{
- if (!io_op_defs[req->opcode].needs_async_data)
- return 0;
-
- return __io_alloc_async_data(req);
-}
-
static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
const struct iovec *fast_iov,
struct iov_iter *iter, bool force)
{
- if (!force && !io_op_defs[req->opcode].needs_async_data)
+ if (!force && !io_op_defs[req->opcode].needs_async_setup)
return 0;
if (!req->async_data) {
- if (__io_alloc_async_data(req)) {
+ if (io_alloc_async_data(req)) {
kfree(iovec);
return -ENOMEM;
}
@@ -3208,7 +3174,7 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
list_del_init(&wait->entry);
/* submit ref gets dropped, acquire a new one */
- refcount_inc(&req->refs);
+ req_ref_get(req);
io_req_task_queue(req);
return 1;
}
@@ -3293,7 +3259,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
kiocb->ki_flags |= IOCB_NOWAIT;
/* If the file doesn't support async, just async punt */
- if (force_nonblock && !io_file_supports_async(req->file, READ)) {
+ if (force_nonblock && !io_file_supports_async(req, READ)) {
ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
return ret ?: -EAGAIN;
}
@@ -3398,7 +3364,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
kiocb->ki_flags |= IOCB_NOWAIT;
/* If the file doesn't support async, just async punt */
- if (force_nonblock && !io_file_supports_async(req->file, WRITE))
+ if (force_nonblock && !io_file_supports_async(req, WRITE))
goto copy_iov;
/* file path doesn't support NOWAIT for non-direct_IO */
@@ -3617,15 +3583,6 @@ static int __io_splice_prep(struct io_kiocb *req,
if (!sp->file_in)
return -EBADF;
req->flags |= REQ_F_NEED_CLEANUP;
-
- if (!S_ISREG(file_inode(sp->file_in)->i_mode)) {
- /*
- * Splice operation will be punted aync, and here need to
- * modify io_wq_work.flags, so initialize io_wq_work firstly.
- */
- req->work.flags |= IO_WQ_WORK_UNBOUND;
- }
-
return 0;
}
@@ -3650,7 +3607,8 @@ static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
if (sp->len)
ret = do_tee(in, out, sp->len, flags);
- io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
+ if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
+ io_put_file(in);
req->flags &= ~REQ_F_NEED_CLEANUP;
if (ret != sp->len)
@@ -3686,7 +3644,8 @@ static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
if (sp->len)
ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
- io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
+ if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
+ io_put_file(in);
req->flags &= ~REQ_F_NEED_CLEANUP;
if (ret != sp->len)
@@ -3892,7 +3851,7 @@ err:
req->flags &= ~REQ_F_NEED_CLEANUP;
if (ret < 0)
req_set_fail_links(req);
- io_req_complete(req, ret);
+ __io_req_complete(req, issue_flags, ret, 0);
return 0;
}
@@ -3965,21 +3924,16 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
if (ret < 0)
req_set_fail_links(req);
- /* need to hold the lock to complete IOPOLL requests */
- if (ctx->flags & IORING_SETUP_IOPOLL) {
- __io_req_complete(req, issue_flags, ret, 0);
- io_ring_submit_unlock(ctx, !force_nonblock);
- } else {
- io_ring_submit_unlock(ctx, !force_nonblock);
- __io_req_complete(req, issue_flags, ret, 0);
- }
+ /* complete before unlock, IOPOLL may need the lock */
+ __io_req_complete(req, issue_flags, ret, 0);
+ io_ring_submit_unlock(ctx, !force_nonblock);
return 0;
}
static int io_provide_buffers_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
- unsigned long size;
+ unsigned long size, tmp_check;
struct io_provide_buf *p = &req->pbuf;
u64 tmp;
@@ -3993,6 +3947,12 @@ static int io_provide_buffers_prep(struct io_kiocb *req,
p->addr = READ_ONCE(sqe->addr);
p->len = READ_ONCE(sqe->len);
+ if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
+ &size))
+ return -EOVERFLOW;
+ if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
+ return -EOVERFLOW;
+
size = (unsigned long)p->len * p->nbufs;
if (!access_ok(u64_to_user_ptr(p->addr), size))
return -EFAULT;
@@ -4054,15 +4014,9 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
}
if (ret < 0)
req_set_fail_links(req);
-
- /* need to hold the lock to complete IOPOLL requests */
- if (ctx->flags & IORING_SETUP_IOPOLL) {
- __io_req_complete(req, issue_flags, ret, 0);
- io_ring_submit_unlock(ctx, !force_nonblock);
- } else {
- io_ring_submit_unlock(ctx, !force_nonblock);
- __io_req_complete(req, issue_flags, ret, 0);
- }
+ /* complete before unlock, IOPOLL may need the lock */
+ __io_req_complete(req, issue_flags, ret, 0);
+ io_ring_submit_unlock(ctx, !force_nonblock);
return 0;
}
@@ -4181,7 +4135,7 @@ static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
if (ret < 0)
req_set_fail_links(req);
- io_req_complete(req, ret);
+ __io_req_complete(req, issue_flags, ret, 0);
return 0;
}
@@ -4208,12 +4162,8 @@ static int io_statx(struct io_kiocb *req, unsigned int issue_flags)
struct io_statx *ctx = &req->statx;
int ret;
- if (issue_flags & IO_URING_F_NONBLOCK) {
- /* only need file table for an actual valid fd */
- if (ctx->dfd == -1 || ctx->dfd == AT_FDCWD)
- req->flags |= REQ_F_NO_FILE_TABLE;
+ if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
- }
ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
ctx->buffer);
@@ -4243,11 +4193,9 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags)
struct files_struct *files = current->files;
struct io_close *close = &req->close;
struct fdtable *fdt;
- struct file *file;
- int ret;
+ struct file *file = NULL;
+ int ret = -EBADF;
- file = NULL;
- ret = -EBADF;
spin_lock(&files->file_lock);
fdt = files_fdtable(files);
if (close->fd >= fdt->max_fds) {
@@ -4255,12 +4203,7 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags)
goto err;
}
file = fdt->fd[close->fd];
- if (!file) {
- spin_unlock(&files->file_lock);
- goto err;
- }
-
- if (file->f_op == &io_uring_fops) {
+ if (!file || file->f_op == &io_uring_fops) {
spin_unlock(&files->file_lock);
file = NULL;
goto err;
@@ -4358,8 +4301,6 @@ static int io_sendmsg_prep_async(struct io_kiocb *req)
{
int ret;
- if (!io_op_defs[req->opcode].needs_async_data)
- return 0;
ret = io_sendmsg_copy_hdr(req, req->async_data);
if (!ret)
req->flags |= REQ_F_NEED_CLEANUP;
@@ -4373,9 +4314,11 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- sr->msg_flags = READ_ONCE(sqe->msg_flags);
sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
+ sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
+ if (sr->msg_flags & MSG_DONTWAIT)
+ req->flags |= REQ_F_NOWAIT;
#ifdef CONFIG_COMPAT
if (req->ctx->compat)
@@ -4404,12 +4347,9 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
kmsg = &iomsg;
}
- flags = req->sr_msg.msg_flags | MSG_NOSIGNAL;
- if (flags & MSG_DONTWAIT)
- req->flags |= REQ_F_NOWAIT;
- else if (issue_flags & IO_URING_F_NONBLOCK)
+ flags = req->sr_msg.msg_flags;
+ if (issue_flags & IO_URING_F_NONBLOCK)
flags |= MSG_DONTWAIT;
-
if (flags & MSG_WAITALL)
min_ret = iov_iter_count(&kmsg->msg.msg_iter);
@@ -4452,12 +4392,9 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags)
msg.msg_controllen = 0;
msg.msg_namelen = 0;
- flags = req->sr_msg.msg_flags | MSG_NOSIGNAL;
- if (flags & MSG_DONTWAIT)
- req->flags |= REQ_F_NOWAIT;
- else if (issue_flags & IO_URING_F_NONBLOCK)
+ flags = req->sr_msg.msg_flags;
+ if (issue_flags & IO_URING_F_NONBLOCK)
flags |= MSG_DONTWAIT;
-
if (flags & MSG_WAITALL)
min_ret = iov_iter_count(&msg.msg_iter);
@@ -4510,16 +4447,14 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
struct io_async_msghdr *iomsg)
{
- struct compat_msghdr __user *msg_compat;
struct io_sr_msg *sr = &req->sr_msg;
struct compat_iovec __user *uiov;
compat_uptr_t ptr;
compat_size_t len;
int ret;
- msg_compat = (struct compat_msghdr __user *) sr->umsg;
- ret = __get_compat_msghdr(&iomsg->msg, msg_compat, &iomsg->uaddr,
- &ptr, &len);
+ ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr,
+ &ptr, &len);
if (ret)
return ret;
@@ -4587,8 +4522,6 @@ static int io_recvmsg_prep_async(struct io_kiocb *req)
{
int ret;
- if (!io_op_defs[req->opcode].needs_async_data)
- return 0;
ret = io_recvmsg_copy_hdr(req, req->async_data);
if (!ret)
req->flags |= REQ_F_NEED_CLEANUP;
@@ -4602,10 +4535,12 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- sr->msg_flags = READ_ONCE(sqe->msg_flags);
sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
sr->bgid = READ_ONCE(sqe->buf_group);
+ sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
+ if (sr->msg_flags & MSG_DONTWAIT)
+ req->flags |= REQ_F_NOWAIT;
#ifdef CONFIG_COMPAT
if (req->ctx->compat)
@@ -4646,12 +4581,9 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
1, req->sr_msg.len);
}
- flags = req->sr_msg.msg_flags | MSG_NOSIGNAL;
- if (flags & MSG_DONTWAIT)
- req->flags |= REQ_F_NOWAIT;
- else if (force_nonblock)
+ flags = req->sr_msg.msg_flags;
+ if (force_nonblock)
flags |= MSG_DONTWAIT;
-
if (flags & MSG_WAITALL)
min_ret = iov_iter_count(&kmsg->msg.msg_iter);
@@ -4709,12 +4641,9 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
msg.msg_iocb = NULL;
msg.msg_flags = 0;
- flags = req->sr_msg.msg_flags | MSG_NOSIGNAL;
- if (flags & MSG_DONTWAIT)
- req->flags |= REQ_F_NOWAIT;
- else if (force_nonblock)
+ flags = req->sr_msg.msg_flags;
+ if (force_nonblock)
flags |= MSG_DONTWAIT;
-
if (flags & MSG_WAITALL)
min_ret = iov_iter_count(&msg.msg_iter);
@@ -4884,7 +4813,6 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
req->result = mask;
req->task_work.func = func;
- percpu_ref_get(&req->ctx->refs);
/*
* If this fails, then the task is exiting. When a task exits, the
@@ -4936,6 +4864,7 @@ static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
}
static void io_poll_remove_double(struct io_kiocb *req)
+ __must_hold(&req->ctx->completion_lock)
{
struct io_poll_iocb *poll = io_poll_get_double(req);
@@ -4947,20 +4876,37 @@ static void io_poll_remove_double(struct io_kiocb *req)
spin_lock(&head->lock);
list_del_init(&poll->wait.entry);
if (poll->wait.private)
- refcount_dec(&req->refs);
+ req_ref_put(req);
poll->head = NULL;
spin_unlock(&head->lock);
}
}
-static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
+static bool io_poll_complete(struct io_kiocb *req, __poll_t mask)
+ __must_hold(&req->ctx->completion_lock)
{
struct io_ring_ctx *ctx = req->ctx;
+ unsigned flags = IORING_CQE_F_MORE;
+ int error;
+
+ if (READ_ONCE(req->poll.canceled)) {
+ error = -ECANCELED;
+ req->poll.events |= EPOLLONESHOT;
+ } else {
+ error = mangle_poll(mask);
+ }
+ if (req->poll.events & EPOLLONESHOT)
+ flags = 0;
+ if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) {
+ io_poll_remove_waitqs(req);
+ req->poll.done = true;
+ flags = 0;
+ }
+ if (flags & IORING_CQE_F_MORE)
+ ctx->cq_extra++;
- io_poll_remove_double(req);
- req->poll.done = true;
- io_cqring_fill_event(req, error ? error : mangle_poll(mask));
io_commit_cqring(ctx);
+ return !(flags & IORING_CQE_F_MORE);
}
static void io_poll_task_func(struct callback_head *cb)
@@ -4972,17 +4918,24 @@ static void io_poll_task_func(struct callback_head *cb)
if (io_poll_rewait(req, &req->poll)) {
spin_unlock_irq(&ctx->completion_lock);
} else {
- hash_del(&req->hash_node);
- io_poll_complete(req, req->result, 0);
- spin_unlock_irq(&ctx->completion_lock);
+ bool done;
- nxt = io_put_req_find_next(req);
+ done = io_poll_complete(req, req->result);
+ if (done) {
+ hash_del(&req->hash_node);
+ } else {
+ req->result = 0;
+ add_wait_queue(req->poll.head, &req->poll.wait);
+ }
+ spin_unlock_irq(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
- if (nxt)
- __io_req_task_submit(nxt);
- }
- percpu_ref_put(&ctx->refs);
+ if (done) {
+ nxt = io_put_req_find_next(req);
+ if (nxt)
+ __io_req_task_submit(nxt);
+ }
+ }
}
static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
@@ -4995,6 +4948,8 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
/* for instances that support it check for an event match first: */
if (mask && !(mask & poll->events))
return 0;
+ if (!(poll->events & EPOLLONESHOT))
+ return poll->wait.func(&poll->wait, mode, sync, key);
list_del_init(&wait->entry);
@@ -5013,7 +4968,7 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
poll->wait.func(&poll->wait, mode, sync, key);
}
}
- refcount_dec(&req->refs);
+ req_ref_put(req);
return 1;
}
@@ -5023,7 +4978,9 @@ static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
poll->head = NULL;
poll->done = false;
poll->canceled = false;
- poll->events = events;
+#define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
+ /* mask in events that we always want/need */
+ poll->events = events | IO_POLL_UNMASK;
INIT_LIST_HEAD(&poll->wait.entry);
init_waitqueue_func_entry(&poll->wait, wake_func);
}
@@ -5047,6 +5004,12 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
pt->error = -EINVAL;
return;
}
+ /*
+ * Can't handle multishot for double wait for now, turn it
+ * into one-shot mode.
+ */
+ if (!(req->poll.events & EPOLLONESHOT))
+ req->poll.events |= EPOLLONESHOT;
/* double add on the same waitqueue head, ignore */
if (poll->head == head)
return;
@@ -5056,7 +5019,7 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
return;
}
io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake);
- refcount_inc(&req->refs);
+ req_ref_get(req);
poll->wait.private = req;
*poll_ptr = poll;
}
@@ -5089,25 +5052,17 @@ static void io_async_task_func(struct callback_head *cb)
if (io_poll_rewait(req, &apoll->poll)) {
spin_unlock_irq(&ctx->completion_lock);
- percpu_ref_put(&ctx->refs);
return;
}
- /* If req is still hashed, it cannot have been canceled. Don't check. */
- if (hash_hashed(&req->hash_node))
- hash_del(&req->hash_node);
-
+ hash_del(&req->hash_node);
io_poll_remove_double(req);
spin_unlock_irq(&ctx->completion_lock);
if (!READ_ONCE(apoll->poll.canceled))
__io_req_task_submit(req);
else
- __io_req_task_cancel(req, -ECANCELED);
-
- percpu_ref_put(&ctx->refs);
- kfree(apoll->double_poll);
- kfree(apoll);
+ io_req_complete_failed(req, -ECANCELED);
}
static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
@@ -5160,7 +5115,7 @@ static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
ipt->error = 0;
mask = 0;
}
- if (mask || ipt->error)
+ if ((mask && (poll->events & EPOLLONESHOT)) || ipt->error)
list_del_init(&poll->wait.entry);
else if (cancel)
WRITE_ONCE(poll->canceled, true);
@@ -5192,7 +5147,7 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
else
return false;
/* if we can't nonblock try, then no point in arming a poll handler */
- if (!io_file_supports_async(req->file, rw))
+ if (!io_file_supports_async(req, rw))
return false;
apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
@@ -5203,7 +5158,7 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
req->flags |= REQ_F_POLLED;
req->apoll = apoll;
- mask = 0;
+ mask = EPOLLONESHOT;
if (def->pollin)
mask |= POLLIN | POLLRDNORM;
if (def->pollout)
@@ -5223,8 +5178,6 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
if (ret || ipt.error) {
io_poll_remove_double(req);
spin_unlock_irq(&ctx->completion_lock);
- kfree(apoll->double_poll);
- kfree(apoll);
return false;
}
spin_unlock_irq(&ctx->completion_lock);
@@ -5234,12 +5187,16 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
}
static bool __io_poll_remove_one(struct io_kiocb *req,
- struct io_poll_iocb *poll)
+ struct io_poll_iocb *poll, bool do_cancel)
+ __must_hold(&req->ctx->completion_lock)
{
bool do_complete = false;
+ if (!poll->head)
+ return false;
spin_lock(&poll->head->lock);
- WRITE_ONCE(poll->canceled, true);
+ if (do_cancel)
+ WRITE_ONCE(poll->canceled, true);
if (!list_empty(&poll->wait.entry)) {
list_del_init(&poll->wait.entry);
do_complete = true;
@@ -5249,28 +5206,29 @@ static bool __io_poll_remove_one(struct io_kiocb *req,
return do_complete;
}
-static bool io_poll_remove_one(struct io_kiocb *req)
+static bool io_poll_remove_waitqs(struct io_kiocb *req)
+ __must_hold(&req->ctx->completion_lock)
{
bool do_complete;
io_poll_remove_double(req);
+ do_complete = __io_poll_remove_one(req, io_poll_get_single(req), true);
- if (req->opcode == IORING_OP_POLL_ADD) {
- do_complete = __io_poll_remove_one(req, &req->poll);
- } else {
- struct async_poll *apoll = req->apoll;
-
+ if (req->opcode != IORING_OP_POLL_ADD && do_complete) {
/* non-poll requests have submit ref still */
- do_complete = __io_poll_remove_one(req, &apoll->poll);
- if (do_complete) {
- io_put_req(req);
- kfree(apoll->double_poll);
- kfree(apoll);
- }
+ req_ref_put(req);
}
+ return do_complete;
+}
+static bool io_poll_remove_one(struct io_kiocb *req)
+ __must_hold(&req->ctx->completion_lock)
+{
+ bool do_complete;
+
+ do_complete = io_poll_remove_waitqs(req);
if (do_complete) {
- io_cqring_fill_event(req, -ECANCELED);
+ io_cqring_fill_event(req->ctx, req->user_data, -ECANCELED, 0);
io_commit_cqring(req->ctx);
req_set_fail_links(req);
io_put_req_deferred(req, 1);
@@ -5307,7 +5265,9 @@ static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
return posted != 0;
}
-static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
+static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr,
+ bool poll_only)
+ __must_hold(&ctx->completion_lock)
{
struct hlist_head *list;
struct io_kiocb *req;
@@ -5316,43 +5276,72 @@ static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
hlist_for_each_entry(req, list, hash_node) {
if (sqe_addr != req->user_data)
continue;
- if (io_poll_remove_one(req))
- return 0;
- return -EALREADY;
+ if (poll_only && req->opcode != IORING_OP_POLL_ADD)
+ continue;
+ return req;
}
+ return NULL;
+}
+
+static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr,
+ bool poll_only)
+ __must_hold(&ctx->completion_lock)
+{
+ struct io_kiocb *req;
- return -ENOENT;
+ req = io_poll_find(ctx, sqe_addr, poll_only);
+ if (!req)
+ return -ENOENT;
+ if (io_poll_remove_one(req))
+ return 0;
+
+ return -EALREADY;
}
-static int io_poll_remove_prep(struct io_kiocb *req,
+static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
+ unsigned int flags)
+{
+ u32 events;
+
+ events = READ_ONCE(sqe->poll32_events);
+#ifdef __BIG_ENDIAN
+ events = swahw32(events);
+#endif
+ if (!(flags & IORING_POLL_ADD_MULTI))
+ events |= EPOLLONESHOT;
+ return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT));
+}
+
+static int io_poll_update_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
+ struct io_poll_update *upd = &req->poll_update;
+ u32 flags;
+
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
- sqe->poll_events)
+ if (sqe->ioprio || sqe->buf_index)
+ return -EINVAL;
+ flags = READ_ONCE(sqe->len);
+ if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA |
+ IORING_POLL_ADD_MULTI))
+ return -EINVAL;
+ /* meaningless without update */
+ if (flags == IORING_POLL_ADD_MULTI)
return -EINVAL;
- req->poll_remove.addr = READ_ONCE(sqe->addr);
- return 0;
-}
-
-/*
- * Find a running poll command that matches one specified in sqe->addr,
- * and remove it if found.
- */
-static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_ring_ctx *ctx = req->ctx;
- int ret;
+ upd->old_user_data = READ_ONCE(sqe->addr);
+ upd->update_events = flags & IORING_POLL_UPDATE_EVENTS;
+ upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA;
- spin_lock_irq(&ctx->completion_lock);
- ret = io_poll_cancel(ctx, req->poll_remove.addr);
- spin_unlock_irq(&ctx->completion_lock);
+ upd->new_user_data = READ_ONCE(sqe->off);
+ if (!upd->update_user_data && upd->new_user_data)
+ return -EINVAL;
+ if (upd->update_events)
+ upd->events = io_poll_parse_events(sqe, flags);
+ else if (sqe->poll32_events)
+ return -EINVAL;
- if (ret < 0)
- req_set_fail_links(req);
- io_req_complete(req, ret);
return 0;
}
@@ -5376,19 +5365,17 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_poll_iocb *poll = &req->poll;
- u32 events;
+ u32 flags;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
+ if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr)
+ return -EINVAL;
+ flags = READ_ONCE(sqe->len);
+ if (flags & ~IORING_POLL_ADD_MULTI)
return -EINVAL;
- events = READ_ONCE(sqe->poll32_events);
-#ifdef __BIG_ENDIAN
- events = swahw32(events);
-#endif
- poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP |
- (events & EPOLLEXCLUSIVE);
+ poll->events = io_poll_parse_events(sqe, flags);
return 0;
}
@@ -5406,17 +5393,80 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
if (mask) { /* no async, we'd stolen it */
ipt.error = 0;
- io_poll_complete(req, mask, 0);
+ io_poll_complete(req, mask);
}
spin_unlock_irq(&ctx->completion_lock);
if (mask) {
io_cqring_ev_posted(ctx);
- io_put_req(req);
+ if (poll->events & EPOLLONESHOT)
+ io_put_req(req);
}
return ipt.error;
}
+static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_kiocb *preq;
+ bool completing;
+ int ret;
+
+ spin_lock_irq(&ctx->completion_lock);
+ preq = io_poll_find(ctx, req->poll_update.old_user_data, true);
+ if (!preq) {
+ ret = -ENOENT;
+ goto err;
+ }
+
+ if (!req->poll_update.update_events && !req->poll_update.update_user_data) {
+ completing = true;
+ ret = io_poll_remove_one(preq) ? 0 : -EALREADY;
+ goto err;
+ }
+
+ /*
+ * Don't allow racy completion with singleshot, as we cannot safely
+ * update those. For multishot, if we're racing with completion, just
+ * let completion re-add it.
+ */
+ completing = !__io_poll_remove_one(preq, &preq->poll, false);
+ if (completing && (preq->poll.events & EPOLLONESHOT)) {
+ ret = -EALREADY;
+ goto err;
+ }
+ /* we now have a detached poll request. reissue. */
+ ret = 0;
+err:
+ if (ret < 0) {
+ spin_unlock_irq(&ctx->completion_lock);
+ req_set_fail_links(req);
+ io_req_complete(req, ret);
+ return 0;
+ }
+ /* only mask one event flags, keep behavior flags */
+ if (req->poll_update.update_events) {
+ preq->poll.events &= ~0xffff;
+ preq->poll.events |= req->poll_update.events & 0xffff;
+ preq->poll.events |= IO_POLL_UNMASK;
+ }
+ if (req->poll_update.update_user_data)
+ preq->user_data = req->poll_update.new_user_data;
+ spin_unlock_irq(&ctx->completion_lock);
+
+ /* complete update request, we're done with it */
+ io_req_complete(req, ret);
+
+ if (!completing) {
+ ret = io_poll_add(preq, issue_flags);
+ if (ret < 0) {
+ req_set_fail_links(preq);
+ io_req_complete(preq, ret);
+ }
+ }
+ return 0;
+}
+
static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
{
struct io_timeout_data *data = container_of(timer,
@@ -5430,7 +5480,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
atomic_set(&req->ctx->cq_timeouts,
atomic_read(&req->ctx->cq_timeouts) + 1);
- io_cqring_fill_event(req, -ETIME);
+ io_cqring_fill_event(ctx, req->user_data, -ETIME, 0);
io_commit_cqring(ctx);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
@@ -5442,30 +5492,29 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
__u64 user_data)
+ __must_hold(&ctx->completion_lock)
{
struct io_timeout_data *io;
struct io_kiocb *req;
- int ret = -ENOENT;
+ bool found = false;
list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
- if (user_data == req->user_data) {
- ret = 0;
+ found = user_data == req->user_data;
+ if (found)
break;
- }
}
-
- if (ret == -ENOENT)
- return ERR_PTR(ret);
+ if (!found)
+ return ERR_PTR(-ENOENT);
io = req->async_data;
- ret = hrtimer_try_to_cancel(&io->timer);
- if (ret == -1)
+ if (hrtimer_try_to_cancel(&io->timer) == -1)
return ERR_PTR(-EALREADY);
list_del_init(&req->timeout.list);
return req;
}
static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
+ __must_hold(&ctx->completion_lock)
{
struct io_kiocb *req = io_timeout_extract(ctx, user_data);
@@ -5473,13 +5522,14 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
return PTR_ERR(req);
req_set_fail_links(req);
- io_cqring_fill_event(req, -ECANCELED);
+ io_cqring_fill_event(ctx, req->user_data, -ECANCELED, 0);
io_put_req_deferred(req, 1);
return 0;
}
static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
struct timespec64 *ts, enum hrtimer_mode mode)
+ __must_hold(&ctx->completion_lock)
{
struct io_kiocb *req = io_timeout_extract(ctx, user_data);
struct io_timeout_data *data;
@@ -5545,7 +5595,7 @@ static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
ret = io_timeout_update(ctx, tr->addr, &tr->ts,
io_translate_timeout_mode(tr->flags));
- io_cqring_fill_event(req, ret);
+ io_cqring_fill_event(ctx, req->user_data, ret, 0);
io_commit_cqring(ctx);
spin_unlock_irq(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
@@ -5687,27 +5737,23 @@ static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
int ret;
ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
- if (ret != -ENOENT) {
- spin_lock_irqsave(&ctx->completion_lock, flags);
- goto done;
- }
-
spin_lock_irqsave(&ctx->completion_lock, flags);
+ if (ret != -ENOENT)
+ goto done;
ret = io_timeout_cancel(ctx, sqe_addr);
if (ret != -ENOENT)
goto done;
- ret = io_poll_cancel(ctx, sqe_addr);
+ ret = io_poll_cancel(ctx, sqe_addr, false);
done:
if (!ret)
ret = success_ret;
- io_cqring_fill_event(req, ret);
+ io_cqring_fill_event(ctx, req->user_data, ret, 0);
io_commit_cqring(ctx);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
io_cqring_ev_posted(ctx);
if (ret < 0)
req_set_fail_links(req);
- io_put_req(req);
}
static int io_async_cancel_prep(struct io_kiocb *req,
@@ -5739,7 +5785,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
ret = io_timeout_cancel(ctx, sqe_addr);
if (ret != -ENOENT)
goto done;
- ret = io_poll_cancel(ctx, sqe_addr);
+ ret = io_poll_cancel(ctx, sqe_addr, false);
if (ret != -ENOENT)
goto done;
spin_unlock_irq(&ctx->completion_lock);
@@ -5750,8 +5796,6 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
struct io_uring_task *tctx = node->task->io_uring;
- if (!tctx || !tctx->io_wq)
- continue;
ret = io_async_cancel_one(tctx, req->cancel.addr, ctx);
if (ret != -ENOENT)
break;
@@ -5760,7 +5804,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
spin_lock_irq(&ctx->completion_lock);
done:
- io_cqring_fill_event(req, ret);
+ io_cqring_fill_event(ctx, req->user_data, ret, 0);
io_commit_cqring(ctx);
spin_unlock_irq(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
@@ -5792,7 +5836,7 @@ static int io_rsrc_update_prep(struct io_kiocb *req,
static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
- struct io_uring_rsrc_update up;
+ struct io_uring_rsrc_update2 up;
int ret;
if (issue_flags & IO_URING_F_NONBLOCK)
@@ -5800,9 +5844,13 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
up.offset = req->rsrc_update.offset;
up.data = req->rsrc_update.arg;
+ up.nr = 0;
+ up.tags = 0;
+ up.resv = 0;
mutex_lock(&ctx->uring_lock);
- ret = __io_sqe_files_update(ctx, &up, req->rsrc_update.nr_args);
+ ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
+ &up, req->rsrc_update.nr_args);
mutex_unlock(&ctx->uring_lock);
if (ret < 0)
@@ -5827,7 +5875,7 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
case IORING_OP_POLL_ADD:
return io_poll_add_prep(req, sqe);
case IORING_OP_POLL_REMOVE:
- return io_poll_remove_prep(req, sqe);
+ return io_poll_update_prep(req, sqe);
case IORING_OP_FSYNC:
return io_fsync_prep(req, sqe);
case IORING_OP_SYNC_FILE_RANGE:
@@ -5886,42 +5934,33 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
req->opcode);
- return-EINVAL;
+ return -EINVAL;
}
static int io_req_prep_async(struct io_kiocb *req)
{
+ if (!io_op_defs[req->opcode].needs_async_setup)
+ return 0;
+ if (WARN_ON_ONCE(req->async_data))
+ return -EFAULT;
+ if (io_alloc_async_data(req))
+ return -EAGAIN;
+
switch (req->opcode) {
case IORING_OP_READV:
- case IORING_OP_READ_FIXED:
- case IORING_OP_READ:
return io_rw_prep_async(req, READ);
case IORING_OP_WRITEV:
- case IORING_OP_WRITE_FIXED:
- case IORING_OP_WRITE:
return io_rw_prep_async(req, WRITE);
case IORING_OP_SENDMSG:
- case IORING_OP_SEND:
return io_sendmsg_prep_async(req);
case IORING_OP_RECVMSG:
- case IORING_OP_RECV:
return io_recvmsg_prep_async(req);
case IORING_OP_CONNECT:
return io_connect_prep_async(req);
}
- return 0;
-}
-
-static int io_req_defer_prep(struct io_kiocb *req)
-{
- if (!io_op_defs[req->opcode].needs_async_data)
- return 0;
- /* some opcodes init it during the inital prep */
- if (req->async_data)
- return 0;
- if (__io_alloc_async_data(req))
- return -EAGAIN;
- return io_req_prep_async(req);
+ printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n",
+ req->opcode);
+ return -EFAULT;
}
static u32 io_get_sequence(struct io_kiocb *req)
@@ -5954,7 +5993,7 @@ static int io_req_defer(struct io_kiocb *req)
if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list))
return 0;
- ret = io_req_defer_prep(req);
+ ret = io_req_prep_async(req);
if (ret)
return ret;
io_prep_async_link(req);
@@ -5978,7 +6017,7 @@ static int io_req_defer(struct io_kiocb *req)
return -EIOCBQUEUED;
}
-static void __io_clean_op(struct io_kiocb *req)
+static void io_clean_op(struct io_kiocb *req)
{
if (req->flags & REQ_F_BUFFER_SELECTED) {
switch (req->opcode) {
@@ -6017,8 +6056,8 @@ static void __io_clean_op(struct io_kiocb *req)
}
case IORING_OP_SPLICE:
case IORING_OP_TEE:
- io_put_file(req, req->splice.file_in,
- (req->splice.flags & SPLICE_F_FD_IN_FIXED));
+ if (!(req->splice.flags & SPLICE_F_FD_IN_FIXED))
+ io_put_file(req->splice.file_in);
break;
case IORING_OP_OPENAT:
case IORING_OP_OPENAT2:
@@ -6035,6 +6074,17 @@ static void __io_clean_op(struct io_kiocb *req)
}
req->flags &= ~REQ_F_NEED_CLEANUP;
}
+ if ((req->flags & REQ_F_POLLED) && req->apoll) {
+ kfree(req->apoll->double_poll);
+ kfree(req->apoll);
+ req->apoll = NULL;
+ }
+ if (req->flags & REQ_F_INFLIGHT) {
+ struct io_uring_task *tctx = req->task->io_uring;
+
+ atomic_dec(&tctx->inflight_tracked);
+ req->flags &= ~REQ_F_INFLIGHT;
+ }
}
static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
@@ -6067,7 +6117,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
ret = io_poll_add(req, issue_flags);
break;
case IORING_OP_POLL_REMOVE:
- ret = io_poll_remove(req, issue_flags);
+ ret = io_poll_update(req, issue_flags);
break;
case IORING_OP_SYNC_FILE_RANGE:
ret = io_sync_file_range(req, issue_flags);
@@ -6205,18 +6255,48 @@ static void io_wq_submit_work(struct io_wq_work *work)
/* avoid locking problems by failing it from a clean context */
if (ret) {
/* io-wq is going to take one down */
- refcount_inc(&req->refs);
+ req_ref_get(req);
io_req_task_queue_fail(req, ret);
}
}
+#define FFS_ASYNC_READ 0x1UL
+#define FFS_ASYNC_WRITE 0x2UL
+#ifdef CONFIG_64BIT
+#define FFS_ISREG 0x4UL
+#else
+#define FFS_ISREG 0x0UL
+#endif
+#define FFS_MASK ~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG)
+
+static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table,
+ unsigned i)
+{
+ struct io_fixed_file *table_l2;
+
+ table_l2 = table->files[i >> IORING_FILE_TABLE_SHIFT];
+ return &table_l2[i & IORING_FILE_TABLE_MASK];
+}
+
static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
int index)
{
- struct fixed_rsrc_table *table;
+ struct io_fixed_file *slot = io_fixed_file_slot(&ctx->file_table, index);
- table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT];
- return table->files[index & IORING_FILE_TABLE_MASK];
+ return (struct file *) (slot->file_ptr & FFS_MASK);
+}
+
+static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file)
+{
+ unsigned long file_ptr = (unsigned long) file;
+
+ if (__io_file_supports_async(file, READ))
+ file_ptr |= FFS_ASYNC_READ;
+ if (__io_file_supports_async(file, WRITE))
+ file_ptr |= FFS_ASYNC_WRITE;
+ if (S_ISREG(file_inode(file)->i_mode))
+ file_ptr |= FFS_ISREG;
+ file_slot->file_ptr = file_ptr;
}
static struct file *io_file_get(struct io_submit_state *state,
@@ -6226,18 +6306,26 @@ static struct file *io_file_get(struct io_submit_state *state,
struct file *file;
if (fixed) {
+ unsigned long file_ptr;
+
if (unlikely((unsigned int)fd >= ctx->nr_user_files))
return NULL;
fd = array_index_nospec(fd, ctx->nr_user_files);
- file = io_file_from_index(ctx, fd);
- io_set_resource_node(req);
+ file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
+ file = (struct file *) (file_ptr & FFS_MASK);
+ file_ptr &= ~FFS_MASK;
+ /* mask in overlapping REQ_F and FFS bits */
+ req->flags |= (file_ptr << REQ_F_ASYNC_READ_BIT);
+ io_req_set_rsrc_node(req);
} else {
trace_io_uring_file_get(ctx, fd);
file = __io_file_get(state, fd);
+
+ /* we don't allow fixed io_uring files */
+ if (file && unlikely(file->f_op == &io_uring_fops))
+ io_req_track_inflight(req);
}
- if (file && unlikely(file->f_op == &io_uring_fops))
- io_req_track_inflight(req);
return file;
}
@@ -6257,7 +6345,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
* We don't expect the list to be empty, that will only happen if we
* race with the completion of the linked work.
*/
- if (prev && refcount_inc_not_zero(&prev->refs))
+ if (prev && req_ref_inc_not_zero(prev))
io_remove_next_linked(prev);
else
prev = NULL;
@@ -6268,13 +6356,16 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
io_put_req_deferred(prev, 1);
} else {
io_req_complete_post(req, -ETIME, 0);
- io_put_req_deferred(req, 1);
}
+ io_put_req_deferred(req, 1);
return HRTIMER_NORESTART;
}
-static void __io_queue_linked_timeout(struct io_kiocb *req)
+static void io_queue_linked_timeout(struct io_kiocb *req)
{
+ struct io_ring_ctx *ctx = req->ctx;
+
+ spin_lock_irq(&ctx->completion_lock);
/*
* If the back reference is NULL, then our linked request finished
* before we got a chance to setup the timer
@@ -6286,16 +6377,7 @@ static void __io_queue_linked_timeout(struct io_kiocb *req)
hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
data->mode);
}
-}
-
-static void io_queue_linked_timeout(struct io_kiocb *req)
-{
- struct io_ring_ctx *ctx = req->ctx;
-
- spin_lock_irq(&ctx->completion_lock);
- __io_queue_linked_timeout(req);
spin_unlock_irq(&ctx->completion_lock);
-
/* drop submission reference */
io_put_req(req);
}
@@ -6325,15 +6407,7 @@ static void __io_queue_sqe(struct io_kiocb *req)
* We async punt it if the file wasn't marked NOWAIT, or if the file
* doesn't support non-blocking read/write attempts
*/
- if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
- if (!io_arm_poll_handler(req)) {
- /*
- * Queued up for async execution, worker will release
- * submit reference when the iocb is actually submitted.
- */
- io_queue_async_work(req);
- }
- } else if (likely(!ret)) {
+ if (likely(!ret)) {
/* drop submission reference */
if (req->flags & REQ_F_COMPLETE_INLINE) {
struct io_ring_ctx *ctx = req->ctx;
@@ -6345,10 +6419,16 @@ static void __io_queue_sqe(struct io_kiocb *req)
} else {
io_put_req(req);
}
+ } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
+ if (!io_arm_poll_handler(req)) {
+ /*
+ * Queued up for async execution, worker will release
+ * submit reference when the iocb is actually submitted.
+ */
+ io_queue_async_work(req);
+ }
} else {
- req_set_fail_links(req);
- io_put_req(req);
- io_req_complete(req, ret);
+ io_req_complete_failed(req, ret);
}
if (linked_timeout)
io_queue_linked_timeout(linked_timeout);
@@ -6362,12 +6442,10 @@ static void io_queue_sqe(struct io_kiocb *req)
if (ret) {
if (ret != -EIOCBQUEUED) {
fail_req:
- req_set_fail_links(req);
- io_put_req(req);
- io_req_complete(req, ret);
+ io_req_complete_failed(req, ret);
}
} else if (req->flags & REQ_F_FORCE_ASYNC) {
- ret = io_req_defer_prep(req);
+ ret = io_req_prep_async(req);
if (unlikely(ret))
goto fail_req;
io_queue_async_work(req);
@@ -6419,12 +6497,10 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
req->link = NULL;
req->fixed_rsrc_refs = NULL;
/* one is dropped after submission, the other at completion */
- refcount_set(&req->refs, 2);
+ atomic_set(&req->refs, 2);
req->task = current;
req->result = 0;
- req->work.list.next = NULL;
req->work.creds = NULL;
- req->work.flags = 0;
/* enforce forwards compatibility on users */
if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) {
@@ -6485,12 +6561,10 @@ fail_req:
if (link->head) {
/* fail even hard links since we don't submit */
link->head->flags |= REQ_F_FAIL_LINK;
- io_put_req(link->head);
- io_req_complete(link->head, -ECANCELED);
+ io_req_complete_failed(link->head, -ECANCELED);
link->head = NULL;
}
- io_put_req(req);
- io_req_complete(req, ret);
+ io_req_complete_failed(req, ret);
return ret;
}
ret = io_req_prep(req, sqe);
@@ -6522,7 +6596,7 @@ fail_req:
head->flags |= REQ_F_IO_DRAIN;
ctx->drain_next = 1;
}
- ret = io_req_defer_prep(req);
+ ret = io_req_prep_async(req);
if (unlikely(ret))
goto fail_req;
trace_io_uring_link(ctx, req, head);
@@ -6624,12 +6698,6 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
{
int submitted = 0;
- /* if we have a backlog and couldn't flush it all, return BUSY */
- if (test_bit(0, &ctx->sq_check_overflow)) {
- if (!__io_cqring_overflow_flush(ctx, false, NULL, NULL))
- return -EBUSY;
- }
-
/* make sure SQ entry isn't read before tail */
nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
@@ -6710,6 +6778,10 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
if (!list_empty(&ctx->iopoll_list))
io_do_iopoll(ctx, &nr_events, 0);
+ /*
+ * Don't submit if refs are dying, good for io_uring_register(),
+ * but also it is relied upon by io_ring_exit_work()
+ */
if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) &&
!(ctx->flags & IORING_SETUP_R_DISABLED))
ret = io_submit_sqes(ctx, to_submit);
@@ -6727,11 +6799,8 @@ static void io_sqd_update_thread_idle(struct io_sq_data *sqd)
struct io_ring_ctx *ctx;
unsigned sq_thread_idle = 0;
- list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
- if (sq_thread_idle < ctx->sq_thread_idle)
- sq_thread_idle = ctx->sq_thread_idle;
- }
-
+ list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
+ sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle);
sqd->sq_thread_idle = sq_thread_idle;
}
@@ -6745,7 +6814,6 @@ static int io_sq_thread(void *data)
snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid);
set_task_comm(current, buf);
- current->pf_io_worker = NULL;
if (sqd->sq_cpu != -1)
set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
@@ -6802,27 +6870,29 @@ static int io_sq_thread(void *data)
continue;
}
- needs_sched = true;
prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
- list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
- if ((ctx->flags & IORING_SETUP_IOPOLL) &&
- !list_empty_careful(&ctx->iopoll_list)) {
- needs_sched = false;
- break;
- }
- if (io_sqring_entries(ctx)) {
- needs_sched = false;
- break;
- }
- }
-
- if (needs_sched && !test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)) {
+ if (!test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)) {
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
io_ring_set_wakeup_flag(ctx);
- mutex_unlock(&sqd->lock);
- schedule();
- mutex_lock(&sqd->lock);
+ needs_sched = true;
+ list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
+ if ((ctx->flags & IORING_SETUP_IOPOLL) &&
+ !list_empty_careful(&ctx->iopoll_list)) {
+ needs_sched = false;
+ break;
+ }
+ if (io_sqring_entries(ctx)) {
+ needs_sched = false;
+ break;
+ }
+ }
+
+ if (needs_sched) {
+ mutex_unlock(&sqd->lock);
+ schedule();
+ mutex_lock(&sqd->lock);
+ }
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
io_ring_clear_wakeup_flag(ctx);
}
@@ -6832,15 +6902,14 @@ static int io_sq_thread(void *data)
timeout = jiffies + sqd->sq_thread_idle;
}
- list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
- io_uring_cancel_sqpoll(ctx);
+ io_uring_cancel_sqpoll(sqd);
sqd->thread = NULL;
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
io_ring_set_wakeup_flag(ctx);
- mutex_unlock(&sqd->lock);
-
io_run_task_work();
io_run_task_work_head(&sqd->park_task_work);
+ mutex_unlock(&sqd->lock);
+
complete(&sqd->exited);
do_exit(0);
}
@@ -6932,7 +7001,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
int ret;
do {
- io_cqring_overflow_flush(ctx, false, NULL, NULL);
+ io_cqring_overflow_flush(ctx, false);
if (io_cqring_events(ctx) >= min_events)
return 0;
if (!io_run_task_work())
@@ -6964,7 +7033,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
trace_io_uring_cqring_wait(ctx, min_events);
do {
/* if we can't even flush overflow, don't wait for more */
- if (!io_cqring_overflow_flush(ctx, false, NULL, NULL)) {
+ if (!io_cqring_overflow_flush(ctx, false)) {
ret = -EBUSY;
break;
}
@@ -6980,35 +7049,14 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
}
-static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
+static void io_free_file_tables(struct io_file_table *table, unsigned nr_files)
{
-#if defined(CONFIG_UNIX)
- if (ctx->ring_sock) {
- struct sock *sock = ctx->ring_sock->sk;
- struct sk_buff *skb;
-
- while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
- kfree_skb(skb);
- }
-#else
- int i;
+ unsigned i, nr_tables = DIV_ROUND_UP(nr_files, IORING_MAX_FILES_TABLE);
- for (i = 0; i < ctx->nr_user_files; i++) {
- struct file *file;
-
- file = io_file_from_index(ctx, i);
- if (file)
- fput(file);
- }
-#endif
-}
-
-static void io_rsrc_data_ref_zero(struct percpu_ref *ref)
-{
- struct fixed_rsrc_data *data;
-
- data = container_of(ref, struct fixed_rsrc_data, refs);
- complete(&data->done);
+ for (i = 0; i < nr_tables; i++)
+ kfree(table->files[i]);
+ kfree(table->files);
+ table->files = NULL;
}
static inline void io_rsrc_ref_lock(struct io_ring_ctx *ctx)
@@ -7021,122 +7069,148 @@ static inline void io_rsrc_ref_unlock(struct io_ring_ctx *ctx)
spin_unlock_bh(&ctx->rsrc_ref_lock);
}
-static void io_sqe_rsrc_set_node(struct io_ring_ctx *ctx,
- struct fixed_rsrc_data *rsrc_data,
- struct fixed_rsrc_ref_node *ref_node)
+static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
{
- io_rsrc_ref_lock(ctx);
- rsrc_data->node = ref_node;
- list_add_tail(&ref_node->node, &ctx->rsrc_ref_list);
- io_rsrc_ref_unlock(ctx);
- percpu_ref_get(&rsrc_data->refs);
+ percpu_ref_exit(&ref_node->refs);
+ kfree(ref_node);
}
-static void io_sqe_rsrc_kill_node(struct io_ring_ctx *ctx, struct fixed_rsrc_data *data)
+static void io_rsrc_node_switch(struct io_ring_ctx *ctx,
+ struct io_rsrc_data *data_to_kill)
{
- struct fixed_rsrc_ref_node *ref_node = NULL;
+ WARN_ON_ONCE(!ctx->rsrc_backup_node);
+ WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);
- io_rsrc_ref_lock(ctx);
- ref_node = data->node;
- data->node = NULL;
- io_rsrc_ref_unlock(ctx);
- if (ref_node)
- percpu_ref_kill(&ref_node->refs);
+ if (data_to_kill) {
+ struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
+
+ rsrc_node->rsrc_data = data_to_kill;
+ io_rsrc_ref_lock(ctx);
+ list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
+ io_rsrc_ref_unlock(ctx);
+
+ atomic_inc(&data_to_kill->refs);
+ percpu_ref_kill(&rsrc_node->refs);
+ ctx->rsrc_node = NULL;
+ }
+
+ if (!ctx->rsrc_node) {
+ ctx->rsrc_node = ctx->rsrc_backup_node;
+ ctx->rsrc_backup_node = NULL;
+ }
+}
+
+static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
+{
+ if (ctx->rsrc_backup_node)
+ return 0;
+ ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx);
+ return ctx->rsrc_backup_node ? 0 : -ENOMEM;
}
-static int io_rsrc_ref_quiesce(struct fixed_rsrc_data *data,
- struct io_ring_ctx *ctx,
- void (*rsrc_put)(struct io_ring_ctx *ctx,
- struct io_rsrc_put *prsrc))
+static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ctx)
{
- struct fixed_rsrc_ref_node *backup_node;
int ret;
+ /* As we may drop ->uring_lock, other task may have started quiesce */
if (data->quiesce)
return -ENXIO;
data->quiesce = true;
do {
- ret = -ENOMEM;
- backup_node = alloc_fixed_rsrc_ref_node(ctx);
- if (!backup_node)
+ ret = io_rsrc_node_switch_start(ctx);
+ if (ret)
break;
- backup_node->rsrc_data = data;
- backup_node->rsrc_put = rsrc_put;
+ io_rsrc_node_switch(ctx, data);
- io_sqe_rsrc_kill_node(ctx, data);
- percpu_ref_kill(&data->refs);
+ /* kill initial ref, already quiesced if zero */
+ if (atomic_dec_and_test(&data->refs))
+ break;
flush_delayed_work(&ctx->rsrc_put_work);
-
ret = wait_for_completion_interruptible(&data->done);
if (!ret)
break;
- percpu_ref_resurrect(&data->refs);
- io_sqe_rsrc_set_node(ctx, data, backup_node);
- backup_node = NULL;
+ atomic_inc(&data->refs);
+ /* wait for all works potentially completing data->done */
+ flush_delayed_work(&ctx->rsrc_put_work);
reinit_completion(&data->done);
+
mutex_unlock(&ctx->uring_lock);
ret = io_run_task_work_sig();
mutex_lock(&ctx->uring_lock);
} while (ret >= 0);
data->quiesce = false;
- if (backup_node)
- destroy_fixed_rsrc_ref_node(backup_node);
return ret;
}
-static struct fixed_rsrc_data *alloc_fixed_rsrc_data(struct io_ring_ctx *ctx)
+static void io_rsrc_data_free(struct io_rsrc_data *data)
{
- struct fixed_rsrc_data *data;
+ kvfree(data->tags);
+ kfree(data);
+}
+
+static struct io_rsrc_data *io_rsrc_data_alloc(struct io_ring_ctx *ctx,
+ rsrc_put_fn *do_put,
+ unsigned nr)
+{
+ struct io_rsrc_data *data;
data = kzalloc(sizeof(*data), GFP_KERNEL);
if (!data)
return NULL;
- if (percpu_ref_init(&data->refs, io_rsrc_data_ref_zero,
- PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
+ data->tags = kvcalloc(nr, sizeof(*data->tags), GFP_KERNEL);
+ if (!data->tags) {
kfree(data);
return NULL;
}
+
+ atomic_set(&data->refs, 1);
data->ctx = ctx;
+ data->do_put = do_put;
init_completion(&data->done);
return data;
}
-static void free_fixed_rsrc_data(struct fixed_rsrc_data *data)
+static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
- percpu_ref_exit(&data->refs);
- kfree(data->table);
- kfree(data);
+#if defined(CONFIG_UNIX)
+ if (ctx->ring_sock) {
+ struct sock *sock = ctx->ring_sock->sk;
+ struct sk_buff *skb;
+
+ while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
+ kfree_skb(skb);
+ }
+#else
+ int i;
+
+ for (i = 0; i < ctx->nr_user_files; i++) {
+ struct file *file;
+
+ file = io_file_from_index(ctx, i);
+ if (file)
+ fput(file);
+ }
+#endif
+ io_free_file_tables(&ctx->file_table, ctx->nr_user_files);
+ io_rsrc_data_free(ctx->file_data);
+ ctx->file_data = NULL;
+ ctx->nr_user_files = 0;
}
static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
- struct fixed_rsrc_data *data = ctx->file_data;
- unsigned nr_tables, i;
int ret;
- /*
- * percpu_ref_is_dying() is to stop parallel files unregister
- * Since we possibly drop uring lock later in this function to
- * run task work.
- */
- if (!data || percpu_ref_is_dying(&data->refs))
+ if (!ctx->file_data)
return -ENXIO;
- ret = io_rsrc_ref_quiesce(data, ctx, io_ring_file_put);
- if (ret)
- return ret;
-
- __io_sqe_files_unregister(ctx);
- nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
- for (i = 0; i < nr_tables; i++)
- kfree(data->table[i].files);
- free_fixed_rsrc_data(data);
- ctx->file_data = NULL;
- ctx->nr_user_files = 0;
- return 0;
+ ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
+ if (!ret)
+ __io_sqe_files_unregister(ctx);
+ return ret;
}
static void io_sq_thread_unpark(struct io_sq_data *sqd)
@@ -7169,9 +7243,10 @@ static void io_sq_thread_park(struct io_sq_data *sqd)
static void io_sq_thread_stop(struct io_sq_data *sqd)
{
WARN_ON_ONCE(sqd->thread == current);
+ WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state));
- mutex_lock(&sqd->lock);
set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
+ mutex_lock(&sqd->lock);
if (sqd->thread)
wake_up_process(sqd->thread);
mutex_unlock(&sqd->lock);
@@ -7200,8 +7275,6 @@ static void io_sq_thread_finish(struct io_ring_ctx *ctx)
io_put_sq_data(sqd);
ctx->sq_data = NULL;
- if (ctx->sq_creds)
- put_cred(ctx->sq_creds);
}
}
@@ -7362,34 +7435,32 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx)
}
#endif
-static int io_sqe_alloc_file_tables(struct fixed_rsrc_data *file_data,
- unsigned nr_tables, unsigned nr_files)
+static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
{
- int i;
+ unsigned i, nr_tables = DIV_ROUND_UP(nr_files, IORING_MAX_FILES_TABLE);
+
+ table->files = kcalloc(nr_tables, sizeof(*table->files), GFP_KERNEL);
+ if (!table->files)
+ return false;
for (i = 0; i < nr_tables; i++) {
- struct fixed_rsrc_table *table = &file_data->table[i];
- unsigned this_files;
+ unsigned int this_files = min(nr_files, IORING_MAX_FILES_TABLE);
- this_files = min(nr_files, IORING_MAX_FILES_TABLE);
- table->files = kcalloc(this_files, sizeof(struct file *),
+ table->files[i] = kcalloc(this_files, sizeof(*table->files[i]),
GFP_KERNEL);
- if (!table->files)
+ if (!table->files[i])
break;
nr_files -= this_files;
}
if (i == nr_tables)
- return 0;
+ return true;
- for (i = 0; i < nr_tables; i++) {
- struct fixed_rsrc_table *table = &file_data->table[i];
- kfree(table->files);
- }
- return 1;
+ io_free_file_tables(table, nr_tables * IORING_MAX_FILES_TABLE);
+ return false;
}
-static void io_ring_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
+static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
struct file *file = prsrc->file;
#if defined(CONFIG_UNIX)
@@ -7452,21 +7523,35 @@ static void io_ring_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
#endif
}
-static void __io_rsrc_put_work(struct fixed_rsrc_ref_node *ref_node)
+static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
{
- struct fixed_rsrc_data *rsrc_data = ref_node->rsrc_data;
+ struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
struct io_ring_ctx *ctx = rsrc_data->ctx;
struct io_rsrc_put *prsrc, *tmp;
list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
list_del(&prsrc->list);
- ref_node->rsrc_put(ctx, prsrc);
+
+ if (prsrc->tag) {
+ bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL;
+ unsigned long flags;
+
+ io_ring_submit_lock(ctx, lock_ring);
+ spin_lock_irqsave(&ctx->completion_lock, flags);
+ io_cqring_fill_event(ctx, prsrc->tag, 0, 0);
+ io_commit_cqring(ctx);
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ io_cqring_ev_posted(ctx);
+ io_ring_submit_unlock(ctx, lock_ring);
+ }
+
+ rsrc_data->do_put(ctx, prsrc);
kfree(prsrc);
}
- percpu_ref_exit(&ref_node->refs);
- kfree(ref_node);
- percpu_ref_put(&rsrc_data->refs);
+ io_rsrc_node_destroy(ref_node);
+ if (atomic_dec_and_test(&rsrc_data->refs))
+ complete(&rsrc_data->done);
}
static void io_rsrc_put_work(struct work_struct *work)
@@ -7478,63 +7563,42 @@ static void io_rsrc_put_work(struct work_struct *work)
node = llist_del_all(&ctx->rsrc_put_llist);
while (node) {
- struct fixed_rsrc_ref_node *ref_node;
+ struct io_rsrc_node *ref_node;
struct llist_node *next = node->next;
- ref_node = llist_entry(node, struct fixed_rsrc_ref_node, llist);
+ ref_node = llist_entry(node, struct io_rsrc_node, llist);
__io_rsrc_put_work(ref_node);
node = next;
}
}
-static struct file **io_fixed_file_slot(struct fixed_rsrc_data *file_data,
- unsigned i)
-{
- struct fixed_rsrc_table *table;
-
- table = &file_data->table[i >> IORING_FILE_TABLE_SHIFT];
- return &table->files[i & IORING_FILE_TABLE_MASK];
-}
-
static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
{
- struct fixed_rsrc_ref_node *ref_node;
- struct fixed_rsrc_data *data;
- struct io_ring_ctx *ctx;
+ struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
+ struct io_ring_ctx *ctx = node->rsrc_data->ctx;
bool first_add = false;
- int delay = HZ;
-
- ref_node = container_of(ref, struct fixed_rsrc_ref_node, refs);
- data = ref_node->rsrc_data;
- ctx = data->ctx;
io_rsrc_ref_lock(ctx);
- ref_node->done = true;
+ node->done = true;
while (!list_empty(&ctx->rsrc_ref_list)) {
- ref_node = list_first_entry(&ctx->rsrc_ref_list,
- struct fixed_rsrc_ref_node, node);
+ node = list_first_entry(&ctx->rsrc_ref_list,
+ struct io_rsrc_node, node);
/* recycle ref nodes in order */
- if (!ref_node->done)
+ if (!node->done)
break;
- list_del(&ref_node->node);
- first_add |= llist_add(&ref_node->llist, &ctx->rsrc_put_llist);
+ list_del(&node->node);
+ first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
}
io_rsrc_ref_unlock(ctx);
- if (percpu_ref_is_dying(&data->refs))
- delay = 0;
-
- if (!delay)
- mod_delayed_work(system_wq, &ctx->rsrc_put_work, 0);
- else if (first_add)
- queue_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
+ if (first_add)
+ mod_delayed_work(system_wq, &ctx->rsrc_put_work, HZ);
}
-static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node(
- struct io_ring_ctx *ctx)
+static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
{
- struct fixed_rsrc_ref_node *ref_node;
+ struct io_rsrc_node *ref_node;
ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
if (!ref_node)
@@ -7551,29 +7615,14 @@ static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node(
return ref_node;
}
-static void init_fixed_file_ref_node(struct io_ring_ctx *ctx,
- struct fixed_rsrc_ref_node *ref_node)
-{
- ref_node->rsrc_data = ctx->file_data;
- ref_node->rsrc_put = io_ring_file_put;
-}
-
-static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node)
-{
- percpu_ref_exit(&ref_node->refs);
- kfree(ref_node);
-}
-
-
static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
- unsigned nr_args)
+ unsigned nr_args, u64 __user *tags)
{
__s32 __user *fds = (__s32 __user *) arg;
- unsigned nr_tables, i;
struct file *file;
- int fd, ret = -ENOMEM;
- struct fixed_rsrc_ref_node *ref_node;
- struct fixed_rsrc_data *file_data;
+ int fd, ret;
+ unsigned i;
+ struct io_rsrc_data *file_data;
if (ctx->file_data)
return -EBUSY;
@@ -7581,33 +7630,37 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
return -EINVAL;
if (nr_args > IORING_MAX_FIXED_FILES)
return -EMFILE;
+ ret = io_rsrc_node_switch_start(ctx);
+ if (ret)
+ return ret;
- file_data = alloc_fixed_rsrc_data(ctx);
+ file_data = io_rsrc_data_alloc(ctx, io_rsrc_file_put, nr_args);
if (!file_data)
return -ENOMEM;
ctx->file_data = file_data;
-
- nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
- file_data->table = kcalloc(nr_tables, sizeof(*file_data->table),
- GFP_KERNEL);
- if (!file_data->table)
- goto out_free;
-
- if (io_sqe_alloc_file_tables(file_data, nr_tables, nr_args))
+ ret = -ENOMEM;
+ if (!io_alloc_file_tables(&ctx->file_table, nr_args))
goto out_free;
for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
- if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
+ u64 tag = 0;
+
+ if ((tags && copy_from_user(&tag, &tags[i], sizeof(tag))) ||
+ copy_from_user(&fd, &fds[i], sizeof(fd))) {
ret = -EFAULT;
goto out_fput;
}
/* allow sparse sets */
- if (fd == -1)
+ if (fd == -1) {
+ ret = -EINVAL;
+ if (unlikely(tag))
+ goto out_fput;
continue;
+ }
file = fget(fd);
ret = -EBADF;
- if (!file)
+ if (unlikely(!file))
goto out_fput;
/*
@@ -7621,23 +7674,17 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
fput(file);
goto out_fput;
}
- *io_fixed_file_slot(file_data, i) = file;
+ ctx->file_data->tags[i] = tag;
+ io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file);
}
ret = io_sqe_files_scm(ctx);
if (ret) {
- io_sqe_files_unregister(ctx);
+ __io_sqe_files_unregister(ctx);
return ret;
}
- ref_node = alloc_fixed_rsrc_ref_node(ctx);
- if (!ref_node) {
- io_sqe_files_unregister(ctx);
- return -ENOMEM;
- }
- init_fixed_file_ref_node(ctx, ref_node);
-
- io_sqe_rsrc_set_node(ctx, file_data, ref_node);
+ io_rsrc_node_switch(ctx, NULL);
return ret;
out_fput:
for (i = 0; i < ctx->nr_user_files; i++) {
@@ -7645,11 +7692,10 @@ out_fput:
if (file)
fput(file);
}
- for (i = 0; i < nr_tables; i++)
- kfree(file_data->table[i].files);
+ io_free_file_tables(&ctx->file_table, nr_args);
ctx->nr_user_files = 0;
out_free:
- free_fixed_rsrc_data(ctx->file_data);
+ io_rsrc_data_free(ctx->file_data);
ctx->file_data = NULL;
return ret;
}
@@ -7697,67 +7743,64 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
#endif
}
-static int io_queue_rsrc_removal(struct fixed_rsrc_data *data, void *rsrc)
+static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
+ struct io_rsrc_node *node, void *rsrc)
{
struct io_rsrc_put *prsrc;
- struct fixed_rsrc_ref_node *ref_node = data->node;
prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
if (!prsrc)
return -ENOMEM;
+ prsrc->tag = data->tags[idx];
prsrc->rsrc = rsrc;
- list_add(&prsrc->list, &ref_node->rsrc_list);
-
+ list_add(&prsrc->list, &node->rsrc_list);
return 0;
}
-static inline int io_queue_file_removal(struct fixed_rsrc_data *data,
- struct file *file)
-{
- return io_queue_rsrc_removal(data, (void *)file);
-}
-
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
- struct io_uring_rsrc_update *up,
+ struct io_uring_rsrc_update2 *up,
unsigned nr_args)
{
- struct fixed_rsrc_data *data = ctx->file_data;
- struct fixed_rsrc_ref_node *ref_node;
- struct file *file, **file_slot;
- __s32 __user *fds;
- int fd, i, err;
- __u32 done;
+ u64 __user *tags = u64_to_user_ptr(up->tags);
+ __s32 __user *fds = u64_to_user_ptr(up->data);
+ struct io_rsrc_data *data = ctx->file_data;
+ struct io_fixed_file *file_slot;
+ struct file *file;
+ int fd, i, err = 0;
+ unsigned int done;
bool needs_switch = false;
- if (check_add_overflow(up->offset, nr_args, &done))
- return -EOVERFLOW;
- if (done > ctx->nr_user_files)
+ if (!ctx->file_data)
+ return -ENXIO;
+ if (up->offset + nr_args > ctx->nr_user_files)
return -EINVAL;
- ref_node = alloc_fixed_rsrc_ref_node(ctx);
- if (!ref_node)
- return -ENOMEM;
- init_fixed_file_ref_node(ctx, ref_node);
-
- fds = u64_to_user_ptr(up->data);
for (done = 0; done < nr_args; done++) {
- err = 0;
- if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
+ u64 tag = 0;
+
+ if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
+ copy_from_user(&fd, &fds[done], sizeof(fd))) {
err = -EFAULT;
break;
}
+ if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
+ err = -EINVAL;
+ break;
+ }
if (fd == IORING_REGISTER_FILES_SKIP)
continue;
i = array_index_nospec(up->offset + done, ctx->nr_user_files);
- file_slot = io_fixed_file_slot(ctx->file_data, i);
+ file_slot = io_fixed_file_slot(&ctx->file_table, i);
- if (*file_slot) {
- err = io_queue_file_removal(data, *file_slot);
+ if (file_slot->file_ptr) {
+ file = (struct file *)(file_slot->file_ptr & FFS_MASK);
+ err = io_queue_rsrc_removal(data, up->offset + done,
+ ctx->rsrc_node, file);
if (err)
break;
- *file_slot = NULL;
+ file_slot->file_ptr = 0;
needs_switch = true;
}
if (fd != -1) {
@@ -7779,42 +7822,22 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
err = -EBADF;
break;
}
- *file_slot = file;
+ data->tags[up->offset + done] = tag;
+ io_fixed_file_set(file_slot, file);
err = io_sqe_file_register(ctx, file, i);
if (err) {
- *file_slot = NULL;
+ file_slot->file_ptr = 0;
fput(file);
break;
}
}
}
- if (needs_switch) {
- percpu_ref_kill(&data->node->refs);
- io_sqe_rsrc_set_node(ctx, data, ref_node);
- } else
- destroy_fixed_rsrc_ref_node(ref_node);
-
+ if (needs_switch)
+ io_rsrc_node_switch(ctx, data);
return done ? done : err;
}
-static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
- unsigned nr_args)
-{
- struct io_uring_rsrc_update up;
-
- if (!ctx->file_data)
- return -ENXIO;
- if (!nr_args)
- return -EINVAL;
- if (copy_from_user(&up, arg, sizeof(up)))
- return -EFAULT;
- if (up.resv)
- return -EINVAL;
-
- return __io_sqe_files_update(ctx, &up, nr_args);
-}
-
static struct io_wq_work *io_free_work(struct io_wq_work *work)
{
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
@@ -7823,7 +7846,8 @@ static struct io_wq_work *io_free_work(struct io_wq_work *work)
return req ? &req->work : NULL;
}
-static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx)
+static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
+ struct task_struct *task)
{
struct io_wq_hash *hash;
struct io_wq_data data;
@@ -7840,6 +7864,7 @@ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx)
}
data.hash = hash;
+ data.task = task;
data.free_work = io_free_work;
data.do_work = io_wq_submit_work;
@@ -7865,7 +7890,7 @@ static int io_uring_alloc_task_context(struct task_struct *task,
return ret;
}
- tctx->io_wq = io_init_wq_offload(ctx);
+ tctx->io_wq = io_init_wq_offload(ctx, task);
if (IS_ERR(tctx->io_wq)) {
ret = PTR_ERR(tctx->io_wq);
percpu_counter_destroy(&tctx->inflight);
@@ -7877,6 +7902,7 @@ static int io_uring_alloc_task_context(struct task_struct *task,
init_waitqueue_head(&tctx->wait);
tctx->last = NULL;
atomic_set(&tctx->in_idle, 0);
+ atomic_set(&tctx->inflight_tracked, 0);
task->io_uring = tctx;
spin_lock_init(&tctx->task_lock);
INIT_WQ_LIST(&tctx->task_list);
@@ -7910,21 +7936,15 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
f = fdget(p->wq_fd);
if (!f.file)
return -ENXIO;
- if (f.file->f_op != &io_uring_fops) {
- fdput(f);
- return -EINVAL;
- }
fdput(f);
+ if (f.file->f_op != &io_uring_fops)
+ return -EINVAL;
}
if (ctx->flags & IORING_SETUP_SQPOLL) {
struct task_struct *tsk;
struct io_sq_data *sqd;
bool attached;
- ret = -EPERM;
- if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE))
- goto err;
-
sqd = io_get_sq_data(p, &attached);
if (IS_ERR(sqd)) {
ret = PTR_ERR(sqd);
@@ -7937,13 +7957,11 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
if (!ctx->sq_thread_idle)
ctx->sq_thread_idle = HZ;
- ret = 0;
io_sq_thread_park(sqd);
list_add(&ctx->sqd_list, &sqd->ctx_list);
io_sqd_update_thread_idle(sqd);
/* don't attach to a dying SQPOLL thread, would be racy */
- if (attached && !sqd->thread)
- ret = -ENXIO;
+ ret = (attached && !sqd->thread) ? -ENXIO : 0;
io_sq_thread_unpark(sqd);
if (ret < 0)
@@ -7955,11 +7973,8 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
int cpu = p->sq_thread_cpu;
ret = -EINVAL;
- if (cpu >= nr_cpu_ids)
- goto err_sqpoll;
- if (!cpu_online(cpu))
+ if (cpu >= nr_cpu_ids || !cpu_online(cpu))
goto err_sqpoll;
-
sqd->sq_cpu = cpu;
} else {
sqd->sq_cpu = -1;
@@ -7985,12 +8000,11 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
}
return 0;
+err_sqpoll:
+ complete(&ctx->sq_data->exited);
err:
io_sq_thread_finish(ctx);
return ret;
-err_sqpoll:
- complete(&ctx->sq_data->exited);
- goto err;
}
static inline void __io_unaccount_mem(struct user_struct *user,
@@ -8092,29 +8106,49 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
return off;
}
-static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
+static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
{
- int i, j;
-
- if (!ctx->user_bufs)
- return -ENXIO;
+ struct io_mapped_ubuf *imu = *slot;
+ unsigned int i;
- for (i = 0; i < ctx->nr_user_bufs; i++) {
- struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
+ for (i = 0; i < imu->nr_bvecs; i++)
+ unpin_user_page(imu->bvec[i].bv_page);
+ if (imu->acct_pages)
+ io_unaccount_mem(ctx, imu->acct_pages);
+ kvfree(imu);
+ *slot = NULL;
+}
- for (j = 0; j < imu->nr_bvecs; j++)
- unpin_user_page(imu->bvec[j].bv_page);
+static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
+{
+ io_buffer_unmap(ctx, &prsrc->buf);
+ prsrc->buf = NULL;
+}
- if (imu->acct_pages)
- io_unaccount_mem(ctx, imu->acct_pages);
- kvfree(imu->bvec);
- imu->nr_bvecs = 0;
- }
+static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
+{
+ unsigned int i;
+ for (i = 0; i < ctx->nr_user_bufs; i++)
+ io_buffer_unmap(ctx, &ctx->user_bufs[i]);
kfree(ctx->user_bufs);
+ kfree(ctx->buf_data);
ctx->user_bufs = NULL;
+ ctx->buf_data = NULL;
ctx->nr_user_bufs = 0;
- return 0;
+}
+
+static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
+{
+ int ret;
+
+ if (!ctx->buf_data)
+ return -ENXIO;
+
+ ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
+ if (!ret)
+ __io_sqe_buffers_unregister(ctx);
+ return ret;
}
static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
@@ -8166,7 +8200,7 @@ static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
/* check previously registered pages */
for (i = 0; i < ctx->nr_user_bufs; i++) {
- struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
+ struct io_mapped_ubuf *imu = ctx->user_bufs[i];
for (j = 0; j < imu->nr_bvecs; j++) {
if (!PageCompound(imu->bvec[j].bv_page))
@@ -8211,9 +8245,10 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
}
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
- struct io_mapped_ubuf *imu,
+ struct io_mapped_ubuf **pimu,
struct page **last_hpage)
{
+ struct io_mapped_ubuf *imu = NULL;
struct vm_area_struct **vmas = NULL;
struct page **pages = NULL;
unsigned long off, start, end, ubuf;
@@ -8225,6 +8260,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
start = ubuf >> PAGE_SHIFT;
nr_pages = end - start;
+ *pimu = NULL;
ret = -ENOMEM;
pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
@@ -8236,9 +8272,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
if (!vmas)
goto done;
- imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
- GFP_KERNEL);
- if (!imu->bvec)
+ imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
+ if (!imu)
goto done;
ret = 0;
@@ -8267,14 +8302,12 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
*/
if (pret > 0)
unpin_user_pages(pages, pret);
- kvfree(imu->bvec);
goto done;
}
ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage);
if (ret) {
unpin_user_pages(pages, pret);
- kvfree(imu->bvec);
goto done;
}
@@ -8292,10 +8325,13 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
}
/* store original address for later verification */
imu->ubuf = ubuf;
- imu->len = iov->iov_len;
+ imu->ubuf_end = ubuf + iov->iov_len;
imu->nr_bvecs = nr_pages;
+ *pimu = imu;
ret = 0;
done:
+ if (ret)
+ kvfree(imu);
kvfree(pages);
kvfree(vmas);
return ret;
@@ -8303,21 +8339,14 @@ done:
static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
{
- if (ctx->user_bufs)
- return -EBUSY;
- if (!nr_args || nr_args > UIO_MAXIOV)
- return -EINVAL;
-
- ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
- GFP_KERNEL);
- if (!ctx->user_bufs)
- return -ENOMEM;
-
- return 0;
+ ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
+ return ctx->user_bufs ? 0 : -ENOMEM;
}
static int io_buffer_validate(struct iovec *iov)
{
+ unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
+
/*
* Don't impose further limits on the size and buffer
* constraints here, we'll -EINVAL later when IO is
@@ -8330,44 +8359,123 @@ static int io_buffer_validate(struct iovec *iov)
if (iov->iov_len > SZ_1G)
return -EFAULT;
+ if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
+ return -EOVERFLOW;
+
return 0;
}
static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
- unsigned int nr_args)
+ unsigned int nr_args, u64 __user *tags)
{
+ struct page *last_hpage = NULL;
+ struct io_rsrc_data *data;
int i, ret;
struct iovec iov;
- struct page *last_hpage = NULL;
- ret = io_buffers_map_alloc(ctx, nr_args);
+ if (ctx->user_bufs)
+ return -EBUSY;
+ if (!nr_args || nr_args > UIO_MAXIOV)
+ return -EINVAL;
+ ret = io_rsrc_node_switch_start(ctx);
if (ret)
return ret;
+ data = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, nr_args);
+ if (!data)
+ return -ENOMEM;
+ ret = io_buffers_map_alloc(ctx, nr_args);
+ if (ret) {
+ kfree(data);
+ return ret;
+ }
- for (i = 0; i < nr_args; i++) {
- struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
+ for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
+ u64 tag = 0;
+ if (tags && copy_from_user(&tag, &tags[i], sizeof(tag))) {
+ ret = -EFAULT;
+ break;
+ }
ret = io_copy_iov(ctx, &iov, arg, i);
if (ret)
break;
-
ret = io_buffer_validate(&iov);
if (ret)
break;
- ret = io_sqe_buffer_register(ctx, &iov, imu, &last_hpage);
+ ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
+ &last_hpage);
if (ret)
break;
-
- ctx->nr_user_bufs++;
+ data->tags[i] = tag;
}
- if (ret)
- io_sqe_buffers_unregister(ctx);
+ WARN_ON_ONCE(ctx->buf_data);
+ ctx->buf_data = data;
+ if (ret)
+ __io_sqe_buffers_unregister(ctx);
+ else
+ io_rsrc_node_switch(ctx, NULL);
return ret;
}
+static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
+ struct io_uring_rsrc_update2 *up,
+ unsigned int nr_args)
+{
+ u64 __user *tags = u64_to_user_ptr(up->tags);
+ struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
+ struct page *last_hpage = NULL;
+ bool needs_switch = false;
+ __u32 done;
+ int i, err;
+
+ if (!ctx->buf_data)
+ return -ENXIO;
+ if (up->offset + nr_args > ctx->nr_user_bufs)
+ return -EINVAL;
+
+ for (done = 0; done < nr_args; done++) {
+ struct io_mapped_ubuf *imu;
+ int offset = up->offset + done;
+ u64 tag = 0;
+
+ err = io_copy_iov(ctx, &iov, iovs, done);
+ if (err)
+ break;
+ if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
+ err = -EFAULT;
+ break;
+ }
+ err = io_buffer_validate(&iov);
+ if (err)
+ break;
+ err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
+ if (err)
+ break;
+
+ i = array_index_nospec(offset, ctx->nr_user_bufs);
+ if (ctx->user_bufs[i]) {
+ err = io_queue_rsrc_removal(ctx->buf_data, offset,
+ ctx->rsrc_node, ctx->user_bufs[i]);
+ if (unlikely(err)) {
+ io_buffer_unmap(ctx, &imu);
+ break;
+ }
+ ctx->user_bufs[i] = NULL;
+ needs_switch = true;
+ }
+
+ ctx->user_bufs[i] = imu;
+ ctx->buf_data->tags[offset] = tag;
+ }
+
+ if (needs_switch)
+ io_rsrc_node_switch(ctx, ctx->buf_data);
+ return done ? done : err;
+}
+
static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
{
__s32 __user *fds = arg;
@@ -8434,30 +8542,23 @@ static void io_req_caches_free(struct io_ring_ctx *ctx)
submit_state->free_reqs = 0;
}
- spin_lock_irq(&ctx->completion_lock);
- list_splice_init(&cs->locked_free_list, &cs->free_list);
- cs->locked_free_nr = 0;
- spin_unlock_irq(&ctx->completion_lock);
-
+ io_flush_cached_locked_reqs(ctx, cs);
io_req_cache_free(&cs->free_list, NULL);
-
mutex_unlock(&ctx->uring_lock);
}
-static void io_ring_ctx_free(struct io_ring_ctx *ctx)
+static bool io_wait_rsrc_data(struct io_rsrc_data *data)
{
- /*
- * Some may use context even when all refs and requests have been put,
- * and they are free to do so while still holding uring_lock or
- * completion_lock, see __io_req_task_submit(). Wait for them to finish.
- */
- mutex_lock(&ctx->uring_lock);
- mutex_unlock(&ctx->uring_lock);
- spin_lock_irq(&ctx->completion_lock);
- spin_unlock_irq(&ctx->completion_lock);
+ if (!data)
+ return false;
+ if (!atomic_dec_and_test(&data->refs))
+ wait_for_completion(&data->done);
+ return true;
+}
+static void io_ring_ctx_free(struct io_ring_ctx *ctx)
+{
io_sq_thread_finish(ctx);
- io_sqe_buffers_unregister(ctx);
if (ctx->mm_account) {
mmdrop(ctx->mm_account);
@@ -8465,10 +8566,27 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
}
mutex_lock(&ctx->uring_lock);
- io_sqe_files_unregister(ctx);
+ if (io_wait_rsrc_data(ctx->buf_data))
+ __io_sqe_buffers_unregister(ctx);
+ if (io_wait_rsrc_data(ctx->file_data))
+ __io_sqe_files_unregister(ctx);
+ if (ctx->rings)
+ __io_cqring_overflow_flush(ctx, true);
mutex_unlock(&ctx->uring_lock);
io_eventfd_unregister(ctx);
io_destroy_buffers(ctx);
+ if (ctx->sq_creds)
+ put_cred(ctx->sq_creds);
+
+ /* there are no registered resources left, nobody uses it */
+ if (ctx->rsrc_node)
+ io_rsrc_node_destroy(ctx->rsrc_node);
+ if (ctx->rsrc_backup_node)
+ io_rsrc_node_destroy(ctx->rsrc_backup_node);
+ flush_delayed_work(&ctx->rsrc_put_work);
+
+ WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
+ WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));
#if defined(CONFIG_UNIX)
if (ctx->ring_sock) {
@@ -8568,6 +8686,13 @@ static void io_tctx_exit_cb(struct callback_head *cb)
complete(&work->completion);
}
+static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
+{
+ struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+
+ return req->ctx == data;
+}
+
static void io_ring_exit_work(struct work_struct *work)
{
struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
@@ -8576,14 +8701,6 @@ static void io_ring_exit_work(struct work_struct *work)
struct io_tctx_node *node;
int ret;
- /* prevent SQPOLL from submitting new requests */
- if (ctx->sq_data) {
- io_sq_thread_park(ctx->sq_data);
- list_del_init(&ctx->sqd_list);
- io_sqd_update_thread_idle(ctx->sq_data);
- io_sq_thread_unpark(ctx->sq_data);
- }
-
/*
* If we're doing polled IO and end up having requests being
* submitted async (out-of-line), then completions can come in while
@@ -8592,19 +8709,38 @@ static void io_ring_exit_work(struct work_struct *work)
*/
do {
io_uring_try_cancel_requests(ctx, NULL, NULL);
+ if (ctx->sq_data) {
+ struct io_sq_data *sqd = ctx->sq_data;
+ struct task_struct *tsk;
+
+ io_sq_thread_park(sqd);
+ tsk = sqd->thread;
+ if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
+ io_wq_cancel_cb(tsk->io_uring->io_wq,
+ io_cancel_ctx_cb, ctx, true);
+ io_sq_thread_unpark(sqd);
+ }
WARN_ON_ONCE(time_after(jiffies, timeout));
} while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
+ init_completion(&exit.completion);
+ init_task_work(&exit.task_work, io_tctx_exit_cb);
+ exit.ctx = ctx;
+ /*
+ * Some may use context even when all refs and requests have been put,
+ * and they are free to do so while still holding uring_lock or
+ * completion_lock, see __io_req_task_submit(). Apart from other work,
+ * this lock/unlock section also waits them to finish.
+ */
mutex_lock(&ctx->uring_lock);
while (!list_empty(&ctx->tctx_list)) {
WARN_ON_ONCE(time_after(jiffies, timeout));
node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
ctx_node);
- exit.ctx = ctx;
- init_completion(&exit.completion);
- init_task_work(&exit.task_work, io_tctx_exit_cb);
+ /* don't spin on a single task if cancellation failed */
+ list_rotate_left(&ctx->tctx_list);
ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
if (WARN_ON_ONCE(ret))
continue;
@@ -8612,10 +8748,11 @@ static void io_ring_exit_work(struct work_struct *work)
mutex_unlock(&ctx->uring_lock);
wait_for_completion(&exit.completion);
- cond_resched();
mutex_lock(&ctx->uring_lock);
}
mutex_unlock(&ctx->uring_lock);
+ spin_lock_irq(&ctx->completion_lock);
+ spin_unlock_irq(&ctx->completion_lock);
io_ring_ctx_free(ctx);
}
@@ -8649,10 +8786,8 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
mutex_lock(&ctx->uring_lock);
percpu_ref_kill(&ctx->refs);
- /* if force is set, the ring is going away. always drop after that */
- ctx->cq_overflow_flushed = 1;
if (ctx->rings)
- __io_cqring_overflow_flush(ctx, true, NULL, NULL);
+ __io_cqring_overflow_flush(ctx, true);
xa_for_each(&ctx->personalities, index, creds)
io_unregister_personality(ctx, index);
mutex_unlock(&ctx->uring_lock);
@@ -8728,21 +8863,12 @@ static bool io_cancel_defer_files(struct io_ring_ctx *ctx,
while (!list_empty(&list)) {
de = list_first_entry(&list, struct io_defer_entry, list);
list_del_init(&de->list);
- req_set_fail_links(de->req);
- io_put_req(de->req);
- io_req_complete(de->req, -ECANCELED);
+ io_req_complete_failed(de->req, -ECANCELED);
kfree(de);
}
return true;
}
-static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
-{
- struct io_kiocb *req = container_of(work, struct io_kiocb, work);
-
- return req->ctx == data;
-}
-
static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
{
struct io_tctx_node *node;
@@ -8804,53 +8930,13 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
ret |= io_kill_timeouts(ctx, task, files);
ret |= io_run_task_work();
ret |= io_run_ctx_fallback(ctx);
- io_cqring_overflow_flush(ctx, true, task, files);
if (!ret)
break;
cond_resched();
}
}
-static int io_uring_count_inflight(struct io_ring_ctx *ctx,
- struct task_struct *task,
- struct files_struct *files)
-{
- struct io_kiocb *req;
- int cnt = 0;
-
- spin_lock_irq(&ctx->inflight_lock);
- list_for_each_entry(req, &ctx->inflight_list, inflight_entry)
- cnt += io_match_task(req, task, files);
- spin_unlock_irq(&ctx->inflight_lock);
- return cnt;
-}
-
-static void io_uring_cancel_files(struct io_ring_ctx *ctx,
- struct task_struct *task,
- struct files_struct *files)
-{
- while (!list_empty_careful(&ctx->inflight_list)) {
- DEFINE_WAIT(wait);
- int inflight;
-
- inflight = io_uring_count_inflight(ctx, task, files);
- if (!inflight)
- break;
-
- io_uring_try_cancel_requests(ctx, task, files);
-
- prepare_to_wait(&task->io_uring->wait, &wait,
- TASK_UNINTERRUPTIBLE);
- if (inflight == io_uring_count_inflight(ctx, task, files))
- schedule();
- finish_wait(&task->io_uring->wait, &wait);
- }
-}
-
-/*
- * Note that this task has used io_uring. We use it for cancelation purposes.
- */
-static int io_uring_add_task_file(struct io_ring_ctx *ctx)
+static int __io_uring_add_task_file(struct io_ring_ctx *ctx)
{
struct io_uring_task *tctx = current->io_uring;
struct io_tctx_node *node;
@@ -8862,33 +8948,41 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx)
return ret;
tctx = current->io_uring;
}
- if (tctx->last != ctx) {
- void *old = xa_load(&tctx->xa, (unsigned long)ctx);
-
- if (!old) {
- node = kmalloc(sizeof(*node), GFP_KERNEL);
- if (!node)
- return -ENOMEM;
- node->ctx = ctx;
- node->task = current;
-
- ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
- node, GFP_KERNEL));
- if (ret) {
- kfree(node);
- return ret;
- }
+ if (!xa_load(&tctx->xa, (unsigned long)ctx)) {
+ node = kmalloc(sizeof(*node), GFP_KERNEL);
+ if (!node)
+ return -ENOMEM;
+ node->ctx = ctx;
+ node->task = current;
- mutex_lock(&ctx->uring_lock);
- list_add(&node->ctx_node, &ctx->tctx_list);
- mutex_unlock(&ctx->uring_lock);
+ ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
+ node, GFP_KERNEL));
+ if (ret) {
+ kfree(node);
+ return ret;
}
- tctx->last = ctx;
+
+ mutex_lock(&ctx->uring_lock);
+ list_add(&node->ctx_node, &ctx->tctx_list);
+ mutex_unlock(&ctx->uring_lock);
}
+ tctx->last = ctx;
return 0;
}
/*
+ * Note that this task has used io_uring. We use it for cancelation purposes.
+ */
+static inline int io_uring_add_task_file(struct io_ring_ctx *ctx)
+{
+ struct io_uring_task *tctx = current->io_uring;
+
+ if (likely(tctx && tctx->last == ctx))
+ return 0;
+ return __io_uring_add_task_file(ctx);
+}
+
+/*
* Remove this io_uring_file -> task mapping.
*/
static void io_uring_del_task_file(unsigned long index)
@@ -8927,86 +9021,48 @@ static void io_uring_clean_tctx(struct io_uring_task *tctx)
}
}
-static s64 tctx_inflight(struct io_uring_task *tctx)
+static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
{
+ if (tracked)
+ return atomic_read(&tctx->inflight_tracked);
return percpu_counter_sum(&tctx->inflight);
}
-static void io_sqpoll_cancel_cb(struct callback_head *cb)
-{
- struct io_tctx_exit *work = container_of(cb, struct io_tctx_exit, task_work);
- struct io_ring_ctx *ctx = work->ctx;
- struct io_sq_data *sqd = ctx->sq_data;
-
- if (sqd->thread)
- io_uring_cancel_sqpoll(ctx);
- complete(&work->completion);
-}
-
-static void io_sqpoll_cancel_sync(struct io_ring_ctx *ctx)
-{
- struct io_sq_data *sqd = ctx->sq_data;
- struct io_tctx_exit work = { .ctx = ctx, };
- struct task_struct *task;
-
- io_sq_thread_park(sqd);
- list_del_init(&ctx->sqd_list);
- io_sqd_update_thread_idle(sqd);
- task = sqd->thread;
- if (task) {
- init_completion(&work.completion);
- init_task_work(&work.task_work, io_sqpoll_cancel_cb);
- io_task_work_add_head(&sqd->park_task_work, &work.task_work);
- wake_up_process(task);
- }
- io_sq_thread_unpark(sqd);
-
- if (task)
- wait_for_completion(&work.completion);
-}
-
-void __io_uring_files_cancel(struct files_struct *files)
+static void io_uring_try_cancel(struct files_struct *files)
{
struct io_uring_task *tctx = current->io_uring;
struct io_tctx_node *node;
unsigned long index;
- /* make sure overflow events are dropped */
- atomic_inc(&tctx->in_idle);
xa_for_each(&tctx->xa, index, node) {
struct io_ring_ctx *ctx = node->ctx;
- if (ctx->sq_data) {
- io_sqpoll_cancel_sync(ctx);
- continue;
- }
- io_uring_cancel_files(ctx, current, files);
- if (!files)
- io_uring_try_cancel_requests(ctx, current, NULL);
+ /* sqpoll task will cancel all its requests */
+ if (!ctx->sq_data)
+ io_uring_try_cancel_requests(ctx, current, files);
}
- atomic_dec(&tctx->in_idle);
-
- if (files)
- io_uring_clean_tctx(tctx);
}
/* should only be called by SQPOLL task */
-static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx)
+static void io_uring_cancel_sqpoll(struct io_sq_data *sqd)
{
- struct io_sq_data *sqd = ctx->sq_data;
struct io_uring_task *tctx = current->io_uring;
+ struct io_ring_ctx *ctx;
s64 inflight;
DEFINE_WAIT(wait);
- WARN_ON_ONCE(!sqd || ctx->sq_data->thread != current);
+ if (!current->io_uring)
+ return;
+ WARN_ON_ONCE(!sqd || sqd->thread != current);
atomic_inc(&tctx->in_idle);
do {
/* read completions before cancelations */
- inflight = tctx_inflight(tctx);
+ inflight = tctx_inflight(tctx, false);
if (!inflight)
break;
- io_uring_try_cancel_requests(ctx, current, NULL);
+ list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
+ io_uring_try_cancel_requests(ctx, current, NULL);
prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
/*
@@ -9014,7 +9070,7 @@ static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx)
* avoids a race where a completion comes in before we did
* prepare_to_wait().
*/
- if (inflight == tctx_inflight(tctx))
+ if (inflight == tctx_inflight(tctx, false))
schedule();
finish_wait(&tctx->wait, &wait);
} while (1);
@@ -9025,7 +9081,7 @@ static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx)
* Find any io_uring fd that this task has registered or done IO on, and cancel
* requests.
*/
-void __io_uring_task_cancel(void)
+void __io_uring_cancel(struct files_struct *files)
{
struct io_uring_task *tctx = current->io_uring;
DEFINE_WAIT(wait);
@@ -9033,15 +9089,12 @@ void __io_uring_task_cancel(void)
/* make sure overflow events are dropped */
atomic_inc(&tctx->in_idle);
- __io_uring_files_cancel(NULL);
-
do {
/* read completions before cancelations */
- inflight = tctx_inflight(tctx);
+ inflight = tctx_inflight(tctx, !!files);
if (!inflight)
break;
- __io_uring_files_cancel(NULL);
-
+ io_uring_try_cancel(files);
prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
/*
@@ -9049,16 +9102,17 @@ void __io_uring_task_cancel(void)
* avoids a race where a completion comes in before we did
* prepare_to_wait().
*/
- if (inflight == tctx_inflight(tctx))
+ if (inflight == tctx_inflight(tctx, !!files))
schedule();
finish_wait(&tctx->wait, &wait);
} while (1);
-
atomic_dec(&tctx->in_idle);
io_uring_clean_tctx(tctx);
- /* all current's requests should be gone, we can kill tctx */
- __io_uring_free(current);
+ if (!files) {
+ /* for exec all current's requests should be gone, kill tctx */
+ __io_uring_free(current);
+ }
}
static void *io_uring_validate_mmap_request(struct file *file,
@@ -9184,31 +9238,31 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
size_t, argsz)
{
struct io_ring_ctx *ctx;
- long ret = -EBADF;
int submitted = 0;
struct fd f;
+ long ret;
io_run_task_work();
- if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
- IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG))
+ if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
+ IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)))
return -EINVAL;
f = fdget(fd);
- if (!f.file)
+ if (unlikely(!f.file))
return -EBADF;
ret = -EOPNOTSUPP;
- if (f.file->f_op != &io_uring_fops)
+ if (unlikely(f.file->f_op != &io_uring_fops))
goto out_fput;
ret = -ENXIO;
ctx = f.file->private_data;
- if (!percpu_ref_tryget(&ctx->refs))
+ if (unlikely(!percpu_ref_tryget(&ctx->refs)))
goto out_fput;
ret = -EBADFD;
- if (ctx->flags & IORING_SETUP_R_DISABLED)
+ if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
goto out;
/*
@@ -9218,7 +9272,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
*/
ret = 0;
if (ctx->flags & IORING_SETUP_SQPOLL) {
- io_cqring_overflow_flush(ctx, false, NULL, NULL);
+ io_cqring_overflow_flush(ctx, false);
ret = -EOWNERDEAD;
if (unlikely(ctx->sq_data->thread == NULL)) {
@@ -9331,7 +9385,7 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
- struct file *f = *io_fixed_file_slot(ctx->file_data, i);
+ struct file *f = io_file_from_index(ctx, i);
if (f)
seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
@@ -9340,10 +9394,10 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
}
seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
- struct io_mapped_ubuf *buf = &ctx->user_bufs[i];
+ struct io_mapped_ubuf *buf = ctx->user_bufs[i];
+ unsigned int len = buf->ubuf_end - buf->ubuf;
- seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf,
- (unsigned int) buf->len);
+ seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len);
}
if (has_lock && !xa_empty(&ctx->personalities)) {
unsigned long index;
@@ -9552,6 +9606,9 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
ret = io_sq_offload_create(ctx, p);
if (ret)
goto err;
+ /* always set a rsrc node */
+ io_rsrc_node_switch_start(ctx);
+ io_rsrc_node_switch(ctx, NULL);
memset(&p->sq_off, 0, sizeof(p->sq_off));
p->sq_off.head = offsetof(struct io_rings, sq.head);
@@ -9777,14 +9834,96 @@ static int io_register_enable_rings(struct io_ring_ctx *ctx)
return 0;
}
+static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
+ struct io_uring_rsrc_update2 *up,
+ unsigned nr_args)
+{
+ __u32 tmp;
+ int err;
+
+ if (up->resv)
+ return -EINVAL;
+ if (check_add_overflow(up->offset, nr_args, &tmp))
+ return -EOVERFLOW;
+ err = io_rsrc_node_switch_start(ctx);
+ if (err)
+ return err;
+
+ switch (type) {
+ case IORING_RSRC_FILE:
+ return __io_sqe_files_update(ctx, up, nr_args);
+ case IORING_RSRC_BUFFER:
+ return __io_sqe_buffers_update(ctx, up, nr_args);
+ }
+ return -EINVAL;
+}
+
+static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned nr_args)
+{
+ struct io_uring_rsrc_update2 up;
+
+ if (!nr_args)
+ return -EINVAL;
+ memset(&up, 0, sizeof(up));
+ if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
+ return -EFAULT;
+ return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
+}
+
+static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned size)
+{
+ struct io_uring_rsrc_update2 up;
+
+ if (size != sizeof(up))
+ return -EINVAL;
+ if (copy_from_user(&up, arg, sizeof(up)))
+ return -EFAULT;
+ if (!up.nr)
+ return -EINVAL;
+ return __io_register_rsrc_update(ctx, up.type, &up, up.nr);
+}
+
+static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned int size)
+{
+ struct io_uring_rsrc_register rr;
+
+ /* keep it extendible */
+ if (size != sizeof(rr))
+ return -EINVAL;
+
+ memset(&rr, 0, sizeof(rr));
+ if (copy_from_user(&rr, arg, size))
+ return -EFAULT;
+ if (!rr.nr)
+ return -EINVAL;
+
+ switch (rr.type) {
+ case IORING_RSRC_FILE:
+ return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
+ rr.nr, u64_to_user_ptr(rr.tags));
+ case IORING_RSRC_BUFFER:
+ return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
+ rr.nr, u64_to_user_ptr(rr.tags));
+ }
+ return -EINVAL;
+}
+
static bool io_register_op_must_quiesce(int op)
{
switch (op) {
+ case IORING_REGISTER_BUFFERS:
+ case IORING_UNREGISTER_BUFFERS:
+ case IORING_REGISTER_FILES:
case IORING_UNREGISTER_FILES:
case IORING_REGISTER_FILES_UPDATE:
case IORING_REGISTER_PROBE:
case IORING_REGISTER_PERSONALITY:
case IORING_UNREGISTER_PERSONALITY:
+ case IORING_REGISTER_RSRC:
+ case IORING_REGISTER_RSRC_UPDATE:
return false;
default:
return true;
@@ -9806,6 +9945,14 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
if (percpu_ref_is_dying(&ctx->refs))
return -ENXIO;
+ if (ctx->restricted) {
+ if (opcode >= IORING_REGISTER_LAST)
+ return -EINVAL;
+ opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
+ if (!test_bit(opcode, ctx->restrictions.register_op))
+ return -EACCES;
+ }
+
if (io_register_op_must_quiesce(opcode)) {
percpu_ref_kill(&ctx->refs);
@@ -9826,30 +9973,17 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
if (ret < 0)
break;
} while (1);
-
mutex_lock(&ctx->uring_lock);
if (ret) {
- percpu_ref_resurrect(&ctx->refs);
- goto out_quiesce;
- }
- }
-
- if (ctx->restricted) {
- if (opcode >= IORING_REGISTER_LAST) {
- ret = -EINVAL;
- goto out;
- }
-
- if (!test_bit(opcode, ctx->restrictions.register_op)) {
- ret = -EACCES;
- goto out;
+ io_refs_resurrect(&ctx->refs, &ctx->ref_comp);
+ return ret;
}
}
switch (opcode) {
case IORING_REGISTER_BUFFERS:
- ret = io_sqe_buffers_register(ctx, arg, nr_args);
+ ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
break;
case IORING_UNREGISTER_BUFFERS:
ret = -EINVAL;
@@ -9858,7 +9992,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
ret = io_sqe_buffers_unregister(ctx);
break;
case IORING_REGISTER_FILES:
- ret = io_sqe_files_register(ctx, arg, nr_args);
+ ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
break;
case IORING_UNREGISTER_FILES:
ret = -EINVAL;
@@ -9867,7 +10001,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
ret = io_sqe_files_unregister(ctx);
break;
case IORING_REGISTER_FILES_UPDATE:
- ret = io_sqe_files_update(ctx, arg, nr_args);
+ ret = io_register_files_update(ctx, arg, nr_args);
break;
case IORING_REGISTER_EVENTFD:
case IORING_REGISTER_EVENTFD_ASYNC:
@@ -9915,16 +10049,20 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
case IORING_REGISTER_RESTRICTIONS:
ret = io_register_restrictions(ctx, arg, nr_args);
break;
+ case IORING_REGISTER_RSRC:
+ ret = io_register_rsrc(ctx, arg, nr_args);
+ break;
+ case IORING_REGISTER_RSRC_UPDATE:
+ ret = io_register_rsrc_update(ctx, arg, nr_args);
+ break;
default:
ret = -EINVAL;
break;
}
-out:
if (io_register_op_must_quiesce(opcode)) {
/* bring the ctx back to life */
percpu_ref_reinit(&ctx->refs);
-out_quiesce:
reinit_completion(&ctx->ref_comp);
}
return ret;
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 79cde9906be0..04b650bcbbe5 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -7,19 +7,17 @@
#if defined(CONFIG_IO_URING)
struct sock *io_uring_get_socket(struct file *file);
-void __io_uring_task_cancel(void);
-void __io_uring_files_cancel(struct files_struct *files);
+void __io_uring_cancel(struct files_struct *files);
void __io_uring_free(struct task_struct *tsk);
-static inline void io_uring_task_cancel(void)
+static inline void io_uring_files_cancel(struct files_struct *files)
{
if (current->io_uring)
- __io_uring_task_cancel();
+ __io_uring_cancel(files);
}
-static inline void io_uring_files_cancel(struct files_struct *files)
+static inline void io_uring_task_cancel(void)
{
- if (current->io_uring)
- __io_uring_files_cancel(files);
+ return io_uring_files_cancel(NULL);
}
static inline void io_uring_free(struct task_struct *tsk)
{
diff --git a/include/linux/task_work.h b/include/linux/task_work.h
index 0d848a1e9e62..5b8a93f288bb 100644
--- a/include/linux/task_work.h
+++ b/include/linux/task_work.h
@@ -22,6 +22,8 @@ enum task_work_notify_mode {
int task_work_add(struct task_struct *task, struct callback_head *twork,
enum task_work_notify_mode mode);
+struct callback_head *task_work_cancel_match(struct task_struct *task,
+ bool (*match)(struct callback_head *, void *data), void *data);
struct callback_head *task_work_cancel(struct task_struct *, task_work_func_t);
void task_work_run(void);
diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index 9f0d3b7d56b0..bd528176a3d5 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -290,29 +290,32 @@ TRACE_EVENT(io_uring_fail_link,
* @ctx: pointer to a ring context structure
* @user_data: user data associated with the request
* @res: result of the request
+ * @cflags: completion flags
*
*/
TRACE_EVENT(io_uring_complete,
- TP_PROTO(void *ctx, u64 user_data, long res),
+ TP_PROTO(void *ctx, u64 user_data, long res, unsigned cflags),
- TP_ARGS(ctx, user_data, res),
+ TP_ARGS(ctx, user_data, res, cflags),
TP_STRUCT__entry (
__field( void *, ctx )
__field( u64, user_data )
__field( long, res )
+ __field( unsigned, cflags )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->user_data = user_data;
__entry->res = res;
+ __entry->cflags = cflags;
),
- TP_printk("ring %p, user_data 0x%llx, result %ld",
+ TP_printk("ring %p, user_data 0x%llx, result %ld, cflags %x",
__entry->ctx, (unsigned long long)__entry->user_data,
- __entry->res)
+ __entry->res, __entry->cflags)
);
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 2514eb6b1cf2..e1ae46683301 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -160,6 +160,21 @@ enum {
#define SPLICE_F_FD_IN_FIXED (1U << 31) /* the last bit of __u32 */
/*
+ * POLL_ADD flags. Note that since sqe->poll_events is the flag space, the
+ * command flags for POLL_ADD are stored in sqe->len.
+ *
+ * IORING_POLL_ADD_MULTI Multishot poll. Sets IORING_CQE_F_MORE if
+ * the poll handler will continue to report
+ * CQEs on behalf of the same SQE.
+ *
+ * IORING_POLL_UPDATE Update existing poll request, matching
+ * sqe->addr as the old user_data field.
+ */
+#define IORING_POLL_ADD_MULTI (1U << 0)
+#define IORING_POLL_UPDATE_EVENTS (1U << 1)
+#define IORING_POLL_UPDATE_USER_DATA (1U << 2)
+
+/*
* IO completion data structure (Completion Queue Entry)
*/
struct io_uring_cqe {
@@ -172,8 +187,10 @@ struct io_uring_cqe {
* cqe->flags
*
* IORING_CQE_F_BUFFER If set, the upper 16 bits are the buffer ID
+ * IORING_CQE_F_MORE If set, parent SQE will generate more CQE entries
*/
#define IORING_CQE_F_BUFFER (1U << 0)
+#define IORING_CQE_F_MORE (1U << 1)
enum {
IORING_CQE_BUFFER_SHIFT = 16,
@@ -281,6 +298,8 @@ enum {
IORING_UNREGISTER_PERSONALITY = 10,
IORING_REGISTER_RESTRICTIONS = 11,
IORING_REGISTER_ENABLE_RINGS = 12,
+ IORING_REGISTER_RSRC = 13,
+ IORING_REGISTER_RSRC_UPDATE = 14,
/* this goes last */
IORING_REGISTER_LAST
@@ -293,12 +312,33 @@ struct io_uring_files_update {
__aligned_u64 /* __s32 * */ fds;
};
+enum {
+ IORING_RSRC_FILE = 0,
+ IORING_RSRC_BUFFER = 1,
+};
+
+struct io_uring_rsrc_register {
+ __u32 type;
+ __u32 nr;
+ __aligned_u64 data;
+ __aligned_u64 tags;
+};
+
struct io_uring_rsrc_update {
__u32 offset;
__u32 resv;
__aligned_u64 data;
};
+struct io_uring_rsrc_update2 {
+ __u32 offset;
+ __u32 resv;
+ __aligned_u64 data;
+ __aligned_u64 tags;
+ __u32 type;
+ __u32 nr;
+};
+
/* Skip updating fd indexes set to this value in the fd table */
#define IORING_REGISTER_FILES_SKIP (-2)
diff --git a/kernel/fork.c b/kernel/fork.c
index f98de491f013..0f1992d3f80b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -927,6 +927,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->splice_pipe = NULL;
tsk->task_frag.page = NULL;
tsk->wake_q.next = NULL;
+ tsk->pf_io_worker = NULL;
account_kernel_stack(tsk, 1);
@@ -1941,7 +1942,7 @@ static __latent_entropy struct task_struct *copy_process(
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
retval = -ERESTARTNOINTR;
- if (signal_pending(current))
+ if (task_sigpending(current))
goto fork_out;
retval = -ENOMEM;
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 9cde961875c0..e9316198c64b 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -59,18 +59,17 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
}
/**
- * task_work_cancel - cancel a pending work added by task_work_add()
+ * task_work_cancel_match - cancel a pending work added by task_work_add()
* @task: the task which should execute the work
- * @func: identifies the work to remove
- *
- * Find the last queued pending work with ->func == @func and remove
- * it from queue.
+ * @match: match function to call
*
* RETURNS:
* The found work or NULL if not found.
*/
struct callback_head *
-task_work_cancel(struct task_struct *task, task_work_func_t func)
+task_work_cancel_match(struct task_struct *task,
+ bool (*match)(struct callback_head *, void *data),
+ void *data)
{
struct callback_head **pprev = &task->task_works;
struct callback_head *work;
@@ -86,7 +85,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func)
*/
raw_spin_lock_irqsave(&task->pi_lock, flags);
while ((work = READ_ONCE(*pprev))) {
- if (work->func != func)
+ if (!match(work, data))
pprev = &work->next;
else if (cmpxchg(pprev, work, work->next) == work)
break;
@@ -96,6 +95,28 @@ task_work_cancel(struct task_struct *task, task_work_func_t func)
return work;
}
+static bool task_work_func_match(struct callback_head *cb, void *data)
+{
+ return cb->func == data;
+}
+
+/**
+ * task_work_cancel - cancel a pending work added by task_work_add()
+ * @task: the task which should execute the work
+ * @func: identifies the work to remove
+ *
+ * Find the last queued pending work with ->func == @func and remove
+ * it from queue.
+ *
+ * RETURNS:
+ * The found work or NULL if not found.
+ */
+struct callback_head *
+task_work_cancel(struct task_struct *task, task_work_func_t func)
+{
+ return task_work_cancel_match(task, task_work_func_match, func);
+}
+
/**
* task_work_run - execute the works added by task_work_add()
*