From d63d1b5edb7b832210bfde587ba9e7549fa064eb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 10 Dec 2019 10:38:56 -0700 Subject: io_uring: add support for fallocate() This exposes fallocate(2) through io_uring. Signed-off-by: Jens Axboe --- fs/io_uring.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 5953d7f13690..7a4e00ef02be 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -319,6 +319,7 @@ struct io_sync { loff_t len; loff_t off; int flags; + int mode; }; struct io_cancel { @@ -2101,6 +2102,54 @@ static int io_fsync(struct io_kiocb *req, struct io_kiocb **nxt, return 0; } +static void io_fallocate_finish(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct io_kiocb *nxt = NULL; + int ret; + + ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off, + req->sync.len); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, &nxt); + if (nxt) + io_wq_assign_next(workptr, nxt); +} + +static int io_fallocate_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + if (sqe->ioprio || sqe->buf_index || sqe->rw_flags) + return -EINVAL; + + req->sync.off = READ_ONCE(sqe->off); + req->sync.len = READ_ONCE(sqe->addr); + req->sync.mode = READ_ONCE(sqe->len); + return 0; +} + +static int io_fallocate(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + struct io_wq_work *work, *old_work; + + /* fallocate always requiring blocking context */ + if (force_nonblock) { + io_put_req(req); + req->work.func = io_fallocate_finish; + return -EAGAIN; + } + + work = old_work = &req->work; + io_fallocate_finish(&work); + if (work && work != old_work) + *nxt = container_of(work, struct io_kiocb, work); + + return 0; +} + static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; @@ -3123,6 +3172,9 @@ static int io_req_defer_prep(struct io_kiocb *req, case IORING_OP_ACCEPT: ret = io_accept_prep(req, sqe); break; + case IORING_OP_FALLOCATE: + ret = io_fallocate_prep(req, sqe); + break; default: printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", req->opcode); @@ -3277,6 +3329,14 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } ret = io_async_cancel(req, nxt); break; + case IORING_OP_FALLOCATE: + if (sqe) { + ret = io_fallocate_prep(req, sqe); + if (ret) + break; + } + ret = io_fallocate(req, nxt, force_nonblock); + break; default: ret = -EINVAL; break; -- cgit From 15b71abe7b52df214785dde0de9f581cc0216d17 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 11 Dec 2019 11:20:36 -0700 Subject: io_uring: add support for IORING_OP_OPENAT This works just like openat(2), except it can be performed async. For the normal case of a non-blocking path lookup this will complete inline. If we have to do IO to perform the open, it'll be done from async context. 
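Before the openat diff below, a quick userspace illustration of the IORING_OP_FALLOCATE opcode added above. Per io_fallocate_prep(), the file offset travels in sqe->off, the byte count in sqe->addr and the fallocate mode in sqe->len; the kernel side always punts to io-wq since vfs_fallocate() may block. This is only a sketch under assumptions: the helper name is made up, a free SQE from a mapped submission ring is assumed to be available, and the uapi header is assumed to already carry the new opcode.

#include <string.h>
#include <linux/io_uring.h>

/* Illustrative helper: describe an async fallocate(2) in an SQE. */
static void prep_fallocate(struct io_uring_sqe *sqe, int fd, int mode,
			   __u64 offset, __u64 nbytes, __u64 user_data)
{
	memset(sqe, 0, sizeof(*sqe));	/* ioprio/buf_index/rw_flags must be 0 */
	sqe->opcode = IORING_OP_FALLOCATE;
	sqe->fd = fd;
	sqe->off = offset;	/* becomes req->sync.off */
	sqe->addr = nbytes;	/* becomes req->sync.len */
	sqe->len = mode;	/* becomes req->sync.mode, e.g. FALLOC_FL_KEEP_SIZE */
	sqe->user_data = user_data;
}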
Signed-off-by: Jens Axboe --- fs/io_uring.c | 95 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 93 insertions(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 7a4e00ef02be..34cbce622fcd 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -70,6 +70,8 @@ #include #include #include +#include +#include #define CREATE_TRACE_POINTS #include @@ -353,6 +355,15 @@ struct io_sr_msg { int msg_flags; }; +struct io_open { + struct file *file; + int dfd; + umode_t mode; + const char __user *fname; + struct filename *filename; + int flags; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -371,12 +382,17 @@ struct io_async_rw { ssize_t size; }; +struct io_async_open { + struct filename *filename; +}; + struct io_async_ctx { union { struct io_async_rw rw; struct io_async_msghdr msg; struct io_async_connect connect; struct io_timeout_data timeout; + struct io_async_open open; }; }; @@ -397,6 +413,7 @@ struct io_kiocb { struct io_timeout timeout; struct io_connect connect; struct io_sr_msg sr_msg; + struct io_open open; }; struct io_async_ctx *io; @@ -2150,6 +2167,67 @@ static int io_fallocate(struct io_kiocb *req, struct io_kiocb **nxt, return 0; } +static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + int ret; + + if (sqe->ioprio || sqe->buf_index) + return -EINVAL; + + req->open.dfd = READ_ONCE(sqe->fd); + req->open.mode = READ_ONCE(sqe->len); + req->open.fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); + req->open.flags = READ_ONCE(sqe->open_flags); + + req->open.filename = getname(req->open.fname); + if (IS_ERR(req->open.filename)) { + ret = PTR_ERR(req->open.filename); + req->open.filename = NULL; + return ret; + } + + return 0; +} + +static int io_openat(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + struct open_flags op; + struct open_how how; + struct file *file; + int ret; + + if (force_nonblock) { + req->work.flags |= IO_WQ_WORK_NEEDS_FILES; + return -EAGAIN; + } + + how = build_open_how(req->open.flags, req->open.mode); + ret = build_open_flags(&how, &op); + if (ret) + goto err; + + ret = get_unused_fd_flags(how.flags); + if (ret < 0) + goto err; + + file = do_filp_open(req->open.dfd, req->open.filename, &op); + if (IS_ERR(file)) { + put_unused_fd(ret); + ret = PTR_ERR(file); + } else { + fsnotify_open(file); + fd_install(ret, file); + } +err: + putname(req->open.filename); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, nxt); + return 0; +} + static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; @@ -3175,6 +3253,9 @@ static int io_req_defer_prep(struct io_kiocb *req, case IORING_OP_FALLOCATE: ret = io_fallocate_prep(req, sqe); break; + case IORING_OP_OPENAT: + ret = io_openat_prep(req, sqe); + break; default: printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", req->opcode); @@ -3337,6 +3418,14 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } ret = io_fallocate(req, nxt, force_nonblock); break; + case IORING_OP_OPENAT: + if (sqe) { + ret = io_openat_prep(req, sqe); + if (ret) + break; + } + ret = io_openat(req, nxt, force_nonblock); + break; default: ret = -EINVAL; break; @@ -3409,7 +3498,7 @@ static bool io_req_op_valid(int op) return op >= IORING_OP_NOP && op < IORING_OP_LAST; } -static int io_req_needs_file(struct io_kiocb *req) +static int io_req_needs_file(struct io_kiocb *req, int fd) { 
switch (req->opcode) { case IORING_OP_NOP: @@ -3419,6 +3508,8 @@ static int io_req_needs_file(struct io_kiocb *req) case IORING_OP_ASYNC_CANCEL: case IORING_OP_LINK_TIMEOUT: return 0; + case IORING_OP_OPENAT: + return fd != -1; default: if (io_req_op_valid(req->opcode)) return 1; @@ -3448,7 +3539,7 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, if (flags & IOSQE_IO_DRAIN) req->flags |= REQ_F_IO_DRAIN; - ret = io_req_needs_file(req); + ret = io_req_needs_file(req, fd); if (ret <= 0) return ret; -- cgit From 0c9d5ccd26a004f59333c06fbbb98f9cb1eed93d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 11 Dec 2019 19:29:43 -0700 Subject: io-wq: add support for uncancellable work Not all work can be cancelled, some of it we may need to guarantee that it runs to completion. Allow the caller to set IO_WQ_WORK_NO_CANCEL on work that must not be cancelled. Note that the caller work function must also check for IO_WQ_WORK_NO_CANCEL on work that is marked IO_WQ_WORK_CANCEL. Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 34cbce622fcd..fe227650efd6 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3460,8 +3460,11 @@ static void io_wq_submit_work(struct io_wq_work **workptr) struct io_kiocb *nxt = NULL; int ret = 0; - if (work->flags & IO_WQ_WORK_CANCEL) + /* if NO_CANCEL is set, we must still run the work */ + if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) == + IO_WQ_WORK_CANCEL) { ret = -ECANCELED; + } if (!ret) { req->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0; -- cgit From b5dba59e0cf7e2cc4d3b3b1ac5fe81ddf21959eb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 11 Dec 2019 14:02:38 -0700 Subject: io_uring: add support for IORING_OP_CLOSE This works just like close(2), unsurprisingly. We remove the file descriptor and post the completion inline, then offload the actual (potential) last file put to async context. Mark the async part of this work as uncancellable, as we really must guarantee that the latter part of the close is run. Signed-off-by: Jens Axboe --- fs/io_uring.c | 109 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index fe227650efd6..6aaff7bfe8b5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -301,6 +301,12 @@ struct io_poll_iocb { struct wait_queue_entry wait; }; +struct io_close { + struct file *file; + struct file *put_file; + int fd; +}; + struct io_timeout_data { struct io_kiocb *req; struct hrtimer timer; @@ -414,6 +420,7 @@ struct io_kiocb { struct io_connect connect; struct io_sr_msg sr_msg; struct io_open open; + struct io_close close; }; struct io_async_ctx *io; @@ -2228,6 +2235,94 @@ err: return 0; } +static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + /* + * If we queue this for async, it must not be cancellable. That would + * leave the 'file' in an undeterminate state. 
+ */ + req->work.flags |= IO_WQ_WORK_NO_CANCEL; + + if (sqe->ioprio || sqe->off || sqe->addr || sqe->len || + sqe->rw_flags || sqe->buf_index) + return -EINVAL; + if (sqe->flags & IOSQE_FIXED_FILE) + return -EINVAL; + + req->close.fd = READ_ONCE(sqe->fd); + if (req->file->f_op == &io_uring_fops || + req->close.fd == req->ring_fd) + return -EBADF; + + return 0; +} + +static void io_close_finish(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct io_kiocb *nxt = NULL; + + /* Invoked with files, we need to do the close */ + if (req->work.files) { + int ret; + + ret = filp_close(req->close.put_file, req->work.files); + if (ret < 0) { + req_set_fail_links(req); + } + io_cqring_add_event(req, ret); + } + + fput(req->close.put_file); + + /* we bypassed the re-issue, drop the submission reference */ + io_put_req(req); + io_put_req_find_next(req, &nxt); + if (nxt) + io_wq_assign_next(workptr, nxt); +} + +static int io_close(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + int ret; + + req->close.put_file = NULL; + ret = __close_fd_get_file(req->close.fd, &req->close.put_file); + if (ret < 0) + return ret; + + /* if the file has a flush method, be safe and punt to async */ + if (req->close.put_file->f_op->flush && !io_wq_current_is_worker()) { + req->work.flags |= IO_WQ_WORK_NEEDS_FILES; + goto eagain; + } + + /* + * No ->flush(), safely close from here and just punt the + * fput() to async context. + */ + ret = filp_close(req->close.put_file, current->files); + + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + + if (io_wq_current_is_worker()) { + struct io_wq_work *old_work, *work; + + old_work = work = &req->work; + io_close_finish(&work); + if (work && work != old_work) + *nxt = container_of(work, struct io_kiocb, work); + return 0; + } + +eagain: + req->work.func = io_close_finish; + return -EAGAIN; +} + static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; @@ -3256,6 +3351,9 @@ static int io_req_defer_prep(struct io_kiocb *req, case IORING_OP_OPENAT: ret = io_openat_prep(req, sqe); break; + case IORING_OP_CLOSE: + ret = io_close_prep(req, sqe); + break; default: printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", req->opcode); @@ -3426,6 +3524,14 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } ret = io_openat(req, nxt, force_nonblock); break; + case IORING_OP_CLOSE: + if (sqe) { + ret = io_close_prep(req, sqe); + if (ret) + break; + } + ret = io_close(req, nxt, force_nonblock); + break; default: ret = -EINVAL; break; @@ -3572,6 +3678,9 @@ static int io_grab_files(struct io_kiocb *req) int ret = -EBADF; struct io_ring_ctx *ctx = req->ctx; + if (!req->ring_file) + return -EBADF; + rcu_read_lock(); spin_lock_irq(&ctx->inflight_lock); /* -- cgit From 05f3fb3c5397524feae2e73ee8e150a9090a7da2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 9 Dec 2019 11:22:50 -0700 Subject: io_uring: avoid ring quiesce for fixed file set unregister and update We currently fully quiesce the ring before an unregister or update of the fixed fileset. This is very expensive, and we can be a bit smarter about this. Add a percpu refcount for the file tables as a whole. Grab a percpu ref when we use a registered file, and put it on completion. This is cheap to do. Upon removal of a file from a set, switch the ref count to atomic mode. 
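Before going further with the fixed-file refcount description, a userspace sketch of the IORING_OP_CLOSE opcode added above. Per io_close_prep(), only sqe->fd is consumed, IOSQE_FIXED_FILE is rejected, and closing the ring fd itself returns -EBADF; if the file has a ->flush method the final put is punted to io-wq marked IO_WQ_WORK_NO_CANCEL. The helper below is illustrative only and assumes a uapi header that already defines the opcode.

#include <string.h>
#include <linux/io_uring.h>

/* Illustrative helper: async close(2) of a regular (non-fixed) fd. */
static void prep_close(struct io_uring_sqe *sqe, int fd, __u64 user_data)
{
	memset(sqe, 0, sizeof(*sqe));	/* all unused fields must be zero */
	sqe->opcode = IORING_OP_CLOSE;
	sqe->fd = fd;			/* must not be the io_uring fd itself */
	sqe->user_data = user_data;
}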
When we hit zero ref on the completion side, then we know we can drop the previously registered files. When the old files have been dropped, switch the ref back to percpu mode for normal operation. Since there's a period between doing the update and the kernel being done with it, add a IORING_OP_FILES_UPDATE opcode that can perform the same action. The application knows the update has completed when it gets the CQE for it. Between doing the update and receiving this completion, the application must continue to use the unregistered fd if submitting IO on this particular file. This takes the runtime of test/file-register from liburing from 14s to about 0.7s. Signed-off-by: Jens Axboe --- fs/io_uring.c | 485 ++++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 350 insertions(+), 135 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 6aaff7bfe8b5..4325068324b7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -179,6 +179,21 @@ struct fixed_file_table { struct file **files; }; +enum { + FFD_F_ATOMIC, +}; + +struct fixed_file_data { + struct fixed_file_table *table; + struct io_ring_ctx *ctx; + + struct percpu_ref refs; + struct llist_head put_llist; + unsigned long state; + struct work_struct ref_work; + struct completion done; +}; + struct io_ring_ctx { struct { struct percpu_ref refs; @@ -231,7 +246,7 @@ struct io_ring_ctx { * readers must ensure that ->refs is alive as long as the file* is * used. Only updated through io_uring_register(2). */ - struct fixed_file_table *file_table; + struct fixed_file_data *file_data; unsigned nr_user_files; /* if used, fixed mapped user buffers */ @@ -370,6 +385,13 @@ struct io_open { int flags; }; +struct io_files_update { + struct file *file; + u64 arg; + u32 nr_args; + u32 offset; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -421,6 +443,7 @@ struct io_kiocb { struct io_sr_msg sr_msg; struct io_open open; struct io_close close; + struct io_files_update files_update; }; struct io_async_ctx *io; @@ -496,6 +519,9 @@ static void io_double_put_req(struct io_kiocb *req); static void __io_double_put_req(struct io_kiocb *req); static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); +static int __io_sqe_files_update(struct io_ring_ctx *ctx, + struct io_uring_files_update *ip, + unsigned nr_args); static struct kmem_cache *req_cachep; @@ -945,6 +971,7 @@ static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr) if (*nr) { kmem_cache_free_bulk(req_cachep, *nr, reqs); percpu_ref_put_many(&ctx->refs, *nr); + percpu_ref_put_many(&ctx->file_data->refs, *nr); *nr = 0; } } @@ -955,8 +982,12 @@ static void __io_free_req(struct io_kiocb *req) if (req->io) kfree(req->io); - if (req->file && !(req->flags & REQ_F_FIXED_FILE)) - fput(req->file); + if (req->file) { + if (req->flags & REQ_F_FIXED_FILE) + percpu_ref_put(&ctx->file_data->refs); + else + fput(req->file); + } if (req->flags & REQ_F_INFLIGHT) { unsigned long flags; @@ -3293,6 +3324,45 @@ static int io_async_cancel(struct io_kiocb *req, struct io_kiocb **nxt) return 0; } +static int io_files_update_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + if (sqe->flags || sqe->ioprio || sqe->rw_flags) + return -EINVAL; + + req->files_update.offset = READ_ONCE(sqe->off); + req->files_update.nr_args = READ_ONCE(sqe->len); + if (!req->files_update.nr_args) + return -EINVAL; + req->files_update.arg = READ_ONCE(sqe->addr); + return 0; +} + 
+static int io_files_update(struct io_kiocb *req, bool force_nonblock) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_uring_files_update up; + int ret; + + if (force_nonblock) { + req->work.flags |= IO_WQ_WORK_NEEDS_FILES; + return -EAGAIN; + } + + up.offset = req->files_update.offset; + up.fds = req->files_update.arg; + + mutex_lock(&ctx->uring_lock); + ret = __io_sqe_files_update(ctx, &up, req->files_update.nr_args); + mutex_unlock(&ctx->uring_lock); + + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req(req); + return 0; +} + static int io_req_defer_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -3354,6 +3424,9 @@ static int io_req_defer_prep(struct io_kiocb *req, case IORING_OP_CLOSE: ret = io_close_prep(req, sqe); break; + case IORING_OP_FILES_UPDATE: + ret = io_files_update_prep(req, sqe); + break; default: printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", req->opcode); @@ -3532,6 +3605,14 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } ret = io_close(req, nxt, force_nonblock); break; + case IORING_OP_FILES_UPDATE: + if (sqe) { + ret = io_files_update_prep(req, sqe); + if (ret) + break; + } + ret = io_files_update(req, force_nonblock); + break; default: ret = -EINVAL; break; @@ -3631,8 +3712,8 @@ static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, { struct fixed_file_table *table; - table = &ctx->file_table[index >> IORING_FILE_TABLE_SHIFT]; - return table->files[index & IORING_FILE_TABLE_MASK]; + table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT]; + return table->files[index & IORING_FILE_TABLE_MASK];; } static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, @@ -3653,7 +3734,7 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, return ret; if (flags & IOSQE_FIXED_FILE) { - if (unlikely(!ctx->file_table || + if (unlikely(!ctx->file_data || (unsigned) fd >= ctx->nr_user_files)) return -EBADF; fd = array_index_nospec(fd, ctx->nr_user_files); @@ -3661,6 +3742,7 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, if (!req->file) return -EBADF; req->flags |= REQ_F_FIXED_FILE; + percpu_ref_get(&ctx->file_data->refs); } else { if (req->needs_fixed_file) return -EBADF; @@ -4338,19 +4420,37 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) #endif } +static void io_file_ref_kill(struct percpu_ref *ref) +{ + struct fixed_file_data *data; + + data = container_of(ref, struct fixed_file_data, refs); + complete(&data->done); +} + static int io_sqe_files_unregister(struct io_ring_ctx *ctx) { + struct fixed_file_data *data = ctx->file_data; unsigned nr_tables, i; - if (!ctx->file_table) + if (!data) return -ENXIO; + /* protect against inflight atomic switch, which drops the ref */ + flush_work(&data->ref_work); + percpu_ref_get(&data->refs); + percpu_ref_kill_and_confirm(&data->refs, io_file_ref_kill); + wait_for_completion(&data->done); + percpu_ref_put(&data->refs); + percpu_ref_exit(&data->refs); + __io_sqe_files_unregister(ctx); nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE); for (i = 0; i < nr_tables; i++) - kfree(ctx->file_table[i].files); - kfree(ctx->file_table); - ctx->file_table = NULL; + kfree(data->table[i].files); + kfree(data->table); + kfree(data); + ctx->file_data = NULL; ctx->nr_user_files = 0; return 0; } @@ -4381,16 +4481,6 @@ static void io_finish_async(struct io_ring_ctx *ctx) } #if defined(CONFIG_UNIX) -static void 
io_destruct_skb(struct sk_buff *skb) -{ - struct io_ring_ctx *ctx = skb->sk->sk_user_data; - - if (ctx->io_wq) - io_wq_flush(ctx->io_wq); - - unix_destruct_scm(skb); -} - /* * Ensure the UNIX gc is aware of our file set, so we are certain that * the io_uring can be safely unregistered on process exit, even if we have @@ -4438,7 +4528,7 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) fpl->max = SCM_MAX_FD; fpl->count = nr_files; UNIXCB(skb).fp = fpl; - skb->destructor = io_destruct_skb; + skb->destructor = unix_destruct_scm; refcount_add(skb->truesize, &sk->sk_wmem_alloc); skb_queue_head(&sk->sk_receive_queue, skb); @@ -4500,7 +4590,7 @@ static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables, int i; for (i = 0; i < nr_tables; i++) { - struct fixed_file_table *table = &ctx->file_table[i]; + struct fixed_file_table *table = &ctx->file_data->table[i]; unsigned this_files; this_files = min(nr_files, IORING_MAX_FILES_TABLE); @@ -4515,36 +4605,159 @@ static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables, return 0; for (i = 0; i < nr_tables; i++) { - struct fixed_file_table *table = &ctx->file_table[i]; + struct fixed_file_table *table = &ctx->file_data->table[i]; kfree(table->files); } return 1; } +static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file) +{ +#if defined(CONFIG_UNIX) + struct sock *sock = ctx->ring_sock->sk; + struct sk_buff_head list, *head = &sock->sk_receive_queue; + struct sk_buff *skb; + int i; + + __skb_queue_head_init(&list); + + /* + * Find the skb that holds this file in its SCM_RIGHTS. When found, + * remove this entry and rearrange the file array. + */ + skb = skb_dequeue(head); + while (skb) { + struct scm_fp_list *fp; + + fp = UNIXCB(skb).fp; + for (i = 0; i < fp->count; i++) { + int left; + + if (fp->fp[i] != file) + continue; + + unix_notinflight(fp->user, fp->fp[i]); + left = fp->count - 1 - i; + if (left) { + memmove(&fp->fp[i], &fp->fp[i + 1], + left * sizeof(struct file *)); + } + fp->count--; + if (!fp->count) { + kfree_skb(skb); + skb = NULL; + } else { + __skb_queue_tail(&list, skb); + } + fput(file); + file = NULL; + break; + } + + if (!file) + break; + + __skb_queue_tail(&list, skb); + + skb = skb_dequeue(head); + } + + if (skb_peek(&list)) { + spin_lock_irq(&head->lock); + while ((skb = __skb_dequeue(&list)) != NULL) + __skb_queue_tail(head, skb); + spin_unlock_irq(&head->lock); + } +#else + fput(file); +#endif +} + +struct io_file_put { + struct llist_node llist; + struct file *file; + struct completion *done; +}; + +static void io_ring_file_ref_switch(struct work_struct *work) +{ + struct io_file_put *pfile, *tmp; + struct fixed_file_data *data; + struct llist_node *node; + + data = container_of(work, struct fixed_file_data, ref_work); + + while ((node = llist_del_all(&data->put_llist)) != NULL) { + llist_for_each_entry_safe(pfile, tmp, node, llist) { + io_ring_file_put(data->ctx, pfile->file); + if (pfile->done) + complete(pfile->done); + else + kfree(pfile); + } + } + + percpu_ref_get(&data->refs); + percpu_ref_switch_to_percpu(&data->refs); +} + +static void io_file_data_ref_zero(struct percpu_ref *ref) +{ + struct fixed_file_data *data; + + data = container_of(ref, struct fixed_file_data, refs); + + /* we can't safely switch from inside this context, punt to wq */ + queue_work(system_wq, &data->ref_work); +} + static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) { __s32 __user *fds = (__s32 __user *) arg; unsigned 
nr_tables; + struct file *file; int fd, ret = 0; unsigned i; - if (ctx->file_table) + if (ctx->file_data) return -EBUSY; if (!nr_args) return -EINVAL; if (nr_args > IORING_MAX_FIXED_FILES) return -EMFILE; + ctx->file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL); + if (!ctx->file_data) + return -ENOMEM; + ctx->file_data->ctx = ctx; + init_completion(&ctx->file_data->done); + nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE); - ctx->file_table = kcalloc(nr_tables, sizeof(struct fixed_file_table), + ctx->file_data->table = kcalloc(nr_tables, + sizeof(struct fixed_file_table), GFP_KERNEL); - if (!ctx->file_table) + if (!ctx->file_data->table) { + kfree(ctx->file_data); + ctx->file_data = NULL; return -ENOMEM; + } + + if (percpu_ref_init(&ctx->file_data->refs, io_file_data_ref_zero, + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { + kfree(ctx->file_data->table); + kfree(ctx->file_data); + ctx->file_data = NULL; + return -ENOMEM; + } + ctx->file_data->put_llist.first = NULL; + INIT_WORK(&ctx->file_data->ref_work, io_ring_file_ref_switch); if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) { - kfree(ctx->file_table); - ctx->file_table = NULL; + percpu_ref_exit(&ctx->file_data->refs); + kfree(ctx->file_data->table); + kfree(ctx->file_data); + ctx->file_data = NULL; return -ENOMEM; } @@ -4561,13 +4774,14 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, continue; } - table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT]; + table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT]; index = i & IORING_FILE_TABLE_MASK; - table->files[index] = fget(fd); + file = fget(fd); ret = -EBADF; - if (!table->files[index]) + if (!file) break; + /* * Don't allow io_uring instances to be registered. If UNIX * isn't enabled, then this causes a reference cycle and this @@ -4575,26 +4789,26 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, * handle it just fine, but there's still no point in allowing * a ring fd as it doesn't support regular read/write anyway. */ - if (table->files[index]->f_op == &io_uring_fops) { - fput(table->files[index]); + if (file->f_op == &io_uring_fops) { + fput(file); break; } ret = 0; + table->files[index] = file; } if (ret) { for (i = 0; i < ctx->nr_user_files; i++) { - struct file *file; - file = io_file_from_index(ctx, i); if (file) fput(file); } for (i = 0; i < nr_tables; i++) - kfree(ctx->file_table[i].files); + kfree(ctx->file_data->table[i].files); - kfree(ctx->file_table); - ctx->file_table = NULL; + kfree(ctx->file_data->table); + kfree(ctx->file_data); + ctx->file_data = NULL; ctx->nr_user_files = 0; return ret; } @@ -4606,69 +4820,6 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return ret; } -static void io_sqe_file_unregister(struct io_ring_ctx *ctx, int index) -{ -#if defined(CONFIG_UNIX) - struct file *file = io_file_from_index(ctx, index); - struct sock *sock = ctx->ring_sock->sk; - struct sk_buff_head list, *head = &sock->sk_receive_queue; - struct sk_buff *skb; - int i; - - __skb_queue_head_init(&list); - - /* - * Find the skb that holds this file in its SCM_RIGHTS. When found, - * remove this entry and rearrange the file array. 
- */ - skb = skb_dequeue(head); - while (skb) { - struct scm_fp_list *fp; - - fp = UNIXCB(skb).fp; - for (i = 0; i < fp->count; i++) { - int left; - - if (fp->fp[i] != file) - continue; - - unix_notinflight(fp->user, fp->fp[i]); - left = fp->count - 1 - i; - if (left) { - memmove(&fp->fp[i], &fp->fp[i + 1], - left * sizeof(struct file *)); - } - fp->count--; - if (!fp->count) { - kfree_skb(skb); - skb = NULL; - } else { - __skb_queue_tail(&list, skb); - } - fput(file); - file = NULL; - break; - } - - if (!file) - break; - - __skb_queue_tail(&list, skb); - - skb = skb_dequeue(head); - } - - if (skb_peek(&list)) { - spin_lock_irq(&head->lock); - while ((skb = __skb_dequeue(&list)) != NULL) - __skb_queue_tail(head, skb); - spin_unlock_irq(&head->lock); - } -#else - fput(io_file_from_index(ctx, index)); -#endif -} - static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, int index) { @@ -4712,29 +4863,65 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, #endif } -static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, - unsigned nr_args) +static void io_atomic_switch(struct percpu_ref *ref) { - struct io_uring_files_update up; + struct fixed_file_data *data; + + data = container_of(ref, struct fixed_file_data, refs); + clear_bit(FFD_F_ATOMIC, &data->state); +} + +static bool io_queue_file_removal(struct fixed_file_data *data, + struct file *file) +{ + struct io_file_put *pfile, pfile_stack; + DECLARE_COMPLETION_ONSTACK(done); + + /* + * If we fail allocating the struct we need for doing async reomval + * of this file, just punt to sync and wait for it. + */ + pfile = kzalloc(sizeof(*pfile), GFP_KERNEL); + if (!pfile) { + pfile = &pfile_stack; + pfile->done = &done; + } + + pfile->file = file; + llist_add(&pfile->llist, &data->put_llist); + + if (pfile == &pfile_stack) { + if (!test_and_set_bit(FFD_F_ATOMIC, &data->state)) { + percpu_ref_put(&data->refs); + percpu_ref_switch_to_atomic(&data->refs, + io_atomic_switch); + } + wait_for_completion(&done); + flush_work(&data->ref_work); + return false; + } + + return true; +} + +static int __io_sqe_files_update(struct io_ring_ctx *ctx, + struct io_uring_files_update *up, + unsigned nr_args) +{ + struct fixed_file_data *data = ctx->file_data; + bool ref_switch = false; + struct file *file; __s32 __user *fds; int fd, i, err; __u32 done; - if (!ctx->file_table) - return -ENXIO; - if (!nr_args) - return -EINVAL; - if (copy_from_user(&up, arg, sizeof(up))) - return -EFAULT; - if (up.resv) - return -EINVAL; - if (check_add_overflow(up.offset, nr_args, &done)) + if (check_add_overflow(up->offset, nr_args, &done)) return -EOVERFLOW; if (done > ctx->nr_user_files) return -EINVAL; done = 0; - fds = u64_to_user_ptr(up.fds); + fds = u64_to_user_ptr(up->fds); while (nr_args) { struct fixed_file_table *table; unsigned index; @@ -4744,16 +4931,16 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, err = -EFAULT; break; } - i = array_index_nospec(up.offset, ctx->nr_user_files); - table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT]; + i = array_index_nospec(up->offset, ctx->nr_user_files); + table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT]; index = i & IORING_FILE_TABLE_MASK; if (table->files[index]) { - io_sqe_file_unregister(ctx, i); + file = io_file_from_index(ctx, index); table->files[index] = NULL; + if (io_queue_file_removal(data, file)) + ref_switch = true; } if (fd != -1) { - struct file *file; - file = fget(fd); if (!file) { err = -EBADF; @@ -4779,11 
+4966,32 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, } nr_args--; done++; - up.offset++; + up->offset++; + } + + if (ref_switch && !test_and_set_bit(FFD_F_ATOMIC, &data->state)) { + percpu_ref_put(&data->refs); + percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch); } return done ? done : err; } +static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, + unsigned nr_args) +{ + struct io_uring_files_update up; + + if (!ctx->file_data) + return -ENXIO; + if (!nr_args) + return -EINVAL; + if (copy_from_user(&up, arg, sizeof(up))) + return -EFAULT; + if (up.resv) + return -EINVAL; + + return __io_sqe_files_update(ctx, &up, nr_args); +} static void io_put_work(struct io_wq_work *work) { @@ -5546,7 +5754,6 @@ static int io_uring_get_fd(struct io_ring_ctx *ctx) #if defined(CONFIG_UNIX) ctx->ring_sock->file = file; - ctx->ring_sock->sk->sk_user_data = ctx; #endif fd_install(ret, file); return ret; @@ -5710,18 +5917,22 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, if (percpu_ref_is_dying(&ctx->refs)) return -ENXIO; - percpu_ref_kill(&ctx->refs); + if (opcode != IORING_UNREGISTER_FILES && + opcode != IORING_REGISTER_FILES_UPDATE) { + percpu_ref_kill(&ctx->refs); - /* - * Drop uring mutex before waiting for references to exit. If another - * thread is currently inside io_uring_enter() it might need to grab - * the uring_lock to make progress. If we hold it here across the drain - * wait, then we can deadlock. It's safe to drop the mutex here, since - * no new references will come in after we've killed the percpu ref. - */ - mutex_unlock(&ctx->uring_lock); - wait_for_completion(&ctx->completions[0]); - mutex_lock(&ctx->uring_lock); + /* + * Drop uring mutex before waiting for references to exit. If + * another thread is currently inside io_uring_enter() it might + * need to grab the uring_lock to make progress. If we hold it + * here across the drain wait, then we can deadlock. It's safe + * to drop the mutex here, since no new references will come in + * after we've killed the percpu ref. + */ + mutex_unlock(&ctx->uring_lock); + wait_for_completion(&ctx->completions[0]); + mutex_lock(&ctx->uring_lock); + } switch (opcode) { case IORING_REGISTER_BUFFERS: @@ -5762,9 +5973,13 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; } - /* bring the ctx back to life */ - reinit_completion(&ctx->completions[0]); - percpu_ref_reinit(&ctx->refs); + + if (opcode != IORING_UNREGISTER_FILES && + opcode != IORING_REGISTER_FILES_UPDATE) { + /* bring the ctx back to life */ + reinit_completion(&ctx->completions[0]); + percpu_ref_reinit(&ctx->refs); + } return ret; } -- cgit From eddc7ef52a6b37b7ba3d1c8a8fbb63d5d9914f8a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 13 Dec 2019 21:18:10 -0700 Subject: io_uring: add support for IORING_OP_STATX This provides support for async statx(2) through io_uring. 
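Before the statx diff, a userspace sketch of the IORING_OP_FILES_UPDATE opcode introduced in the patch above. Per io_files_update_prep(), sqe->off is the starting index in the registered file table, sqe->len the number of descriptors and sqe->addr a pointer to an array of __s32 fds, where -1 drops a slot. The helper is illustrative, not part of the patch, and assumes a uapi header that already has the opcode.

#include <string.h>
#include <linux/io_uring.h>

/*
 * Illustrative helper: swap 'nr' slots of the registered file set starting
 * at 'offset' without quiescing the ring. The CQE result mirrors the 'done'
 * count returned by __io_sqe_files_update(); only after that completion may
 * userspace assume the kernel is finished with the old descriptors.
 */
static void prep_files_update(struct io_uring_sqe *sqe, __s32 *fds,
			      unsigned nr, unsigned offset, __u64 user_data)
{
	memset(sqe, 0, sizeof(*sqe));	/* flags/ioprio/rw_flags must be 0 */
	sqe->opcode = IORING_OP_FILES_UPDATE;
	sqe->fd = -1;			/* no file involved; later op table marks this needs_mm only */
	sqe->addr = (unsigned long) fds; /* array of fds, -1 removes a slot */
	sqe->len = nr;			/* must be non-zero */
	sqe->off = offset;		/* first index in the fixed file table */
	sqe->user_data = user_data;
}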
Signed-off-by: Jens Axboe --- fs/io_uring.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 85 insertions(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 4325068324b7..115c7d413372 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -379,9 +379,13 @@ struct io_sr_msg { struct io_open { struct file *file; int dfd; - umode_t mode; + union { + umode_t mode; + unsigned mask; + }; const char __user *fname; struct filename *filename; + struct statx __user *buffer; int flags; }; @@ -2266,6 +2270,74 @@ err: return 0; } +static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + unsigned lookup_flags; + int ret; + + if (sqe->ioprio || sqe->buf_index) + return -EINVAL; + + req->open.dfd = READ_ONCE(sqe->fd); + req->open.mask = READ_ONCE(sqe->len); + req->open.fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); + req->open.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + req->open.flags = READ_ONCE(sqe->statx_flags); + + if (vfs_stat_set_lookup_flags(&lookup_flags, req->open.flags)) + return -EINVAL; + + req->open.filename = getname_flags(req->open.fname, lookup_flags, NULL); + if (IS_ERR(req->open.filename)) { + ret = PTR_ERR(req->open.filename); + req->open.filename = NULL; + return ret; + } + + return 0; +} + +static int io_statx(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + struct io_open *ctx = &req->open; + unsigned lookup_flags; + struct path path; + struct kstat stat; + int ret; + + if (force_nonblock) + return -EAGAIN; + + if (vfs_stat_set_lookup_flags(&lookup_flags, ctx->flags)) + return -EINVAL; + +retry: + /* filename_lookup() drops it, keep a reference */ + ctx->filename->refcnt++; + + ret = filename_lookup(ctx->dfd, ctx->filename, lookup_flags, &path, + NULL); + if (ret) + goto err; + + ret = vfs_getattr(&path, &stat, ctx->mask, ctx->flags); + path_put(&path); + if (retry_estale(ret, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } + if (!ret) + ret = cp_statx(&stat, ctx->buffer); +err: + putname(ctx->filename); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, nxt); + return 0; +} + static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { /* @@ -3427,6 +3499,9 @@ static int io_req_defer_prep(struct io_kiocb *req, case IORING_OP_FILES_UPDATE: ret = io_files_update_prep(req, sqe); break; + case IORING_OP_STATX: + ret = io_statx_prep(req, sqe); + break; default: printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", req->opcode); @@ -3613,6 +3688,14 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } ret = io_files_update(req, force_nonblock); break; + case IORING_OP_STATX: + if (sqe) { + ret = io_statx_prep(req, sqe); + if (ret) + break; + } + ret = io_statx(req, nxt, force_nonblock); + break; default: ret = -EINVAL; break; @@ -3699,6 +3782,7 @@ static int io_req_needs_file(struct io_kiocb *req, int fd) case IORING_OP_LINK_TIMEOUT: return 0; case IORING_OP_OPENAT: + case IORING_OP_STATX: return fd != -1; default: if (io_req_op_valid(req->opcode)) -- cgit From ce35a47a3a0208a77b4d31b7f2e8ed57d624093d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 17 Dec 2019 08:04:44 -0700 Subject: io_uring: add IOSQE_ASYNC io_uring defaults to always doing inline submissions, if at all possible. But for larger copies, even if the data is fully cached, that can take a long time. 
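Before continuing with the IOSQE_ASYNC change, a userspace sketch for the IORING_OP_STATX opcode just added above. Per io_statx_prep(), sqe->fd carries the directory fd, sqe->addr the pathname, sqe->addr2 the struct statx result buffer, sqe->len the STATX_* mask and sqe->statx_flags the AT_* lookup flags. As with the earlier sketches this is illustrative only, assuming a uapi header that carries the opcode and the statx_flags union member.

#include <string.h>
#include <linux/stat.h>
#include <linux/io_uring.h>

/* Illustrative helper: queue an asynchronous statx(2) equivalent. */
static void prep_statx(struct io_uring_sqe *sqe, int dfd, const char *path,
		       int flags, unsigned mask, struct statx *stxbuf,
		       __u64 user_data)
{
	memset(sqe, 0, sizeof(*sqe));		/* ioprio/buf_index must be 0 */
	sqe->opcode = IORING_OP_STATX;
	sqe->fd = dfd;				/* e.g. AT_FDCWD */
	sqe->addr = (unsigned long) path;	/* resolved via getname_flags() */
	sqe->addr2 = (unsigned long) stxbuf;	/* filled by cp_statx() on success */
	sqe->len = mask;			/* STATX_* field mask */
	sqe->statx_flags = flags;		/* AT_* flags, e.g. AT_SYMLINK_NOFOLLOW */
	sqe->user_data = user_data;
}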
Add an IOSQE_ASYNC flag that the application can set on the SQE - if set, it'll ensure that we always go async for those kinds of requests. Use the io-wq IO_WQ_WORK_CONCURRENT flag to ensure we get the concurrency we desire for this case. Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 115c7d413372..9ffcfdc6382b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -483,6 +483,7 @@ struct io_kiocb { #define REQ_F_INFLIGHT 16384 /* on inflight list */ #define REQ_F_COMP_LOCKED 32768 /* completion under lock */ #define REQ_F_HARDLINK 65536 /* doesn't sever on completion < 0 */ +#define REQ_F_FORCE_ASYNC 131072 /* IOSQE_ASYNC */ u64 user_data; u32 result; u32 sequence; @@ -4017,8 +4018,17 @@ static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) req_set_fail_links(req); io_double_put_req(req); } - } else + } else if ((req->flags & REQ_F_FORCE_ASYNC) && + !io_wq_current_is_worker()) { + /* + * Never try inline submit of IOSQE_ASYNC is set, go straight + * to async execution. + */ + req->work.flags |= IO_WQ_WORK_CONCURRENT; + io_queue_async_work(req); + } else { __io_queue_sqe(req, sqe); + } } static inline void io_queue_link_head(struct io_kiocb *req) @@ -4031,7 +4041,7 @@ static inline void io_queue_link_head(struct io_kiocb *req) } #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \ - IOSQE_IO_HARDLINK) + IOSQE_IO_HARDLINK | IOSQE_ASYNC) static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, struct io_submit_state *state, struct io_kiocb **link) @@ -4044,6 +4054,8 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, ret = -EINVAL; goto err_req; } + if (sqe->flags & IOSQE_ASYNC) + req->flags |= REQ_F_FORCE_ASYNC; ret = io_req_set_file(state, req, sqe); if (unlikely(ret)) { -- cgit From 9d76377f7e13c19441fdd066033345289f89b5fe Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 17 Dec 2019 02:22:07 +0300 Subject: io_uring: rename prev to head Calling "prev" a head of a link is a bit misleading. Rename it Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 9ffcfdc6382b..a463402e9067 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4073,10 +4073,10 @@ err_req: * conditions are true (normal request), then just queue it. 
*/ if (*link) { - struct io_kiocb *prev = *link; + struct io_kiocb *head = *link; if (sqe->flags & IOSQE_IO_DRAIN) - (*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN; + head->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN; if (sqe->flags & IOSQE_IO_HARDLINK) req->flags |= REQ_F_HARDLINK; @@ -4089,11 +4089,11 @@ err_req: ret = io_req_defer_prep(req, sqe); if (ret) { /* fail even hard links since we don't submit */ - prev->flags |= REQ_F_FAIL_LINK; + head->flags |= REQ_F_FAIL_LINK; goto err_req; } - trace_io_uring_link(ctx, req, prev); - list_add_tail(&req->link_list, &prev->link_list); + trace_io_uring_link(ctx, req, head); + list_add_tail(&req->link_list, &head->link_list); } else if (sqe->flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) { req->flags |= REQ_F_LINK; if (sqe->flags & IOSQE_IO_HARDLINK) -- cgit From 32fe525b6d10fec956cfe68f0db76839cd7f0ea5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 17 Dec 2019 22:26:58 +0300 Subject: io_uring: move *queue_link_head() from common path Move io_queue_link_head() to links handling code in io_submit_sqe(), so it wouldn't need extra checks and would have better data locality. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index a463402e9067..165f77cfc6cb 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4047,14 +4047,17 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, struct io_submit_state *state, struct io_kiocb **link) { struct io_ring_ctx *ctx = req->ctx; + unsigned int sqe_flags; int ret; + sqe_flags = READ_ONCE(sqe->flags); + /* enforce forwards compatibility on users */ - if (unlikely(sqe->flags & ~SQE_VALID_FLAGS)) { + if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) { ret = -EINVAL; goto err_req; } - if (sqe->flags & IOSQE_ASYNC) + if (sqe_flags & IOSQE_ASYNC) req->flags |= REQ_F_FORCE_ASYNC; ret = io_req_set_file(state, req, sqe); @@ -4075,10 +4078,10 @@ err_req: if (*link) { struct io_kiocb *head = *link; - if (sqe->flags & IOSQE_IO_DRAIN) + if (sqe_flags & IOSQE_IO_DRAIN) head->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN; - if (sqe->flags & IOSQE_IO_HARDLINK) + if (sqe_flags & IOSQE_IO_HARDLINK) req->flags |= REQ_F_HARDLINK; if (io_alloc_async_ctx(req)) { @@ -4094,9 +4097,15 @@ err_req: } trace_io_uring_link(ctx, req, head); list_add_tail(&req->link_list, &head->link_list); - } else if (sqe->flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) { + + /* last request of a link, enqueue the link */ + if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK))) { + io_queue_link_head(head); + *link = NULL; + } + } else if (sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) { req->flags |= REQ_F_LINK; - if (sqe->flags & IOSQE_IO_HARDLINK) + if (sqe_flags & IOSQE_IO_HARDLINK) req->flags |= REQ_F_HARDLINK; INIT_LIST_HEAD(&req->link_list); @@ -4221,7 +4230,6 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, for (i = 0; i < nr; i++) { const struct io_uring_sqe *sqe; struct io_kiocb *req; - unsigned int sqe_flags; req = io_get_req(ctx, statep); if (unlikely(!req)) { @@ -4243,8 +4251,6 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, } submitted++; - sqe_flags = sqe->flags; - req->ring_file = ring_file; req->ring_fd = ring_fd; req->has_user = *mm != NULL; @@ -4253,14 +4259,6 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, trace_io_uring_submit_sqe(ctx, req->user_data, true, async); if 
(!io_submit_sqe(req, sqe, statep, &link)) break; - /* - * If previous wasn't linked and we have a linked command, - * that's the end of the chain. Submit the previous link. - */ - if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) && link) { - io_queue_link_head(link); - link = NULL; - } } if (link) -- cgit From add7b6b85a4dfa89283834d181e87ea2144b9028 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 18 Dec 2019 08:54:21 -0700 Subject: io_uring: remove two unnecessary function declarations __io_free_req() and io_double_put_req() aren't used before they are defined, so we can kill these two forwards. Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 165f77cfc6cb..b086a2dc47e2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -518,9 +518,7 @@ struct io_submit_state { static void io_wq_submit_work(struct io_wq_work **workptr); static void io_cqring_fill_event(struct io_kiocb *req, long res); -static void __io_free_req(struct io_kiocb *req); static void io_put_req(struct io_kiocb *req); -static void io_double_put_req(struct io_kiocb *req); static void __io_double_put_req(struct io_kiocb *req); static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); -- cgit From d3656344fea0339fb0365c8df4d2beba4e0089cd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 18 Dec 2019 09:50:26 -0700 Subject: io_uring: add lookup table for various opcode needs We currently have various switch statements that check if an opcode needs a file, mm, etc. These are hard to keep in sync as opcodes are added. Add a struct io_op_def that holds all of this information, so we have just one spot to update when opcodes are added. This also enables us to NOT allocate req->io if a deferred command doesn't need it, and corrects some mistakes we had in terms of what commands need mm context. 
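To make the new table's conventions concrete before the diff below, here is a stand-alone, purely illustrative copy of the idea: each opcode gets one entry of single-bit flags describing what it needs, and a hypothetical socket-style opcode would be declared roughly as follows. None of this is part of the patch; the real struct io_op_def and io_op_defs[] live in fs/io_uring.c.

/* Stand-alone copy of the descriptor, for illustration only. */
struct io_op_def {
	unsigned async_ctx : 1;		/* needs req->io allocated for deferral/async */
	unsigned needs_mm : 1;		/* needs current->mm setup, does mm access */
	unsigned needs_file : 1;	/* needs req->file assigned */
	unsigned fd_non_neg : 1;	/* needs req->file only if fd >= 0 */
	unsigned hash_reg_file : 1;	/* hash wq insertion if file is a regular file */
	unsigned unbound_nonreg_file : 1; /* unbound wq insertion for non-regular files */
};

/*
 * Hypothetical opcode that punts via req->io, touches user memory and may
 * operate on sockets or pipes, so it goes to the unbound workqueue.
 */
static const struct io_op_def example_op_def = {
	.async_ctx		= 1,
	.needs_mm		= 1,
	.needs_file		= 1,
	.unbound_nonreg_file	= 1,
};

The BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST) added to io_uring_init() in the diff then keeps the table in lockstep with the opcode enum.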
Signed-off-by: Jens Axboe --- fs/io_uring.c | 208 +++++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 155 insertions(+), 53 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index b086a2dc47e2..244aaccbc82b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -516,6 +516,135 @@ struct io_submit_state { unsigned int ios_left; }; +struct io_op_def { + /* needs req->io allocated for deferral/async */ + unsigned async_ctx : 1; + /* needs current->mm setup, does mm access */ + unsigned needs_mm : 1; + /* needs req->file assigned */ + unsigned needs_file : 1; + /* needs req->file assigned IFF fd is >= 0 */ + unsigned fd_non_neg : 1; + /* hash wq insertion if file is a regular file */ + unsigned hash_reg_file : 1; + /* unbound wq insertion if file is a non-regular file */ + unsigned unbound_nonreg_file : 1; +}; + +static const struct io_op_def io_op_defs[] = { + { + /* IORING_OP_NOP */ + }, + { + /* IORING_OP_READV */ + .async_ctx = 1, + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + { + /* IORING_OP_WRITEV */ + .async_ctx = 1, + .needs_mm = 1, + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + }, + { + /* IORING_OP_FSYNC */ + .needs_file = 1, + }, + { + /* IORING_OP_READ_FIXED */ + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + { + /* IORING_OP_WRITE_FIXED */ + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + }, + { + /* IORING_OP_POLL_ADD */ + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + { + /* IORING_OP_POLL_REMOVE */ + }, + { + /* IORING_OP_SYNC_FILE_RANGE */ + .needs_file = 1, + }, + { + /* IORING_OP_SENDMSG */ + .async_ctx = 1, + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + { + /* IORING_OP_RECVMSG */ + .async_ctx = 1, + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + { + /* IORING_OP_TIMEOUT */ + .async_ctx = 1, + .needs_mm = 1, + }, + { + /* IORING_OP_TIMEOUT_REMOVE */ + }, + { + /* IORING_OP_ACCEPT */ + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + { + /* IORING_OP_ASYNC_CANCEL */ + }, + { + /* IORING_OP_LINK_TIMEOUT */ + .async_ctx = 1, + .needs_mm = 1, + }, + { + /* IORING_OP_CONNECT */ + .async_ctx = 1, + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + { + /* IORING_OP_FALLOCATE */ + .needs_file = 1, + }, + { + /* IORING_OP_OPENAT */ + .needs_file = 1, + .fd_non_neg = 1, + }, + { + /* IORING_OP_CLOSE */ + .needs_file = 1, + }, + { + /* IORING_OP_FILES_UPDATE */ + .needs_mm = 1, + }, + { + /* IORING_OP_STATX */ + .needs_mm = 1, + .needs_file = 1, + .fd_non_neg = 1, + }, +}; + static void io_wq_submit_work(struct io_wq_work **workptr); static void io_cqring_fill_event(struct io_kiocb *req, long res); static void io_put_req(struct io_kiocb *req); @@ -671,41 +800,20 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) } } -static inline bool io_req_needs_user(struct io_kiocb *req) -{ - return !(req->opcode == IORING_OP_READ_FIXED || - req->opcode == IORING_OP_WRITE_FIXED); -} - static inline bool io_prep_async_work(struct io_kiocb *req, struct io_kiocb **link) { + const struct io_op_def *def = &io_op_defs[req->opcode]; bool do_hashed = false; - switch (req->opcode) { - case IORING_OP_WRITEV: - case IORING_OP_WRITE_FIXED: - /* only regular files should be hashed for writes */ - if (req->flags & REQ_F_ISREG) + if (req->flags & REQ_F_ISREG) { + if (def->hash_reg_file) do_hashed = true; - /* fall-through */ - case IORING_OP_READV: - case IORING_OP_READ_FIXED: - case 
IORING_OP_SENDMSG: - case IORING_OP_RECVMSG: - case IORING_OP_ACCEPT: - case IORING_OP_POLL_ADD: - case IORING_OP_CONNECT: - /* - * We know REQ_F_ISREG is not set on some of these - * opcodes, but this enables us to keep the check in - * just one place. - */ - if (!(req->flags & REQ_F_ISREG)) + } else { + if (def->unbound_nonreg_file) req->work.flags |= IO_WQ_WORK_UNBOUND; - break; } - if (io_req_needs_user(req)) + if (def->needs_mm) req->work.flags |= IO_WQ_WORK_NEEDS_USER; *link = io_prep_linked_timeout(req); @@ -1826,6 +1934,8 @@ static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size, static int io_alloc_async_ctx(struct io_kiocb *req) { + if (!io_op_defs[req->opcode].async_ctx) + return 0; req->io = kmalloc(sizeof(*req->io), GFP_KERNEL); return req->io == NULL; } @@ -3765,29 +3875,13 @@ static void io_wq_submit_work(struct io_wq_work **workptr) io_wq_assign_next(workptr, nxt); } -static bool io_req_op_valid(int op) -{ - return op >= IORING_OP_NOP && op < IORING_OP_LAST; -} - static int io_req_needs_file(struct io_kiocb *req, int fd) { - switch (req->opcode) { - case IORING_OP_NOP: - case IORING_OP_POLL_REMOVE: - case IORING_OP_TIMEOUT: - case IORING_OP_TIMEOUT_REMOVE: - case IORING_OP_ASYNC_CANCEL: - case IORING_OP_LINK_TIMEOUT: + if (!io_op_defs[req->opcode].needs_file) return 0; - case IORING_OP_OPENAT: - case IORING_OP_STATX: - return fd != -1; - default: - if (io_req_op_valid(req->opcode)) - return 1; - return -EINVAL; - } + if (fd == -1 && io_op_defs[req->opcode].fd_non_neg) + return 0; + return 1; } static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, @@ -3804,7 +3898,7 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, { struct io_ring_ctx *ctx = req->ctx; unsigned flags; - int fd, ret; + int fd; flags = READ_ONCE(sqe->flags); fd = READ_ONCE(sqe->fd); @@ -3812,9 +3906,8 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, if (flags & IOSQE_IO_DRAIN) req->flags |= REQ_F_IO_DRAIN; - ret = io_req_needs_file(req, fd); - if (ret <= 0) - return ret; + if (!io_req_needs_file(req, fd)) + return 0; if (flags & IOSQE_FIXED_FILE) { if (unlikely(!ctx->file_data || @@ -4240,7 +4333,16 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, break; } - if (io_req_needs_user(req) && !*mm) { + /* will complete beyond this point, count as submitted */ + submitted++; + + if (unlikely(req->opcode >= IORING_OP_LAST)) { + io_cqring_add_event(req, -EINVAL); + io_double_put_req(req); + break; + } + + if (io_op_defs[req->opcode].needs_mm && !*mm) { mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm); if (!mm_fault) { use_mm(ctx->sqo_mm); @@ -4248,7 +4350,6 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, } } - submitted++; req->ring_file = ring_file; req->ring_fd = ring_fd; req->has_user = *mm != NULL; @@ -6104,6 +6205,7 @@ out_fput: static int __init io_uring_init(void) { + BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC); return 0; }; -- cgit From ad3eb2c89fb24d14ac81f43eff8e85fece2c934d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 18 Dec 2019 17:12:20 -0700 Subject: io_uring: split overflow state into SQ and CQ side We currently check ->cq_overflow_list from both SQ and CQ context, which causes some bouncing of that cache line. Add separate bits of state for this instead, so that the SQ side can check using its own state, and likewise for the CQ side. 
This adds ->sq_check_overflow with the SQ state, and ->cq_check_overflow with the CQ state. If we hit an overflow condition, both of these bits are set. Likewise for overflow flush clear, we clear both bits. For the fast path of just checking if there's an overflow condition on either the SQ or CQ side, we can use our own private bit for this. Signed-off-by: Jens Axboe --- fs/io_uring.c | 40 +++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 244aaccbc82b..dfa99da8b2d1 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -224,13 +224,14 @@ struct io_ring_ctx { unsigned sq_thread_idle; unsigned cached_sq_dropped; atomic_t cached_cq_overflow; - struct io_uring_sqe *sq_sqes; + unsigned long sq_check_overflow; struct list_head defer_list; struct list_head timeout_list; struct list_head cq_overflow_list; wait_queue_head_t inflight_wait; + struct io_uring_sqe *sq_sqes; } ____cacheline_aligned_in_smp; struct io_rings *rings; @@ -272,6 +273,7 @@ struct io_ring_ctx { unsigned cq_entries; unsigned cq_mask; atomic_t cq_timeouts; + unsigned long cq_check_overflow; struct wait_queue_head cq_wait; struct fasync_struct *cq_fasync; struct eventfd_ctx *cq_ev_fd; @@ -950,6 +952,10 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) } io_commit_cqring(ctx); + if (cqe) { + clear_bit(0, &ctx->sq_check_overflow); + clear_bit(0, &ctx->cq_check_overflow); + } spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); @@ -983,6 +989,10 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res) WRITE_ONCE(ctx->rings->cq_overflow, atomic_inc_return(&ctx->cached_cq_overflow)); } else { + if (list_empty(&ctx->cq_overflow_list)) { + set_bit(0, &ctx->sq_check_overflow); + set_bit(0, &ctx->cq_check_overflow); + } refcount_inc(&req->refs); req->result = res; list_add_tail(&req->list, &ctx->cq_overflow_list); @@ -1285,19 +1295,21 @@ static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush) { struct io_rings *rings = ctx->rings; - /* - * noflush == true is from the waitqueue handler, just ensure we wake - * up the task, and the next invocation will flush the entries. We - * cannot safely to it from here. - */ - if (noflush && !list_empty(&ctx->cq_overflow_list)) - return -1U; + if (test_bit(0, &ctx->cq_check_overflow)) { + /* + * noflush == true is from the waitqueue handler, just ensure + * we wake up the task, and the next invocation will flush the + * entries. We cannot safely to it from here. 
+ */ + if (noflush && !list_empty(&ctx->cq_overflow_list)) + return -1U; - io_cqring_overflow_flush(ctx, false); + io_cqring_overflow_flush(ctx, false); + } /* See comment at the top of this file */ smp_rmb(); - return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head); + return ctx->cached_cq_tail - READ_ONCE(rings->cq.head); } static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) @@ -4309,9 +4321,11 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, bool mm_fault = false; /* if we have a backlog and couldn't flush it all, return BUSY */ - if (!list_empty(&ctx->cq_overflow_list) && - !io_cqring_overflow_flush(ctx, false)) - return -EBUSY; + if (test_bit(0, &ctx->sq_check_overflow)) { + if (!list_empty(&ctx->cq_overflow_list) && + !io_cqring_overflow_flush(ctx, false)) + return -EBUSY; + } if (nr > IO_PLUG_THRESHOLD) { io_submit_state_start(&state, nr); -- cgit From e94f141bd248ebdadcb7351f1e70b31cee5add53 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 19 Dec 2019 12:06:02 -0700 Subject: io_uring: improve poll completion performance For busy IORING_OP_POLL_ADD workloads, we can have enough contention on the completion lock that we fail the inline completion path quite often as we fail the trylock on that lock. Add a list for deferred completions that we can use in that case. This helps reduce the number of async offloads we have to do, as if we get multiple completions in a row, we'll piggy back on to the poll_llist instead of having to queue our own offload. Signed-off-by: Jens Axboe --- fs/io_uring.c | 108 +++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 88 insertions(+), 20 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index dfa99da8b2d1..c54a8bd37b54 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -286,7 +286,8 @@ struct io_ring_ctx { struct { spinlock_t completion_lock; - bool poll_multi_file; + struct llist_head poll_llist; + /* * ->poll_list is protected by the ctx->uring_lock for * io_uring instances that don't use IORING_SETUP_SQPOLL. @@ -296,6 +297,7 @@ struct io_ring_ctx { struct list_head poll_list; struct hlist_head *cancel_hash; unsigned cancel_hash_bits; + bool poll_multi_file; spinlock_t inflight_lock; struct list_head inflight_list; @@ -453,7 +455,14 @@ struct io_kiocb { }; struct io_async_ctx *io; - struct file *ring_file; + union { + /* + * ring_file is only used in the submission path, and + * llist_node is only used for poll deferred completions + */ + struct file *ring_file; + struct llist_node llist_node; + }; int ring_fd; bool has_user; bool in_async; @@ -725,6 +734,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->wait); spin_lock_init(&ctx->completion_lock); + init_llist_head(&ctx->poll_llist); INIT_LIST_HEAD(&ctx->poll_list); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); @@ -1320,6 +1330,20 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; } +static inline bool io_req_multi_free(struct io_kiocb *req) +{ + /* + * If we're not using fixed files, we have to pair the completion part + * with the file put. Use regular completions for those, only batch + * free for fixed file and non-linked commands. 
+ */ + if (((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) == REQ_F_FIXED_FILE) + && !io_is_fallback_req(req) && !req->io) + return true; + + return false; +} + /* * Find and free completed poll iocbs */ @@ -1339,14 +1363,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, (*nr_events)++; if (refcount_dec_and_test(&req->refs)) { - /* If we're not using fixed files, we have to pair the - * completion part with the file put. Use regular - * completions for those, only batch free for fixed - * file and non-linked commands. - */ - if (((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) == - REQ_F_FIXED_FILE) && !io_is_fallback_req(req) && - !req->io) { + if (io_req_multi_free(req)) { reqs[to_free++] = req; if (to_free == ARRAY_SIZE(reqs)) io_free_req_many(ctx, reqs, &to_free); @@ -3081,6 +3098,44 @@ static void io_poll_complete_work(struct io_wq_work **workptr) io_wq_assign_next(workptr, nxt); } +static void __io_poll_flush(struct io_ring_ctx *ctx, struct llist_node *nodes) +{ + void *reqs[IO_IOPOLL_BATCH]; + struct io_kiocb *req, *tmp; + int to_free = 0; + + spin_lock_irq(&ctx->completion_lock); + llist_for_each_entry_safe(req, tmp, nodes, llist_node) { + hash_del(&req->hash_node); + io_poll_complete(req, req->result, 0); + + if (refcount_dec_and_test(&req->refs)) { + if (io_req_multi_free(req)) { + reqs[to_free++] = req; + if (to_free == ARRAY_SIZE(reqs)) + io_free_req_many(ctx, reqs, &to_free); + } else { + req->flags |= REQ_F_COMP_LOCKED; + io_free_req(req); + } + } + } + spin_unlock_irq(&ctx->completion_lock); + + io_cqring_ev_posted(ctx); + io_free_req_many(ctx, reqs, &to_free); +} + +static void io_poll_flush(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct llist_node *nodes; + + nodes = llist_del_all(&req->ctx->poll_llist); + if (nodes) + __io_poll_flush(req->ctx, nodes); +} + static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { @@ -3088,7 +3143,6 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, struct io_kiocb *req = container_of(poll, struct io_kiocb, poll); struct io_ring_ctx *ctx = req->ctx; __poll_t mask = key_to_poll(key); - unsigned long flags; /* for instances that support it check for an event match first: */ if (mask && !(mask & poll->events)) @@ -3102,17 +3156,31 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, * If we have a link timeout we're going to need the completion_lock * for finalizing the request, mark us as having grabbed that already. 
*/ - if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) { - hash_del(&req->hash_node); - io_poll_complete(req, mask, 0); - req->flags |= REQ_F_COMP_LOCKED; - io_put_req(req); - spin_unlock_irqrestore(&ctx->completion_lock, flags); + if (mask) { + unsigned long flags; - io_cqring_ev_posted(ctx); - } else { - io_queue_async_work(req); + if (llist_empty(&ctx->poll_llist) && + spin_trylock_irqsave(&ctx->completion_lock, flags)) { + hash_del(&req->hash_node); + io_poll_complete(req, mask, 0); + req->flags |= REQ_F_COMP_LOCKED; + io_put_req(req); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + + io_cqring_ev_posted(ctx); + req = NULL; + } else { + req->result = mask; + req->llist_node.next = NULL; + /* if the list wasn't empty, we're done */ + if (!llist_add(&req->llist_node, &ctx->poll_llist)) + req = NULL; + else + req->work.func = io_poll_flush; + } } + if (req) + io_queue_async_work(req); return 1; } -- cgit From 3a6820f2bb8a079975109c25a5d1f29f46bce5d2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 22 Dec 2019 15:19:35 -0700 Subject: io_uring: add non-vectored read/write commands For uses cases that don't already naturally have an iovec, it's easier (or more convenient) to just use a buffer address + length. This is particular true if the use case is from languages that want to create a memory safe abstraction on top of io_uring, and where introducing the need for the iovec may impose an ownership issue. For those cases, they currently need an indirection buffer, which means allocating data just for this purpose. Add basic read/write that don't require the iovec. Signed-off-by: Jens Axboe --- fs/io_uring.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index c54a8bd37b54..407ba3388e14 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -654,6 +654,18 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .fd_non_neg = 1, }, + { + /* IORING_OP_READ */ + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + { + /* IORING_OP_WRITE */ + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, }; static void io_wq_submit_work(struct io_wq_work **workptr); @@ -1867,6 +1879,13 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, if (req->rw.kiocb.private) return -EINVAL; + if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { + ssize_t ret; + ret = import_single_range(rw, buf, sqe_len, *iovec, iter); + *iovec = NULL; + return ret; + } + if (req->io) { struct io_async_rw *iorw = &req->io->rw; @@ -3634,10 +3653,12 @@ static int io_req_defer_prep(struct io_kiocb *req, break; case IORING_OP_READV: case IORING_OP_READ_FIXED: + case IORING_OP_READ: ret = io_read_prep(req, sqe, true); break; case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: + case IORING_OP_WRITE: ret = io_write_prep(req, sqe, true); break; case IORING_OP_POLL_ADD: @@ -3741,6 +3762,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, break; case IORING_OP_READV: case IORING_OP_READ_FIXED: + case IORING_OP_READ: if (sqe) { ret = io_read_prep(req, sqe, force_nonblock); if (ret < 0) @@ -3750,6 +3772,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, break; case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: + case IORING_OP_WRITE: if (sqe) { ret = io_write_prep(req, sqe, force_nonblock); if (ret < 0) -- cgit From ba04291eb66ed895f194ae5abd3748d72bf8aaea Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 Dec 
2019 16:33:42 -0700 Subject: io_uring: allow use of offset == -1 to mean file position MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This behaves like preadv2/pwritev2 with offset == -1, it'll use (and update) the current file position. This obviously comes with the caveat that if the application has multiple read/writes in flight, then the end result will not be as expected. This is similar to threads sharing a file descriptor and doing IO using the current file position. Since this feature isn't easily detectable by doing a read or write, add a feature flags, IORING_FEAT_RW_CUR_POS, to allow applications to detect presence of this feature. Reported-by: 李通洲 Signed-off-by: Jens Axboe --- fs/io_uring.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 407ba3388e14..5286620e4e46 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -495,6 +495,7 @@ struct io_kiocb { #define REQ_F_COMP_LOCKED 32768 /* completion under lock */ #define REQ_F_HARDLINK 65536 /* doesn't sever on completion < 0 */ #define REQ_F_FORCE_ASYNC 131072 /* IOSQE_ASYNC */ +#define REQ_F_CUR_POS 262144 /* read/write uses file position */ u64 user_data; u32 result; u32 sequence; @@ -1711,6 +1712,10 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, req->flags |= REQ_F_ISREG; kiocb->ki_pos = READ_ONCE(sqe->off); + if (kiocb->ki_pos == -1 && !(req->file->f_mode & FMODE_STREAM)) { + req->flags |= REQ_F_CUR_POS; + kiocb->ki_pos = req->file->f_pos; + } kiocb->ki_flags = iocb_flags(kiocb->ki_filp); kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp)); @@ -1782,6 +1787,10 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt, bool in_async) { + struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); + + if (req->flags & REQ_F_CUR_POS) + req->file->f_pos = kiocb->ki_pos; if (in_async && ret >= 0 && kiocb->ki_complete == io_complete_rw) *nxt = __io_complete_rw(kiocb, ret); else @@ -6154,7 +6163,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) goto err; p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | - IORING_FEAT_SUBMIT_STABLE; + IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS; trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); return ret; err: -- cgit From 4840e418c2fc533d55ff6caa5b9313eed1d26cfd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 Dec 2019 22:03:45 -0700 Subject: io_uring: add IORING_OP_FADVISE This adds support for doing fadvise through io_uring. We assume that WILLNEED doesn't block, but that DONTNEED may block. 
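As a rough illustration of the userspace side, here is a minimal sketch of filling a raw SQE for this opcode, assuming a uapi io_uring.h new enough to define IORING_OP_FADVISE and the fadvise_advice field; the field mapping mirrors what io_fadvise_prep() reads in the hunk below, and the user_data value is an arbitrary placeholder.

#include <linux/io_uring.h>
#include <string.h>

/*
 * Fill an already-acquired SQE slot with an IORING_OP_FADVISE request.
 * Only fd, off, len and fadvise_advice are consumed by the prep code;
 * ioprio, buf_index and addr must stay zero or the request is -EINVAL.
 */
static void prep_fadvise(struct io_uring_sqe *sqe, int fd,
                         __u64 offset, __u32 len, int advice)
{
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_FADVISE;
        sqe->fd = fd;
        sqe->off = offset;
        sqe->len = len;
        sqe->fadvise_advice = advice;
        sqe->user_data = 1;     /* echoed back in the CQE */
}

The CQE res field then carries the vfs_fadvise() return value; as noted above, POSIX_FADV_DONTNEED may be punted to async context while the other advice values are expected to complete inline.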
Reviewed-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 5286620e4e46..9ca12b900b42 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -72,6 +72,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -400,6 +401,13 @@ struct io_files_update { u32 offset; }; +struct io_fadvise { + struct file *file; + u64 offset; + u32 len; + u32 advice; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -452,6 +460,7 @@ struct io_kiocb { struct io_open open; struct io_close close; struct io_files_update files_update; + struct io_fadvise fadvise; }; struct io_async_ctx *io; @@ -667,6 +676,10 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, }, + { + /* IORING_OP_FADVISE */ + .needs_file = 1, + }, }; static void io_wq_submit_work(struct io_wq_work **workptr); @@ -2436,6 +2449,35 @@ err: return 0; } +static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + if (sqe->ioprio || sqe->buf_index || sqe->addr) + return -EINVAL; + + req->fadvise.offset = READ_ONCE(sqe->off); + req->fadvise.len = READ_ONCE(sqe->len); + req->fadvise.advice = READ_ONCE(sqe->fadvise_advice); + return 0; +} + +static int io_fadvise(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + struct io_fadvise *fa = &req->fadvise; + int ret; + + /* DONTNEED may block, others _should_ not */ + if (fa->advice == POSIX_FADV_DONTNEED && force_nonblock) + return -EAGAIN; + + ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, nxt); + return 0; +} + static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { unsigned lookup_flags; @@ -3721,6 +3763,9 @@ static int io_req_defer_prep(struct io_kiocb *req, case IORING_OP_STATX: ret = io_statx_prep(req, sqe); break; + case IORING_OP_FADVISE: + ret = io_fadvise_prep(req, sqe); + break; default: printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", req->opcode); @@ -3917,6 +3962,14 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } ret = io_statx(req, nxt, force_nonblock); break; + case IORING_OP_FADVISE: + if (sqe) { + ret = io_fadvise_prep(req, sqe); + if (ret) + break; + } + ret = io_fadvise(req, nxt, force_nonblock); + break; default: ret = -EINVAL; break; -- cgit From c1ca757bd6f4632c510714631ddcc2d13030fe1e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 Dec 2019 22:18:28 -0700 Subject: io_uring: add IORING_OP_MADVISE This adds support for doing madvise(2) through io_uring. We assume that any operation can block, and hence punt everything async. This could be improved, but hard to make bullet proof. The async punt ensures it's safe. 
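Userspace wiring is analogous to fadvise, except the target is a memory range rather than a file; a sketch under the same header assumptions, with the advice value reused from the fadvise_advice SQE field and fd left at -1 purely as a convention since no file is involved.

#include <linux/io_uring.h>
#include <string.h>

/*
 * IORING_OP_MADVISE: addr/len describe the mapping, the advice is
 * carried in fadvise_advice; off, ioprio and buf_index must be zero.
 */
static void prep_madvise(struct io_uring_sqe *sqe, void *addr,
                         __u32 len, int advice)
{
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_MADVISE;
        sqe->fd = -1;                   /* no file backs this request */
        sqe->addr = (unsigned long) addr;
        sqe->len = len;
        sqe->fadvise_advice = advice;
}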
Reviewed-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 9ca12b900b42..17a199c99c7c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -408,6 +408,13 @@ struct io_fadvise { u32 advice; }; +struct io_madvise { + struct file *file; + u64 addr; + u32 len; + u32 advice; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -461,6 +468,7 @@ struct io_kiocb { struct io_close close; struct io_files_update files_update; struct io_fadvise fadvise; + struct io_madvise madvise; }; struct io_async_ctx *io; @@ -680,6 +688,10 @@ static const struct io_op_def io_op_defs[] = { /* IORING_OP_FADVISE */ .needs_file = 1, }, + { + /* IORING_OP_MADVISE */ + .needs_mm = 1, + }, }; static void io_wq_submit_work(struct io_wq_work **workptr); @@ -2449,6 +2461,42 @@ err: return 0; } +static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ +#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) + if (sqe->ioprio || sqe->buf_index || sqe->off) + return -EINVAL; + + req->madvise.addr = READ_ONCE(sqe->addr); + req->madvise.len = READ_ONCE(sqe->len); + req->madvise.advice = READ_ONCE(sqe->fadvise_advice); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +static int io_madvise(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ +#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) + struct io_madvise *ma = &req->madvise; + int ret; + + if (force_nonblock) + return -EAGAIN; + + ret = do_madvise(ma->addr, ma->len, ma->advice); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, nxt); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { if (sqe->ioprio || sqe->buf_index || sqe->addr) @@ -3766,6 +3814,9 @@ static int io_req_defer_prep(struct io_kiocb *req, case IORING_OP_FADVISE: ret = io_fadvise_prep(req, sqe); break; + case IORING_OP_MADVISE: + ret = io_madvise_prep(req, sqe); + break; default: printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", req->opcode); @@ -3970,6 +4021,14 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } ret = io_fadvise(req, nxt, force_nonblock); break; + case IORING_OP_MADVISE: + if (sqe) { + ret = io_madvise_prep(req, sqe); + if (ret) + break; + } + ret = io_madvise(req, nxt, force_nonblock); + break; default: ret = -EINVAL; break; -- cgit From 2b85edfc0c90efc68dea3d665bb4111bf0694e05 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 28 Dec 2019 14:13:03 +0300 Subject: io_uring: batch getting pcpu references percpu_ref_tryget() has its own overhead. Instead getting a reference for each request, grab a bunch once per io_submit_sqes(). ~5% throughput boost for a "submit and wait 128 nops" benchmark. 
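The same amortisation idea in miniature, as a userspace analogy rather than the io_uring code itself: take the whole batch of references with one atomic operation up front and return only the unused remainder afterwards.

#include <stdatomic.h>

static atomic_long live_refs;

/* roughly what taking one reference per request costs: nr atomic RMWs */
static void get_refs_one_by_one(int nr)
{
        for (int i = 0; i < nr; i++)
                atomic_fetch_add(&live_refs, 1);
}

/* one RMW for the batch, plus one more only if part of it goes unused */
static void get_refs_batched(int nr, int submitted)
{
        atomic_fetch_add(&live_refs, nr);
        /* ... try to submit nr requests, 'submitted' of them succeed ... */
        if (submitted != nr)
                atomic_fetch_sub(&live_refs, nr - submitted);
}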
Signed-off-by: Pavel Begunkov __io_req_free_empty() -> __io_req_do_free() Signed-off-by: Jens Axboe --- fs/io_uring.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 17a199c99c7c..9b869fb6c635 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1083,9 +1083,6 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct io_kiocb *req; - if (!percpu_ref_tryget(&ctx->refs)) - return NULL; - if (!state) { req = kmem_cache_alloc(req_cachep, gfp); if (unlikely(!req)) @@ -1145,6 +1142,14 @@ static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr) } } +static void __io_req_do_free(struct io_kiocb *req) +{ + if (likely(!io_is_fallback_req(req))) + kmem_cache_free(req_cachep, req); + else + clear_bit_unlock(0, (unsigned long *) req->ctx->fallback_req); +} + static void __io_free_req(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -1166,11 +1171,9 @@ static void __io_free_req(struct io_kiocb *req) wake_up(&ctx->inflight_wait); spin_unlock_irqrestore(&ctx->inflight_lock, flags); } - percpu_ref_put(&ctx->refs); - if (likely(!io_is_fallback_req(req))) - kmem_cache_free(req_cachep, req); - else - clear_bit_unlock(0, (unsigned long *) ctx->fallback_req); + + percpu_ref_put(&req->ctx->refs); + __io_req_do_free(req); } static bool io_link_cancel_timeout(struct io_kiocb *req) @@ -4539,6 +4542,9 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, return -EBUSY; } + if (!percpu_ref_tryget_many(&ctx->refs, nr)) + return -EAGAIN; + if (nr > IO_PLUG_THRESHOLD) { io_submit_state_start(&state, nr); statep = &state; @@ -4555,7 +4561,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, break; } if (!io_get_sqring(ctx, req, &sqe)) { - __io_free_req(req); + __io_req_do_free(req); break; } @@ -4586,6 +4592,8 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, break; } + if (submitted != nr) + percpu_ref_put_many(&ctx->refs, nr - submitted); if (link) io_queue_link_head(link); if (statep) -- cgit From 8237e045983d82ba78eaab5f60b9300927fc6796 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 28 Dec 2019 10:48:22 -0700 Subject: io_uring: wrap multi-req freeing in struct req_batch This cleans up the code a bit, and it allows us to build on top of the multi-req freeing. 
Signed-off-by: Jens Axboe --- fs/io_uring.c | 65 ++++++++++++++++++++++++++++------------------------------- 1 file changed, 31 insertions(+), 34 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 9b869fb6c635..0d02987abf40 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1132,14 +1132,19 @@ fallback: return NULL; } -static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr) +struct req_batch { + void *reqs[IO_IOPOLL_BATCH]; + int to_free; +}; + +static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb) { - if (*nr) { - kmem_cache_free_bulk(req_cachep, *nr, reqs); - percpu_ref_put_many(&ctx->refs, *nr); - percpu_ref_put_many(&ctx->file_data->refs, *nr); - *nr = 0; - } + if (!rb->to_free) + return; + kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs); + percpu_ref_put_many(&ctx->refs, rb->to_free); + percpu_ref_put_many(&ctx->file_data->refs, rb->to_free); + rb->to_free = 0; } static void __io_req_do_free(struct io_kiocb *req) @@ -1371,7 +1376,7 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; } -static inline bool io_req_multi_free(struct io_kiocb *req) +static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req) { /* * If we're not using fixed files, we have to pair the completion part @@ -1379,8 +1384,12 @@ static inline bool io_req_multi_free(struct io_kiocb *req) * free for fixed file and non-linked commands. */ if (((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) == REQ_F_FIXED_FILE) - && !io_is_fallback_req(req) && !req->io) + && !io_is_fallback_req(req) && !req->io) { + rb->reqs[rb->to_free++] = req; + if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs))) + io_free_req_many(req->ctx, rb); return true; + } return false; } @@ -1391,11 +1400,10 @@ static inline bool io_req_multi_free(struct io_kiocb *req) static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, struct list_head *done) { - void *reqs[IO_IOPOLL_BATCH]; + struct req_batch rb; struct io_kiocb *req; - int to_free; - to_free = 0; + rb.to_free = 0; while (!list_empty(done)) { req = list_first_entry(done, struct io_kiocb, list); list_del(&req->list); @@ -1403,19 +1411,13 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, io_cqring_fill_event(req, req->result); (*nr_events)++; - if (refcount_dec_and_test(&req->refs)) { - if (io_req_multi_free(req)) { - reqs[to_free++] = req; - if (to_free == ARRAY_SIZE(reqs)) - io_free_req_many(ctx, reqs, &to_free); - } else { - io_free_req(req); - } - } + if (refcount_dec_and_test(&req->refs) && + !io_req_multi_free(&rb, req)) + io_free_req(req); } io_commit_cqring(ctx); - io_free_req_many(ctx, reqs, &to_free); + io_free_req_many(ctx, &rb); } static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, @@ -3221,30 +3223,25 @@ static void io_poll_complete_work(struct io_wq_work **workptr) static void __io_poll_flush(struct io_ring_ctx *ctx, struct llist_node *nodes) { - void *reqs[IO_IOPOLL_BATCH]; struct io_kiocb *req, *tmp; - int to_free = 0; + struct req_batch rb; + rb.to_free = 0; spin_lock_irq(&ctx->completion_lock); llist_for_each_entry_safe(req, tmp, nodes, llist_node) { hash_del(&req->hash_node); io_poll_complete(req, req->result, 0); - if (refcount_dec_and_test(&req->refs)) { - if (io_req_multi_free(req)) { - reqs[to_free++] = req; - if (to_free == ARRAY_SIZE(reqs)) - io_free_req_many(ctx, reqs, &to_free); - } else { - req->flags |= 
REQ_F_COMP_LOCKED; - io_free_req(req); - } + if (refcount_dec_and_test(&req->refs) && + !io_req_multi_free(&rb, req)) { + req->flags |= REQ_F_COMP_LOCKED; + io_free_req(req); } } spin_unlock_irq(&ctx->completion_lock); io_cqring_ev_posted(ctx); - io_free_req_many(ctx, reqs, &to_free); + io_free_req_many(ctx, &rb); } static void io_poll_flush(struct io_wq_work **workptr) -- cgit From c6ca97b30c47c7ad36107d3764bb4dc37026d171 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 28 Dec 2019 12:11:08 -0700 Subject: io_uring: extend batch freeing to cover more cases Currently we only batch free if fixed files are used, no links, no aux data, etc. This extends the batch freeing to only exclude the linked case and fallback case, and make io_free_req_many() handle the other cases just fine. Signed-off-by: Jens Axboe --- fs/io_uring.c | 100 ++++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 69 insertions(+), 31 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 0d02987abf40..1fbe3eb78d08 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1132,21 +1132,6 @@ fallback: return NULL; } -struct req_batch { - void *reqs[IO_IOPOLL_BATCH]; - int to_free; -}; - -static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb) -{ - if (!rb->to_free) - return; - kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs); - percpu_ref_put_many(&ctx->refs, rb->to_free); - percpu_ref_put_many(&ctx->file_data->refs, rb->to_free); - rb->to_free = 0; -} - static void __io_req_do_free(struct io_kiocb *req) { if (likely(!io_is_fallback_req(req))) @@ -1155,7 +1140,7 @@ static void __io_req_do_free(struct io_kiocb *req) clear_bit_unlock(0, (unsigned long *) req->ctx->fallback_req); } -static void __io_free_req(struct io_kiocb *req) +static void __io_req_aux_free(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -1167,7 +1152,14 @@ static void __io_free_req(struct io_kiocb *req) else fput(req->file); } +} + +static void __io_free_req(struct io_kiocb *req) +{ + __io_req_aux_free(req); + if (req->flags & REQ_F_INFLIGHT) { + struct io_ring_ctx *ctx = req->ctx; unsigned long flags; spin_lock_irqsave(&ctx->inflight_lock, flags); @@ -1181,6 +1173,56 @@ static void __io_free_req(struct io_kiocb *req) __io_req_do_free(req); } +struct req_batch { + void *reqs[IO_IOPOLL_BATCH]; + int to_free; + int need_iter; +}; + +static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb) +{ + if (!rb->to_free) + return; + if (rb->need_iter) { + int i, inflight = 0; + unsigned long flags; + + for (i = 0; i < rb->to_free; i++) { + struct io_kiocb *req = rb->reqs[i]; + + if (req->flags & REQ_F_FIXED_FILE) + req->file = NULL; + if (req->flags & REQ_F_INFLIGHT) + inflight++; + else + rb->reqs[i] = NULL; + __io_req_aux_free(req); + } + if (!inflight) + goto do_free; + + spin_lock_irqsave(&ctx->inflight_lock, flags); + for (i = 0; i < rb->to_free; i++) { + struct io_kiocb *req = rb->reqs[i]; + + if (req) { + list_del(&req->inflight_entry); + if (!--inflight) + break; + } + } + spin_unlock_irqrestore(&ctx->inflight_lock, flags); + + if (waitqueue_active(&ctx->inflight_wait)) + wake_up(&ctx->inflight_wait); + } +do_free: + kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs); + percpu_ref_put_many(&ctx->refs, rb->to_free); + percpu_ref_put_many(&ctx->file_data->refs, rb->to_free); + rb->to_free = rb->need_iter = 0; +} + static bool io_link_cancel_timeout(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -1378,20 +1420,16 @@ static inline 
unsigned int io_sqring_entries(struct io_ring_ctx *ctx) static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req) { - /* - * If we're not using fixed files, we have to pair the completion part - * with the file put. Use regular completions for those, only batch - * free for fixed file and non-linked commands. - */ - if (((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) == REQ_F_FIXED_FILE) - && !io_is_fallback_req(req) && !req->io) { - rb->reqs[rb->to_free++] = req; - if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs))) - io_free_req_many(req->ctx, rb); - return true; - } + if ((req->flags & REQ_F_LINK) || io_is_fallback_req(req)) + return false; - return false; + if (!(req->flags & REQ_F_FIXED_FILE) || req->io) + rb->need_iter++; + + rb->reqs[rb->to_free++] = req; + if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs))) + io_free_req_many(req->ctx, rb); + return true; } /* @@ -1403,7 +1441,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, struct req_batch rb; struct io_kiocb *req; - rb.to_free = 0; + rb.to_free = rb.need_iter = 0; while (!list_empty(done)) { req = list_first_entry(done, struct io_kiocb, list); list_del(&req->list); @@ -3226,7 +3264,7 @@ static void __io_poll_flush(struct io_ring_ctx *ctx, struct llist_node *nodes) struct io_kiocb *req, *tmp; struct req_batch rb; - rb.to_free = 0; + rb.to_free = rb.need_iter = 0; spin_lock_irq(&ctx->completion_lock); llist_for_each_entry_safe(req, tmp, nodes, llist_node) { hash_del(&req->hash_node); -- cgit From 8110c1a6212e430a84edd2b83fe9043def8b743e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 28 Dec 2019 15:39:54 -0700 Subject: io_uring: add support for IORING_SETUP_CLAMP Some applications like to start small in terms of ring size, and then ramp up as needed. This is a bit tricky to do currently, since we don't advertise the max ring size. This adds IORING_SETUP_CLAMP. If set, and the values for SQ or CQ ring size exceed what we support, then clamp them at the max values instead of returning -EINVAL. Since we return the chosen ring sizes after setup, no further changes are needed on the application side. io_uring already changes the ring sizes if the application doesn't ask for power-of-two sizes, for example. Signed-off-by: Jens Axboe --- fs/io_uring.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 1fbe3eb78d08..7c44b0ef10d7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6234,8 +6234,13 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) bool account_mem; int ret; - if (!entries || entries > IORING_MAX_ENTRIES) + if (!entries) return -EINVAL; + if (entries > IORING_MAX_ENTRIES) { + if (!(p->flags & IORING_SETUP_CLAMP)) + return -EINVAL; + entries = IORING_MAX_ENTRIES; + } /* * Use twice as many entries for the CQ ring. It's possible for the @@ -6252,8 +6257,13 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) * to a power-of-two, if it isn't already. We do NOT impose * any cq vs sq ring sizing. 
*/ - if (p->cq_entries < p->sq_entries || p->cq_entries > IORING_MAX_CQ_ENTRIES) + if (p->cq_entries < p->sq_entries) return -EINVAL; + if (p->cq_entries > IORING_MAX_CQ_ENTRIES) { + if (!(p->flags & IORING_SETUP_CLAMP)) + return -EINVAL; + p->cq_entries = IORING_MAX_CQ_ENTRIES; + } p->cq_entries = roundup_pow_of_two(p->cq_entries); } else { p->cq_entries = 2 * p->sq_entries; @@ -6345,7 +6355,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) } if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | - IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE)) + IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | + IORING_SETUP_CLAMP)) return -EINVAL; ret = io_uring_create(entries, &p); -- cgit From 9ef4f124894b7b9241a3cf5f9b40db0812783d66 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 30 Dec 2019 21:24:44 +0300 Subject: io_uring: clamp to_submit in io_submit_sqes() Make io_submit_sqes() to clamp @to_submit itself. It removes duplicated code and prepares for following changes. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 7c44b0ef10d7..497ed610e8b2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4577,6 +4577,8 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, return -EBUSY; } + nr = min(nr, ctx->sq_entries); + if (!percpu_ref_tryget_many(&ctx->refs, nr)) return -EAGAIN; @@ -4751,7 +4753,6 @@ static int io_sq_thread(void *data) ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; } - to_submit = min(to_submit, ctx->sq_entries); mutex_lock(&ctx->uring_lock); ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true); mutex_unlock(&ctx->uring_lock); @@ -6100,7 +6101,6 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, goto out; } - to_submit = min(to_submit, ctx->sq_entries); mutex_lock(&ctx->uring_lock); /* already have mm, so io_submit_sqes() won't try to grab it */ cur_mm = ctx->sqo_mm; -- cgit From ee7d46d9db19ded7b7222af95add63606318a480 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 30 Dec 2019 21:24:45 +0300 Subject: io_uring: optimise head checks in io_get_sqring() A user may ask to submit more than there is in the ring, and then io_uring will submit as much as it can. However, in the last iteration it will allocate an io_kiocb and immediately free it. It could do better and adjust @to_submit to what is in the ring. And since the ring's head is already checked here, there is no need to do it in the loop, spamming with smp_load_acquire()'s barriers Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 497ed610e8b2..3398f4052ec0 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4522,7 +4522,6 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_uring_sqe **sqe_ptr) { - struct io_rings *rings = ctx->rings; u32 *sq_array = ctx->sq_array; unsigned head; @@ -4534,12 +4533,7 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req, * 2) allows the kernel side to track the head on its own, even * though the application is the one updating it. 
*/ - head = ctx->cached_sq_head; - /* make sure SQ entry isn't read before tail */ - if (unlikely(head == smp_load_acquire(&rings->sq.tail))) - return false; - - head = READ_ONCE(sq_array[head & ctx->sq_mask]); + head = READ_ONCE(sq_array[ctx->cached_sq_head & ctx->sq_mask]); if (likely(head < ctx->sq_entries)) { /* * All io need record the previous position, if LINK vs DARIN, @@ -4557,7 +4551,7 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req, /* drop invalid entries */ ctx->cached_sq_head++; ctx->cached_sq_dropped++; - WRITE_ONCE(rings->sq_dropped, ctx->cached_sq_dropped); + WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped); return false; } @@ -4577,7 +4571,8 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, return -EBUSY; } - nr = min(nr, ctx->sq_entries); + /* make sure SQ entry isn't read before tail */ + nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx)); if (!percpu_ref_tryget_many(&ctx->refs, nr)) return -EAGAIN; -- cgit From caf582c652feccd42c50923f0467c4f2dcef279e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 30 Dec 2019 21:24:46 +0300 Subject: io_uring: optimise commit_sqring() for common case It should be pretty rare to not submitting anything when there is something in the ring. No need to keep heuristics for this case. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 3398f4052ec0..8613eae31f4c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4501,14 +4501,12 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; - if (ctx->cached_sq_head != READ_ONCE(rings->sq.head)) { - /* - * Ensure any loads from the SQEs are done at this point, - * since once we write the new head, the application could - * write new data to them. - */ - smp_store_release(&rings->sq.head, ctx->cached_sq_head); - } + /* + * Ensure any loads from the SQEs are done at this point, + * since once we write the new head, the application could + * write new data to them. + */ + smp_store_release(&rings->sq.head, ctx->cached_sq_head); } /* -- cgit From 2550878f8421f7912fdd56b38c630b797f95c749 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 30 Dec 2019 21:24:47 +0300 Subject: io_uring: remove extra io_wq_current_is_worker() io_wq workers use io_issue_sqe() to forward sqes and never io_queue_sqe(). Remove extra check for io_wq_current_is_worker() Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 8613eae31f4c..0d083811ccb9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4371,8 +4371,7 @@ static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) req_set_fail_links(req); io_double_put_req(req); } - } else if ((req->flags & REQ_F_FORCE_ASYNC) && - !io_wq_current_is_worker()) { + } else if (req->flags & REQ_F_FORCE_ASYNC) { /* * Never try inline submit of IOSQE_ASYNC is set, go straight * to async execution. -- cgit From fddafacee287b3140212c92464077e971401f860 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 4 Jan 2020 20:19:44 -0700 Subject: io_uring: add support for send(2) and recv(2) This adds IORING_OP_SEND for send(2) support, and IORING_OP_RECV for recv(2) support. 
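From userspace these map naturally onto a buffer plus length; a minimal sketch assuming a uapi header that already defines IORING_OP_SEND and IORING_OP_RECV, with the field mapping taken from the prep/issue code below: addr is the buffer, len its size, msg_flags the usual MSG_* flags.

#include <linux/io_uring.h>
#include <string.h>

/* IORING_OP_SEND/RECV take a plain buffer instead of a user_msghdr */
static void prep_send(struct io_uring_sqe *sqe, int sockfd,
                      const void *buf, unsigned len, int flags)
{
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_SEND;
        sqe->fd = sockfd;
        sqe->addr = (unsigned long) buf;
        sqe->len = len;
        sqe->msg_flags = flags;
}

static void prep_recv(struct io_uring_sqe *sqe, int sockfd,
                      void *buf, unsigned len, int flags)
{
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_RECV;
        sqe->fd = sockfd;
        sqe->addr = (unsigned long) buf;
        sqe->len = len;
        sqe->msg_flags = flags;
}

The CQE res then holds the transferred byte count or a negative errno, exactly as send(2) or recv(2) would return.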
Signed-off-by: Jens Axboe --- fs/io_uring.c | 140 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 135 insertions(+), 5 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 0d083811ccb9..edf072b6cb8f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -377,8 +377,12 @@ struct io_connect { struct io_sr_msg { struct file *file; - struct user_msghdr __user *msg; + union { + struct user_msghdr __user *msg; + void __user *buf; + }; int msg_flags; + size_t len; }; struct io_open { @@ -692,6 +696,18 @@ static const struct io_op_def io_op_defs[] = { /* IORING_OP_MADVISE */ .needs_mm = 1, }, + { + /* IORING_OP_SEND */ + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + { + /* IORING_OP_RECV */ + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, }; static void io_wq_submit_work(struct io_wq_work **workptr); @@ -2802,8 +2818,9 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->msg_flags = READ_ONCE(sqe->msg_flags); sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + sr->len = READ_ONCE(sqe->len); - if (!io) + if (!io || req->opcode == IORING_OP_SEND) return 0; io->msg.iov = io->msg.fast_iov; @@ -2883,6 +2900,56 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, #endif } +static int io_send(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ +#if defined(CONFIG_NET) + struct socket *sock; + int ret; + + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + + sock = sock_from_file(req->file, &ret); + if (sock) { + struct io_sr_msg *sr = &req->sr_msg; + struct msghdr msg; + struct iovec iov; + unsigned flags; + + ret = import_single_range(WRITE, sr->buf, sr->len, &iov, + &msg.msg_iter); + if (ret) + return ret; + + msg.msg_name = NULL; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_namelen = 0; + + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; + + ret = __sys_sendmsg_sock(sock, &msg, flags); + if (force_nonblock && ret == -EAGAIN) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; + } + + io_cqring_add_event(req, ret); + if (ret < 0) + req_set_fail_links(req); + io_put_req_find_next(req, nxt); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -2893,7 +2960,7 @@ static int io_recvmsg_prep(struct io_kiocb *req, sr->msg_flags = READ_ONCE(sqe->msg_flags); sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); - if (!io) + if (!io || req->opcode == IORING_OP_RECV) return 0; io->msg.iov = io->msg.fast_iov; @@ -2975,6 +3042,59 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, #endif } +static int io_recv(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ +#if defined(CONFIG_NET) + struct socket *sock; + int ret; + + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + + sock = sock_from_file(req->file, &ret); + if (sock) { + struct io_sr_msg *sr = &req->sr_msg; + struct msghdr msg; + struct iovec iov; + unsigned flags; + + ret = import_single_range(READ, sr->buf, sr->len, &iov, + &msg.msg_iter); + if (ret) + return ret; + + msg.msg_name = NULL; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_namelen = 0; + msg.msg_iocb = NULL; + msg.msg_flags = 0; + + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if 
(force_nonblock) + flags |= MSG_DONTWAIT; + + ret = __sys_recvmsg_sock(sock, &msg, NULL, NULL, flags); + if (force_nonblock && ret == -EAGAIN) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; + } + + io_cqring_add_event(req, ret); + if (ret < 0) + req_set_fail_links(req); + io_put_req_find_next(req, nxt); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + + static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_NET) @@ -3811,9 +3931,11 @@ static int io_req_defer_prep(struct io_kiocb *req, ret = io_prep_sfr(req, sqe); break; case IORING_OP_SENDMSG: + case IORING_OP_SEND: ret = io_sendmsg_prep(req, sqe); break; case IORING_OP_RECVMSG: + case IORING_OP_RECV: ret = io_recvmsg_prep(req, sqe); break; case IORING_OP_CONNECT: @@ -3956,20 +4078,28 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, ret = io_sync_file_range(req, nxt, force_nonblock); break; case IORING_OP_SENDMSG: + case IORING_OP_SEND: if (sqe) { ret = io_sendmsg_prep(req, sqe); if (ret < 0) break; } - ret = io_sendmsg(req, nxt, force_nonblock); + if (req->opcode == IORING_OP_SENDMSG) + ret = io_sendmsg(req, nxt, force_nonblock); + else + ret = io_send(req, nxt, force_nonblock); break; case IORING_OP_RECVMSG: + case IORING_OP_RECV: if (sqe) { ret = io_recvmsg_prep(req, sqe); if (ret) break; } - ret = io_recvmsg(req, nxt, force_nonblock); + if (req->opcode == IORING_OP_RECVMSG) + ret = io_recvmsg(req, nxt, force_nonblock); + else + ret = io_recv(req, nxt, force_nonblock); break; case IORING_OP_TIMEOUT: if (sqe) { -- cgit From 96fd84d83a778450ffae737d9efa546ac3983b1f Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 7 Jan 2020 22:22:44 +0800 Subject: io_uring: Remove unnecessary null check Null check kfree is redundant, so remove it. This is detected by coccinelle. Signed-off-by: YueHaibing Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index edf072b6cb8f..00426b686092 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1160,8 +1160,7 @@ static void __io_req_aux_free(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - if (req->io) - kfree(req->io); + kfree(req->io); if (req->file) { if (req->flags & REQ_F_FIXED_FILE) percpu_ref_put(&ctx->file_data->refs); -- cgit From c150368b496837cb207712e78f903ccfd7633b93 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Jan 2020 08:26:07 -0700 Subject: io_uring: file set registration should use interruptible waits If an application attempts to register a set with unbounded requests pending, we can be stuck here forever if they don't complete. We can make this wait interruptible, and just abort if we get signaled. Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 00426b686092..2c036972930f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6524,8 +6524,13 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, * after we've killed the percpu ref. 
*/ mutex_unlock(&ctx->uring_lock); - wait_for_completion(&ctx->completions[0]); + ret = wait_for_completion_interruptible(&ctx->completions[0]); mutex_lock(&ctx->uring_lock); + if (ret) { + percpu_ref_resurrect(&ctx->refs); + ret = -EINTR; + goto out; + } } switch (opcode) { @@ -6571,8 +6576,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, if (opcode != IORING_UNREGISTER_FILES && opcode != IORING_REGISTER_FILES_UPDATE) { /* bring the ctx back to life */ - reinit_completion(&ctx->completions[0]); percpu_ref_reinit(&ctx->refs); +out: + reinit_completion(&ctx->completions[0]); } return ret; } -- cgit From 69b3e546139a21b3046b6bf0cb79d5e8c9a3fa75 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Jan 2020 11:01:46 -0700 Subject: io_uring: change io_ring_ctx bool fields into bit fields In preparation for adding another one, which would make us spill into another long (and hence bump the size of the ctx), change them to bit fields. Signed-off-by: Jens Axboe --- fs/io_uring.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 2c036972930f..42bf83b3fbd5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -202,10 +202,10 @@ struct io_ring_ctx { struct { unsigned int flags; - bool compat; - bool account_mem; - bool cq_overflow_flushed; - bool drain_next; + int compat: 1; + int account_mem: 1; + int cq_overflow_flushed: 1; + int drain_next: 1; /* * Ring buffer of indices into array of io_uring_sqe, which is @@ -994,7 +994,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) /* if force is set, the ring is going away. always drop after that */ if (force) - ctx->cq_overflow_flushed = true; + ctx->cq_overflow_flushed = 1; cqe = NULL; while (!list_empty(&ctx->cq_overflow_list)) { @@ -4489,9 +4489,9 @@ static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(req->ctx->drain_next)) { req->flags |= REQ_F_IO_DRAIN; - req->ctx->drain_next = false; + req->ctx->drain_next = 0; } - req->ctx->drain_next = (req->flags & REQ_F_DRAIN_LINK); + req->ctx->drain_next = (req->flags & REQ_F_DRAIN_LINK) != 0; ret = io_req_defer(req, sqe); if (ret) { -- cgit From f2842ab5b72d7ee5f7f8385c2d4f32c133f5837b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Jan 2020 11:04:00 -0700 Subject: io_uring: enable option to only trigger eventfd for async completions If an application is using eventfd notifications with poll to know when new SQEs can be issued, it's expecting the following read/writes to complete inline. And with that, it knows that there are events available, and don't want spurious wakeups on the eventfd for those requests. This adds IORING_REGISTER_EVENTFD_ASYNC, which works just like IORING_REGISTER_EVENTFD, except it only triggers notifications for events that happen from async completions (IRQ, or io-wq worker completions). Any completions inline from the submission itself will not trigger notifications. 
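A sketch of the registration from userspace, assuming __NR_io_uring_register is available from sys/syscall.h and that the installed io_uring.h already carries the new IORING_REGISTER_EVENTFD_ASYNC constant; ring_fd is whatever io_uring_setup(2) returned, and the eventfd flags are just a reasonable default.

#include <linux/io_uring.h>
#include <sys/eventfd.h>
#include <sys/syscall.h>
#include <unistd.h>

/*
 * Register an eventfd that is only signalled for completions generated
 * from async context (io-wq or IRQ), never for inline completions.
 * Returns the eventfd on success, -1 on failure.
 */
static int register_evfd_async(int ring_fd)
{
        int efd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);

        if (efd < 0)
                return -1;
        if (syscall(__NR_io_uring_register, ring_fd,
                    IORING_REGISTER_EVENTFD_ASYNC, &efd, 1) < 0) {
                close(efd);
                return -1;
        }
        return efd;
}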
Suggested-by: Mark Papadakis Signed-off-by: Jens Axboe --- fs/io_uring.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 42bf83b3fbd5..70656762244f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -206,6 +206,7 @@ struct io_ring_ctx { int account_mem: 1; int cq_overflow_flushed: 1; int drain_next: 1; + int eventfd_async: 1; /* * Ring buffer of indices into array of io_uring_sqe, which is @@ -963,13 +964,20 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) return &rings->cqes[tail & ctx->cq_mask]; } +static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx) +{ + if (!ctx->eventfd_async) + return true; + return io_wq_current_is_worker() || in_interrupt(); +} + static void io_cqring_ev_posted(struct io_ring_ctx *ctx) { if (waitqueue_active(&ctx->wait)) wake_up(&ctx->wait); if (waitqueue_active(&ctx->sqo_wait)) wake_up(&ctx->sqo_wait); - if (ctx->cq_ev_fd) + if (ctx->cq_ev_fd && io_should_trigger_evfd(ctx)) eventfd_signal(ctx->cq_ev_fd, 1); } @@ -6556,10 +6564,17 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, ret = io_sqe_files_update(ctx, arg, nr_args); break; case IORING_REGISTER_EVENTFD: + case IORING_REGISTER_EVENTFD_ASYNC: ret = -EINVAL; if (nr_args != 1) break; ret = io_eventfd_register(ctx, arg); + if (ret) + break; + if (opcode == IORING_REGISTER_EVENTFD_ASYNC) + ctx->eventfd_async = 1; + else + ctx->eventfd_async = 0; break; case IORING_UNREGISTER_EVENTFD: ret = -EINVAL; -- cgit From c12cedf24e786509de031a832e6b0e5f8b3ca37b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Jan 2020 17:41:21 -0700 Subject: io_uring: add 'struct open_how' to the openat request context We'll need this for openat2(2) support, remove flags and mode from the existing io_open struct. 
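For reference, the structure being embedded is the openat2(2) open_how; a sketch of its layout as defined by that series (later exported as linux/openat2.h), shown here only to make clear what replaces the flags and mode members of io_open.

#include <linux/types.h>

/* openat2(2) argument block; all fields are 64-bit for extensibility */
struct open_how {
        __u64 flags;    /* O_* open flags */
        __u64 mode;     /* file mode for O_CREAT / O_TMPFILE */
        __u64 resolve;  /* RESOLVE_* path resolution restrictions */
};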
Signed-off-by: Jens Axboe --- fs/io_uring.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 70656762244f..d8d252b4c9be 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -390,13 +390,12 @@ struct io_open { struct file *file; int dfd; union { - umode_t mode; unsigned mask; }; const char __user *fname; struct filename *filename; struct statx __user *buffer; - int flags; + struct open_how how; }; struct io_files_update { @@ -2474,9 +2473,9 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; req->open.dfd = READ_ONCE(sqe->fd); - req->open.mode = READ_ONCE(sqe->len); + req->open.how.mode = READ_ONCE(sqe->len); req->open.fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); - req->open.flags = READ_ONCE(sqe->open_flags); + req->open.how.flags = READ_ONCE(sqe->open_flags); req->open.filename = getname(req->open.fname); if (IS_ERR(req->open.filename)) { @@ -2501,7 +2500,7 @@ static int io_openat(struct io_kiocb *req, struct io_kiocb **nxt, return -EAGAIN; } - how = build_open_how(req->open.flags, req->open.mode); + how = build_open_how(req->open.how.flags, req->open.how.mode); ret = build_open_flags(&how, &op); if (ret) goto err; @@ -2604,9 +2603,9 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->open.mask = READ_ONCE(sqe->len); req->open.fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); req->open.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - req->open.flags = READ_ONCE(sqe->statx_flags); + req->open.how.flags = READ_ONCE(sqe->statx_flags); - if (vfs_stat_set_lookup_flags(&lookup_flags, req->open.flags)) + if (vfs_stat_set_lookup_flags(&lookup_flags, req->open.how.flags)) return -EINVAL; req->open.filename = getname_flags(req->open.fname, lookup_flags, NULL); @@ -2631,7 +2630,7 @@ static int io_statx(struct io_kiocb *req, struct io_kiocb **nxt, if (force_nonblock) return -EAGAIN; - if (vfs_stat_set_lookup_flags(&lookup_flags, ctx->flags)) + if (vfs_stat_set_lookup_flags(&lookup_flags, ctx->how.flags)) return -EINVAL; retry: @@ -2643,7 +2642,7 @@ retry: if (ret) goto err; - ret = vfs_getattr(&path, &stat, ctx->mask, ctx->flags); + ret = vfs_getattr(&path, &stat, ctx->mask, ctx->how.flags); path_put(&path); if (retry_estale(ret, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; -- cgit From f8748881b17dc56b3faa1d30c823f071c56593e5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Jan 2020 17:47:02 -0700 Subject: io_uring: remove 'fname' from io_open structure We only use it internally in the prep functions for both statx and openat, so we don't need it to be persistent across the request. 
Signed-off-by: Jens Axboe --- fs/io_uring.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index d8d252b4c9be..3a57ea98fe3a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -392,7 +392,6 @@ struct io_open { union { unsigned mask; }; - const char __user *fname; struct filename *filename; struct statx __user *buffer; struct open_how how; @@ -2467,6 +2466,7 @@ static int io_fallocate(struct io_kiocb *req, struct io_kiocb **nxt, static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + const char __user *fname; int ret; if (sqe->ioprio || sqe->buf_index) @@ -2474,10 +2474,10 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->open.dfd = READ_ONCE(sqe->fd); req->open.how.mode = READ_ONCE(sqe->len); - req->open.fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); + fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); req->open.how.flags = READ_ONCE(sqe->open_flags); - req->open.filename = getname(req->open.fname); + req->open.filename = getname(fname); if (IS_ERR(req->open.filename)) { ret = PTR_ERR(req->open.filename); req->open.filename = NULL; @@ -2593,6 +2593,7 @@ static int io_fadvise(struct io_kiocb *req, struct io_kiocb **nxt, static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + const char __user *fname; unsigned lookup_flags; int ret; @@ -2601,14 +2602,14 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->open.dfd = READ_ONCE(sqe->fd); req->open.mask = READ_ONCE(sqe->len); - req->open.fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); + fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); req->open.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2)); req->open.how.flags = READ_ONCE(sqe->statx_flags); if (vfs_stat_set_lookup_flags(&lookup_flags, req->open.how.flags)) return -EINVAL; - req->open.filename = getname_flags(req->open.fname, lookup_flags, NULL); + req->open.filename = getname_flags(fname, lookup_flags, NULL); if (IS_ERR(req->open.filename)) { ret = PTR_ERR(req->open.filename); req->open.filename = NULL; -- cgit From cebdb98617ae3e842c81c73758a185248b37cfd6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Jan 2020 17:59:24 -0700 Subject: io_uring: add support for IORING_OP_OPENAT2 Add support for the new openat2(2) system call. It's trivial to do, as we can have openat(2) just be wrapped around it. 
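From userspace the wiring follows io_openat2_prep() in the hunk below: fd is the directory fd, addr the pathname, addr2 a pointer to the open_how, and len its size (at least OPEN_HOW_SIZE_VER0). A sketch assuming struct open_how and the RESOLVE_* constants are available from linux/openat2.h; the path, the RESOLVE flag choice and the helper names are placeholders.

#include <linux/io_uring.h>
#include <linux/openat2.h>
#include <fcntl.h>
#include <string.h>

/* Queue an openat2(2)-style open; ioprio and buf_index must stay zero. */
static void prep_openat2(struct io_uring_sqe *sqe, int dirfd,
                         const char *path, struct open_how *how)
{
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_OPENAT2;
        sqe->fd = dirfd;
        sqe->addr = (unsigned long) path;
        sqe->addr2 = (unsigned long) how;
        sqe->len = sizeof(*how);
}

/* example: open read-only, refusing to cross mount points */
static void example(struct io_uring_sqe *sqe)
{
        static struct open_how how = {
                .flags = O_RDONLY,
                .resolve = RESOLVE_NO_XDEV,
        };

        prep_openat2(sqe, AT_FDCWD, "data.log", &how);
}

Since the open_how is copied with copy_struct_from_user() at prep time, it only needs to stay valid until the SQE is submitted.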
Suggested-by: Stefan Metzmacher Signed-off-by: Jens Axboe --- fs/io_uring.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 63 insertions(+), 6 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 3a57ea98fe3a..0b30b0cf8af5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -707,6 +707,11 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, }, + { + /* IORING_OP_OPENAT2 */ + .needs_file = 1, + .fd_non_neg = 1, + }, }; static void io_wq_submit_work(struct io_wq_work **workptr); @@ -2487,11 +2492,46 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static int io_openat(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct open_how __user *how; + const char __user *fname; + size_t len; + int ret; + + if (sqe->ioprio || sqe->buf_index) + return -EINVAL; + + req->open.dfd = READ_ONCE(sqe->fd); + fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); + how = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + len = READ_ONCE(sqe->len); + + if (len < OPEN_HOW_SIZE_VER0) + return -EINVAL; + + ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how, + len); + if (ret) + return ret; + + if (!(req->open.how.flags & O_PATH) && force_o_largefile()) + req->open.how.flags |= O_LARGEFILE; + + req->open.filename = getname(fname); + if (IS_ERR(req->open.filename)) { + ret = PTR_ERR(req->open.filename); + req->open.filename = NULL; + return ret; + } + + return 0; +} + +static int io_openat2(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) { struct open_flags op; - struct open_how how; struct file *file; int ret; @@ -2500,12 +2540,11 @@ static int io_openat(struct io_kiocb *req, struct io_kiocb **nxt, return -EAGAIN; } - how = build_open_how(req->open.how.flags, req->open.how.mode); - ret = build_open_flags(&how, &op); + ret = build_open_flags(&req->open.how, &op); if (ret) goto err; - ret = get_unused_fd_flags(how.flags); + ret = get_unused_fd_flags(req->open.how.flags); if (ret < 0) goto err; @@ -2526,6 +2565,13 @@ err: return 0; } +static int io_openat(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + req->open.how = build_open_how(req->open.how.flags, req->open.how.mode); + return io_openat2(req, nxt, force_nonblock); +} + static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) @@ -3984,6 +4030,9 @@ static int io_req_defer_prep(struct io_kiocb *req, case IORING_OP_MADVISE: ret = io_madvise_prep(req, sqe); break; + case IORING_OP_OPENAT2: + ret = io_openat2_prep(req, sqe); + break; default: printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", req->opcode); @@ -4204,6 +4253,14 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } ret = io_madvise(req, nxt, force_nonblock); break; + case IORING_OP_OPENAT2: + if (sqe) { + ret = io_openat2_prep(req, sqe); + if (ret) + break; + } + ret = io_openat2(req, nxt, force_nonblock); + break; default: ret = -EINVAL; break; -- cgit From 354420f705ccd0aa2d41249f3bb55b4afbed1873 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Jan 2020 18:55:15 -0700 Subject: io_uring: add opcode to issue trace event For some test apps at least, user_data is just zeroes. So it's not a good way to tell what the command actually is. 
Add the opcode to the issue trace point. Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 0b30b0cf8af5..50233efd9445 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4810,7 +4810,8 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, req->has_user = *mm != NULL; req->in_async = async; req->needs_fixed_file = async; - trace_io_uring_submit_sqe(ctx, req->user_data, true, async); + trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data, + true, async); if (!io_submit_sqe(req, sqe, statep, &link)) break; } -- cgit From 10fef4bebf979bb705feed087611293d5864adfe Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 9 Jan 2020 07:52:28 -0700 Subject: io_uring: account fixed file references correctly in batch We can't assume that the whole batch has fixed files in it. If it's a mix, or none at all, then we can end up doing a ref put that either messes up accounting, or causes an oops if we have no fixed files at all. Also ensure we free requests properly between inflight accounted and normal requests. Fixes: 82c721577011 ("io_uring: extend batch freeing to cover more cases") Reported-by: Dmitrii Dolgov <9erthalion6@gmail.com> Reported-by: Pavel Begunkov Tested-by: Dmitrii Dolgov <9erthalion6@gmail.com> Signed-off-by: Jens Axboe --- fs/io_uring.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 50233efd9445..8a645a37b4c7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1207,21 +1207,24 @@ struct req_batch { static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb) { + int fixed_refs = rb->to_free; + if (!rb->to_free) return; if (rb->need_iter) { int i, inflight = 0; unsigned long flags; + fixed_refs = 0; for (i = 0; i < rb->to_free; i++) { struct io_kiocb *req = rb->reqs[i]; - if (req->flags & REQ_F_FIXED_FILE) + if (req->flags & REQ_F_FIXED_FILE) { req->file = NULL; + fixed_refs++; + } if (req->flags & REQ_F_INFLIGHT) inflight++; - else - rb->reqs[i] = NULL; __io_req_aux_free(req); } if (!inflight) @@ -1231,7 +1234,7 @@ static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb) for (i = 0; i < rb->to_free; i++) { struct io_kiocb *req = rb->reqs[i]; - if (req) { + if (req->flags & REQ_F_INFLIGHT) { list_del(&req->inflight_entry); if (!--inflight) break; @@ -1244,8 +1247,9 @@ static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb) } do_free: kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs); + if (fixed_refs) + percpu_ref_put_many(&ctx->file_data->refs, fixed_refs); percpu_ref_put_many(&ctx->refs, rb->to_free); - percpu_ref_put_many(&ctx->file_data->refs, rb->to_free); rb->to_free = rb->need_iter = 0; } -- cgit From 66f4af93da5761d2fa05c0dc673a47003cdb9cfe Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 16 Jan 2020 15:36:52 -0700 Subject: io_uring: add support for probing opcodes The application currently has no way of knowing if a given opcode is supported or not without having to try and issue one and see if we get -EINVAL or not. And even this approach is fraught with peril, as maybe we're getting -EINVAL due to some fields being missing, or maybe it's just not that easy to issue that particular command without doing some other leg work in terms of setup first. This adds IORING_REGISTER_PROBE, which fills in a structure with info on what it supported or not. 
This will work even with sparse opcode fields, which may happen in the future or even today if someone backports specific features to older kernels. Signed-off-by: Jens Axboe --- fs/io_uring.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 51 insertions(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 8a645a37b4c7..7715a729271a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -561,6 +561,8 @@ struct io_op_def { unsigned hash_reg_file : 1; /* unbound wq insertion if file is a non-regular file */ unsigned unbound_nonreg_file : 1; + /* opcode is not supported by this kernel */ + unsigned not_supported : 1; }; static const struct io_op_def io_op_defs[] = { @@ -6566,6 +6568,45 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries, return io_uring_setup(entries, params); } +static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) +{ + struct io_uring_probe *p; + size_t size; + int i, ret; + + size = struct_size(p, ops, nr_args); + if (size == SIZE_MAX) + return -EOVERFLOW; + p = kzalloc(size, GFP_KERNEL); + if (!p) + return -ENOMEM; + + ret = -EFAULT; + if (copy_from_user(p, arg, size)) + goto out; + ret = -EINVAL; + if (memchr_inv(p, 0, size)) + goto out; + + p->last_op = IORING_OP_LAST - 1; + if (nr_args > IORING_OP_LAST) + nr_args = IORING_OP_LAST; + + for (i = 0; i < nr_args; i++) { + p->ops[i].op = i; + if (!io_op_defs[i].not_supported) + p->ops[i].flags = IO_URING_OP_SUPPORTED; + } + p->ops_len = i; + + ret = 0; + if (copy_to_user(arg, p, size)) + ret = -EFAULT; +out: + kfree(p); + return ret; +} + static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) __releases(ctx->uring_lock) @@ -6582,7 +6623,8 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, return -ENXIO; if (opcode != IORING_UNREGISTER_FILES && - opcode != IORING_REGISTER_FILES_UPDATE) { + opcode != IORING_REGISTER_FILES_UPDATE && + opcode != IORING_REGISTER_PROBE) { percpu_ref_kill(&ctx->refs); /* @@ -6644,6 +6686,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_eventfd_unregister(ctx); break; + case IORING_REGISTER_PROBE: + ret = -EINVAL; + if (!arg || nr_args > 256) + break; + ret = io_probe(ctx, arg, nr_args); + break; default: ret = -EINVAL; break; @@ -6651,7 +6699,8 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, if (opcode != IORING_UNREGISTER_FILES && - opcode != IORING_REGISTER_FILES_UPDATE) { + opcode != IORING_REGISTER_FILES_UPDATE && + opcode != IORING_REGISTER_PROBE) { /* bring the ctx back to life */ percpu_ref_reinit(&ctx->refs); out: -- cgit From 711be0312df4d350fb5bf1671c132cccae5aaf9a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 17 Jan 2020 03:57:59 +0300 Subject: io_uring: optimise use of ctx->drain_next Move setting ctx->drain_next to the only place it could be set, when it got linked non-head requests. The same for checking it, it's interesting only for a head of a link or a non-linked request. No functional changes here. This removes some code from the common path and also removes REQ_F_DRAIN_LINK flag, as it doesn't need it anymore. 
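For reference, the userspace side of these drain semantics is just a flag on the SQE. A hedged fragment (ring setup and submission elided), queueing an fsync that is held back until everything submitted before it has completed:

#include <string.h>
#include <linux/io_uring.h>

static void prep_drained_fsync(struct io_uring_sqe *sqe, int fd)
{
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_FSYNC;
        sqe->fd = fd;
        sqe->flags = IOSQE_IO_DRAIN;    /* ordered after everything already submitted */
}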
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 7715a729271a..68843a123000 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -506,7 +506,6 @@ struct io_kiocb { #define REQ_F_LINK 64 /* linked sqes */ #define REQ_F_LINK_TIMEOUT 128 /* has linked timeout */ #define REQ_F_FAIL_LINK 256 /* fail rest of links */ -#define REQ_F_DRAIN_LINK 512 /* link should be fully drained */ #define REQ_F_TIMEOUT 1024 /* timeout request */ #define REQ_F_ISREG 2048 /* regular file */ #define REQ_F_MUST_PUNT 4096 /* must be punted even for NONBLOCK */ @@ -4558,12 +4557,6 @@ static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) { int ret; - if (unlikely(req->ctx->drain_next)) { - req->flags |= REQ_F_IO_DRAIN; - req->ctx->drain_next = 0; - } - req->ctx->drain_next = (req->flags & REQ_F_DRAIN_LINK) != 0; - ret = io_req_defer(req, sqe); if (ret) { if (ret != -EIOCBQUEUED) { @@ -4630,8 +4623,10 @@ err_req: if (*link) { struct io_kiocb *head = *link; - if (sqe_flags & IOSQE_IO_DRAIN) - head->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN; + if (sqe_flags & IOSQE_IO_DRAIN) { + head->flags |= REQ_F_IO_DRAIN; + ctx->drain_next = 1; + } if (sqe_flags & IOSQE_IO_HARDLINK) req->flags |= REQ_F_HARDLINK; @@ -4655,18 +4650,24 @@ err_req: io_queue_link_head(head); *link = NULL; } - } else if (sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) { - req->flags |= REQ_F_LINK; - if (sqe_flags & IOSQE_IO_HARDLINK) - req->flags |= REQ_F_HARDLINK; - - INIT_LIST_HEAD(&req->link_list); - ret = io_req_defer_prep(req, sqe); - if (ret) - req->flags |= REQ_F_FAIL_LINK; - *link = req; } else { - io_queue_sqe(req, sqe); + if (unlikely(ctx->drain_next)) { + req->flags |= REQ_F_IO_DRAIN; + req->ctx->drain_next = 0; + } + if (sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) { + req->flags |= REQ_F_LINK; + if (sqe_flags & IOSQE_IO_HARDLINK) + req->flags |= REQ_F_HARDLINK; + + INIT_LIST_HEAD(&req->link_list); + ret = io_req_defer_prep(req, sqe); + if (ret) + req->flags |= REQ_F_FAIL_LINK; + *link = req; + } else { + io_queue_sqe(req, sqe); + } } return true; -- cgit From 0791015837f1520dd72918355dcb1f1e79175255 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 17 Jan 2020 03:52:46 +0300 Subject: io_uring: remove extra check in __io_commit_cqring __io_commit_cqring() is almost always called when there is a change in the rings, so the check is rather pessimising. 
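For context, the smp_store_release() on cq.tail kept here pairs with an acquire load on the userspace side when reaping CQEs. A sketch, assuming the khead/ktail/mask/cqes pointers were resolved from the mmap'ed CQ ring using the offsets returned by io_uring_setup(2):

#include <linux/io_uring.h>

struct cq_view {                        /* pointers into the mapped CQ ring */
        unsigned *khead;
        unsigned *ktail;
        unsigned *kring_mask;
        struct io_uring_cqe *cqes;
};

static void reap_cqes(struct cq_view *cq)
{
        unsigned head = *cq->khead;
        /* pairs with the kernel's smp_store_release() on cq.tail */
        unsigned tail = __atomic_load_n(cq->ktail, __ATOMIC_ACQUIRE);

        while (head != tail) {
                struct io_uring_cqe *cqe = &cq->cqes[head & *cq->kring_mask];

                /* consume cqe->user_data and cqe->res here */
                (void)cqe;
                head++;
        }
        /* publish the new head so the kernel can reuse the entries */
        __atomic_store_n(cq->khead, head, __ATOMIC_RELEASE);
}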
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 68843a123000..771954bb4c38 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -860,14 +860,12 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; - if (ctx->cached_cq_tail != READ_ONCE(rings->cq.tail)) { - /* order cqe stores with ring update */ - smp_store_release(&rings->cq.tail, ctx->cached_cq_tail); + /* order cqe stores with ring update */ + smp_store_release(&rings->cq.tail, ctx->cached_cq_tail); - if (wq_has_sleeper(&ctx->cq_wait)) { - wake_up_interruptible(&ctx->cq_wait); - kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); - } + if (wq_has_sleeper(&ctx->cq_wait)) { + wake_up_interruptible(&ctx->cq_wait); + kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); } } -- cgit From b14cca0c84c760fbd39ad6bb7e1181e2df103d25 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 17 Jan 2020 04:45:59 +0300 Subject: io_uring: hide uring_fd in ctx req->ring_fd and req->ring_file are used only during the prep stage of submission, which is protected by mutex. There is no need to store them per-request, place them in ctx. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 771954bb4c38..46cc1bc48062 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -251,6 +251,8 @@ struct io_ring_ctx { */ struct fixed_file_data *file_data; unsigned nr_user_files; + int ring_fd; + struct file *ring_file; /* if used, fixed mapped user buffers */ unsigned nr_user_bufs; @@ -475,15 +477,10 @@ struct io_kiocb { }; struct io_async_ctx *io; - union { - /* - * ring_file is only used in the submission path, and - * llist_node is only used for poll deferred completions - */ - struct file *ring_file; - struct llist_node llist_node; - }; - int ring_fd; + /* + * llist_node is only used for poll deferred completions + */ + struct llist_node llist_node; bool has_user; bool in_async; bool needs_fixed_file; @@ -1141,7 +1138,6 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, got_it: req->io = NULL; - req->ring_file = NULL; req->file = NULL; req->ctx = ctx; req->flags = 0; @@ -2725,7 +2721,7 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->close.fd = READ_ONCE(sqe->fd); if (req->file->f_op == &io_uring_fops || - req->close.fd == req->ring_fd) + req->close.fd == req->ctx->ring_fd) return -EBADF; return 0; @@ -4395,7 +4391,7 @@ static int io_grab_files(struct io_kiocb *req) int ret = -EBADF; struct io_ring_ctx *ctx = req->ctx; - if (!req->ring_file) + if (!ctx->ring_file) return -EBADF; rcu_read_lock(); @@ -4406,7 +4402,7 @@ static int io_grab_files(struct io_kiocb *req) * the fd has changed since we started down this path, and disallow * this operation if it has.
*/ - if (fcheck(req->ring_fd) == req->ring_file) { + if (fcheck(ctx->ring_fd) == ctx->ring_file) { list_add(&req->inflight_entry, &ctx->inflight_list); req->flags |= REQ_F_INFLIGHT; req->work.files = current->files; @@ -4778,6 +4774,9 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, statep = &state; } + ctx->ring_fd = ring_fd; + ctx->ring_file = ring_file; + for (i = 0; i < nr; i++) { const struct io_uring_sqe *sqe; struct io_kiocb *req; @@ -4810,8 +4809,6 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, } } - req->ring_file = ring_file; - req->ring_fd = ring_fd; req->has_user = *mm != NULL; req->in_async = async; req->needs_fixed_file = async; -- cgit From e46a7950d362231a4d0b078af5f4c109b8e5ac9e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 17 Jan 2020 11:15:34 -0700 Subject: io_uring: file switch work needs to get flushed on exit We currently flush early, but if we have something in progress and a new switch is scheduled, we need to ensure to flush after our teardown as well. Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 46cc1bc48062..a0a58c181eaf 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5089,11 +5089,14 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) return -ENXIO; /* protect against inflight atomic switch, which drops the ref */ - flush_work(&data->ref_work); percpu_ref_get(&data->refs); + /* wait for existing switches */ + flush_work(&data->ref_work); percpu_ref_kill_and_confirm(&data->refs, io_file_ref_kill); wait_for_completion(&data->done); percpu_ref_put(&data->refs); + /* flush potential new switch */ + flush_work(&data->ref_work); percpu_ref_exit(&data->refs); __io_sqe_files_unregister(ctx); -- cgit From 87987898a1dbc69b1138f7c10eb9abd655c03396 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 18 Jan 2020 01:22:30 +0300 Subject: io_uring: remove REQ_F_IO_DRAINED A request can get into the defer list only once, there is no need for marking it as drained, so remove it. This probably was left after extracting __need_defer() for use in timeouts. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index a0a58c181eaf..ed1adeda370e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -499,7 +499,6 @@ struct io_kiocb { #define REQ_F_FIXED_FILE 4 /* ctx owns file */ #define REQ_F_LINK_NEXT 8 /* already grabbed next link */ #define REQ_F_IO_DRAIN 16 /* drain existing IO first */ -#define REQ_F_IO_DRAINED 32 /* drain done */ #define REQ_F_LINK 64 /* linked sqes */ #define REQ_F_LINK_TIMEOUT 128 /* has linked timeout */ #define REQ_F_FAIL_LINK 256 /* fail rest of links */ @@ -817,7 +816,7 @@ static inline bool __req_need_defer(struct io_kiocb *req) static inline bool req_need_defer(struct io_kiocb *req) { - if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) == REQ_F_IO_DRAIN) + if (unlikely(req->flags & REQ_F_IO_DRAIN)) return __req_need_defer(req); return false; @@ -939,10 +938,8 @@ static void io_commit_cqring(struct io_ring_ctx *ctx) __io_commit_cqring(ctx); - while ((req = io_get_deferred_req(ctx)) != NULL) { - req->flags |= REQ_F_IO_DRAINED; + while ((req = io_get_deferred_req(ctx)) != NULL) io_queue_async_work(req); - } } static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) -- cgit From 6b47ee6ecab142f938a40bf3b297abac74218ee2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 18 Jan 2020 20:22:41 +0300 Subject: io_uring: optimise sqe-to-req flags translation For each IOSQE_* flag there is a corresponding REQ_F_* flag. And there is a repetitive pattern of their translation: e.g. if (sqe->flags & SQE_FLAG*) req->flags |= REQ_F_FLAG* Use same numeric values/bits for them and copy instead of manual handling. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 92 ++++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 63 insertions(+), 29 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index ed1adeda370e..cf5bad51f752 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -452,6 +453,65 @@ struct io_async_ctx { }; }; +enum { + REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, + REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT, + REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT, + REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT, + REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT, + + REQ_F_LINK_NEXT_BIT, + REQ_F_FAIL_LINK_BIT, + REQ_F_INFLIGHT_BIT, + REQ_F_CUR_POS_BIT, + REQ_F_NOWAIT_BIT, + REQ_F_IOPOLL_COMPLETED_BIT, + REQ_F_LINK_TIMEOUT_BIT, + REQ_F_TIMEOUT_BIT, + REQ_F_ISREG_BIT, + REQ_F_MUST_PUNT_BIT, + REQ_F_TIMEOUT_NOSEQ_BIT, + REQ_F_COMP_LOCKED_BIT, +}; + +enum { + /* ctx owns file */ + REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT), + /* drain existing IO first */ + REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT), + /* linked sqes */ + REQ_F_LINK = BIT(REQ_F_LINK_BIT), + /* doesn't sever on completion < 0 */ + REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT), + /* IOSQE_ASYNC */ + REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT), + + /* already grabbed next link */ + REQ_F_LINK_NEXT = BIT(REQ_F_LINK_NEXT_BIT), + /* fail rest of links */ + REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT), + /* on inflight list */ + REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT), + /* read/write uses file position */ + REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT), + /* must not punt to workers */ + REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT), + /* polled IO has completed */ + REQ_F_IOPOLL_COMPLETED = 
BIT(REQ_F_IOPOLL_COMPLETED_BIT), + /* has linked timeout */ + REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), + /* timeout request */ + REQ_F_TIMEOUT = BIT(REQ_F_TIMEOUT_BIT), + /* regular file */ + REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), + /* must be punted even for NONBLOCK */ + REQ_F_MUST_PUNT = BIT(REQ_F_MUST_PUNT_BIT), + /* no timeout sequence */ + REQ_F_TIMEOUT_NOSEQ = BIT(REQ_F_TIMEOUT_NOSEQ_BIT), + /* completion under lock */ + REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT), +}; + /* * NOTE! Each of the iocb union members has the file pointer * as the first entry in their struct definition. So you can @@ -494,23 +554,6 @@ struct io_kiocb { struct list_head link_list; unsigned int flags; refcount_t refs; -#define REQ_F_NOWAIT 1 /* must not punt to workers */ -#define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */ -#define REQ_F_FIXED_FILE 4 /* ctx owns file */ -#define REQ_F_LINK_NEXT 8 /* already grabbed next link */ -#define REQ_F_IO_DRAIN 16 /* drain existing IO first */ -#define REQ_F_LINK 64 /* linked sqes */ -#define REQ_F_LINK_TIMEOUT 128 /* has linked timeout */ -#define REQ_F_FAIL_LINK 256 /* fail rest of links */ -#define REQ_F_TIMEOUT 1024 /* timeout request */ -#define REQ_F_ISREG 2048 /* regular file */ -#define REQ_F_MUST_PUNT 4096 /* must be punted even for NONBLOCK */ -#define REQ_F_TIMEOUT_NOSEQ 8192 /* no timeout sequence */ -#define REQ_F_INFLIGHT 16384 /* on inflight list */ -#define REQ_F_COMP_LOCKED 32768 /* completion under lock */ -#define REQ_F_HARDLINK 65536 /* doesn't sever on completion < 0 */ -#define REQ_F_FORCE_ASYNC 131072 /* IOSQE_ASYNC */ -#define REQ_F_CUR_POS 262144 /* read/write uses file position */ u64 user_data; u32 result; u32 sequence; @@ -4355,9 +4398,6 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, flags = READ_ONCE(sqe->flags); fd = READ_ONCE(sqe->fd); - if (flags & IOSQE_IO_DRAIN) - req->flags |= REQ_F_IO_DRAIN; - if (!io_req_needs_file(req, fd)) return 0; @@ -4593,8 +4633,9 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, ret = -EINVAL; goto err_req; } - if (sqe_flags & IOSQE_ASYNC) - req->flags |= REQ_F_FORCE_ASYNC; + /* same numerical values with corresponding REQ_F_*, safe to copy */ + req->flags |= sqe_flags & (IOSQE_IO_DRAIN|IOSQE_IO_HARDLINK| + IOSQE_ASYNC); ret = io_req_set_file(state, req, sqe); if (unlikely(ret)) { @@ -4618,10 +4659,6 @@ err_req: head->flags |= REQ_F_IO_DRAIN; ctx->drain_next = 1; } - - if (sqe_flags & IOSQE_IO_HARDLINK) - req->flags |= REQ_F_HARDLINK; - if (io_alloc_async_ctx(req)) { ret = -EAGAIN; goto err_req; @@ -4648,9 +4685,6 @@ err_req: } if (sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) { req->flags |= REQ_F_LINK; - if (sqe_flags & IOSQE_IO_HARDLINK) - req->flags |= REQ_F_HARDLINK; - INIT_LIST_HEAD(&req->link_list); ret = io_req_defer_prep(req, sqe); if (ret) -- cgit From 0463b6c58e557118d602b2f225fa3bbe9b6f3560 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 18 Jan 2020 21:35:38 +0300 Subject: io_uring: use labeled array init in io_op_defs Don't rely on implicit ordering of IORING_OP_ and explicitly place them at a right place in io_op_defs. Now former comments are now a part of the code and won't ever outdate. 
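The pattern being adopted is plain C99 designated array initializers keyed by an enum. A tiny standalone illustration (names invented for the example) of why each entry stays attached to its opcode even if the enum is reordered or grows gaps:

#include <stdio.h>

enum my_op { OP_NOP, OP_READ, OP_WRITE, OP_LAST };

struct my_op_def {
        unsigned needs_file : 1;
};

/* entries not listed explicitly default to all-zero */
static const struct my_op_def my_op_defs[] = {
        [OP_NOP]   = {},
        [OP_READ]  = { .needs_file = 1 },
        [OP_WRITE] = { .needs_file = 1 },
};

int main(void)
{
        printf("OP_WRITE needs_file=%u\n", my_op_defs[OP_WRITE].needs_file);
        return 0;
}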
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 91 +++++++++++++++++++---------------------------------------- 1 file changed, 29 insertions(+), 62 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index cf5bad51f752..09503d1e9e45 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -604,151 +604,118 @@ struct io_op_def { }; static const struct io_op_def io_op_defs[] = { - { - /* IORING_OP_NOP */ - }, - { - /* IORING_OP_READV */ + [IORING_OP_NOP] = {}, + [IORING_OP_READV] = { .async_ctx = 1, .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, }, - { - /* IORING_OP_WRITEV */ + [IORING_OP_WRITEV] = { .async_ctx = 1, .needs_mm = 1, .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, }, - { - /* IORING_OP_FSYNC */ + [IORING_OP_FSYNC] = { .needs_file = 1, }, - { - /* IORING_OP_READ_FIXED */ + [IORING_OP_READ_FIXED] = { .needs_file = 1, .unbound_nonreg_file = 1, }, - { - /* IORING_OP_WRITE_FIXED */ + [IORING_OP_WRITE_FIXED] = { .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, }, - { - /* IORING_OP_POLL_ADD */ + [IORING_OP_POLL_ADD] = { .needs_file = 1, .unbound_nonreg_file = 1, }, - { - /* IORING_OP_POLL_REMOVE */ - }, - { - /* IORING_OP_SYNC_FILE_RANGE */ + [IORING_OP_POLL_REMOVE] = {}, + [IORING_OP_SYNC_FILE_RANGE] = { .needs_file = 1, }, - { - /* IORING_OP_SENDMSG */ + [IORING_OP_SENDMSG] = { .async_ctx = 1, .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, }, - { - /* IORING_OP_RECVMSG */ + [IORING_OP_RECVMSG] = { .async_ctx = 1, .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, }, - { - /* IORING_OP_TIMEOUT */ + [IORING_OP_TIMEOUT] = { .async_ctx = 1, .needs_mm = 1, }, - { - /* IORING_OP_TIMEOUT_REMOVE */ - }, - { - /* IORING_OP_ACCEPT */ + [IORING_OP_TIMEOUT_REMOVE] = {}, + [IORING_OP_ACCEPT] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, }, - { - /* IORING_OP_ASYNC_CANCEL */ - }, - { - /* IORING_OP_LINK_TIMEOUT */ + [IORING_OP_ASYNC_CANCEL] = {}, + [IORING_OP_LINK_TIMEOUT] = { .async_ctx = 1, .needs_mm = 1, }, - { - /* IORING_OP_CONNECT */ + [IORING_OP_CONNECT] = { .async_ctx = 1, .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, }, - { - /* IORING_OP_FALLOCATE */ + [IORING_OP_FALLOCATE] = { .needs_file = 1, }, - { - /* IORING_OP_OPENAT */ + [IORING_OP_OPENAT] = { .needs_file = 1, .fd_non_neg = 1, }, - { - /* IORING_OP_CLOSE */ + [IORING_OP_CLOSE] = { .needs_file = 1, }, - { - /* IORING_OP_FILES_UPDATE */ + [IORING_OP_FILES_UPDATE] = { .needs_mm = 1, }, - { - /* IORING_OP_STATX */ + [IORING_OP_STATX] = { .needs_mm = 1, .needs_file = 1, .fd_non_neg = 1, }, - { - /* IORING_OP_READ */ + [IORING_OP_READ] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, }, - { - /* IORING_OP_WRITE */ + [IORING_OP_WRITE] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, }, - { - /* IORING_OP_FADVISE */ + [IORING_OP_FADVISE] = { .needs_file = 1, }, - { - /* IORING_OP_MADVISE */ + [IORING_OP_MADVISE] = { .needs_mm = 1, }, - { - /* IORING_OP_SEND */ + [IORING_OP_SEND] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, }, - { - /* IORING_OP_RECV */ + [IORING_OP_RECV] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, }, - { - /* IORING_OP_OPENAT2 */ + [IORING_OP_OPENAT2] = { .needs_file = 1, .fd_non_neg = 1, }, -- cgit From 1118591ab883f46df4ab614cc976bc4c8e04a464 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 22 Jan 2020 23:09:35 +0300 Subject: io_uring: prep req when do IOSQE_ASYNC Whenever IOSQE_ASYNC is set, requests will be 
punted to async without getting into io_issue_req() and without proper preparation done (e.g. io_req_defer_prep()). Hence they will be left uninitialised. Prepare them before punting. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 09503d1e9e45..cdbc711ae5fd 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4558,11 +4558,15 @@ static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) ret = io_req_defer(req, sqe); if (ret) { if (ret != -EIOCBQUEUED) { +fail_req: io_cqring_add_event(req, ret); req_set_fail_links(req); io_double_put_req(req); } } else if (req->flags & REQ_F_FORCE_ASYNC) { + ret = io_req_defer_prep(req, sqe); + if (unlikely(ret < 0)) + goto fail_req; /* * Never try inline submit of IOSQE_ASYNC is set, go straight * to async execution. -- cgit From 86a761f81ec87a96572214f5db606f60d36aaf08 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 22 Jan 2020 23:09:36 +0300 Subject: io_uring: honor IOSQE_ASYNC for linked reqs REQ_F_FORCE_ASYNC is checked only for the head of a link. Fix it. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index cdbc711ae5fd..9f73586dcfb8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4512,6 +4512,7 @@ again: */ if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) || (req->flags & REQ_F_MUST_PUNT))) { +punt: if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) { ret = io_grab_files(req); if (ret) @@ -4547,6 +4548,9 @@ done_req: if (nxt) { req = nxt; nxt = NULL; + + if (req->flags & REQ_F_FORCE_ASYNC) + goto punt; goto again; } } -- cgit From 980ad26304abf11e78caaa68023411b9c088b848 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 24 Jan 2020 23:08:54 -0700 Subject: io_uring: don't attempt to copy iovec for READ/WRITE For the non-vectored variant of READV/WRITEV, we don't need to setup an async io context, and we flag that appropriately in the io_op_defs array. However, in fixing this for the 5.5 kernel in commit 74566df3a71c we didn't have these opcodes, so the check there was added just for the READ_FIXED and WRITE_FIXED opcodes. Replace that check with just a single check for needing async context, that covers all four of these read/write variants that don't use an iovec. 
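For context on "non-vectored": IORING_OP_READ/WRITE describe the buffer directly in addr/len, while READV/WRITEV pass an iovec array, which is what the async context would otherwise have to copy. Hedged SQE-filling fragments (ring setup elided, helper names ours):

#include <string.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <linux/io_uring.h>

/* Non-vectored read: addr/len are the buffer itself. */
static void prep_read(struct io_uring_sqe *sqe, int fd, void *buf,
                      unsigned nbytes, off_t offset)
{
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_READ;
        sqe->fd = fd;
        sqe->addr = (unsigned long) buf;
        sqe->len = nbytes;
        sqe->off = offset;
}

/* Vectored read: addr/len describe an iovec array, as with readv(2). */
static void prep_readv(struct io_uring_sqe *sqe, int fd,
                       const struct iovec *iov, unsigned nr_vecs, off_t offset)
{
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_READV;
        sqe->fd = fd;
        sqe->addr = (unsigned long) iov;
        sqe->len = nr_vecs;
        sqe->off = offset;
}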
Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 9f73586dcfb8..4bde5da2c2f5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2112,8 +2112,7 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, struct iovec *iovec, struct iovec *fast_iov, struct iov_iter *iter) { - if (req->opcode == IORING_OP_READ_FIXED || - req->opcode == IORING_OP_WRITE_FIXED) + if (!io_op_defs[req->opcode].async_ctx) return 0; if (!req->io && io_alloc_async_ctx(req)) return -ENOMEM; -- cgit From 8cdf2193a3335b4cfb6e023b41ac293d0843d287 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 25 Jan 2020 00:40:24 +0300 Subject: io_uring: add comment for drain_next Draining the middle of a link is tricky, so leave a comment there. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 4bde5da2c2f5..a700ee5fc89d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4629,6 +4629,13 @@ err_req: if (*link) { struct io_kiocb *head = *link; + /* + * Taking sequential execution of a link, draining both sides + * of the link also fullfils IOSQE_IO_DRAIN semantics for all + * requests in the link. So, it drains the head and the + * next after the link request. The last one is done via + * drain_next flag to persist the effect across calls. + */ if (sqe_flags & IOSQE_IO_DRAIN) { head->flags |= REQ_F_IO_DRAIN; ctx->drain_next = 1; } -- cgit From 9466f43741bc08edd7b1bee642dd6f5561091634 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 25 Jan 2020 22:34:01 +0300 Subject: io_uring: fix refcounting with batched allocations at OOM In case of out of memory the second argument of percpu_ref_put_many() in io_submit_sqes() may evaluate to "nr - (-EAGAIN)", which is clearly wrong. Fixes: 2b85edfc0c90 ("io_uring: batch getting pcpu references") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index a700ee5fc89d..1dd20305c664 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4830,8 +4830,11 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, break; } - if (submitted != nr) - percpu_ref_put_many(&ctx->refs, nr - submitted); + if (unlikely(submitted != nr)) { + int ref_used = (submitted == -EAGAIN) ? 0 : submitted; + + percpu_ref_put_many(&ctx->refs, nr - ref_used); + } if (link) io_queue_link_head(link); if (statep) -- cgit From cccf0ee834559ae0b327b40290e14f6a2a017177 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 27 Jan 2020 16:34:48 -0700 Subject: io_uring/io-wq: don't use static creds/mm assignments We currently set up the io_wq with a static set of mm and creds. Even for a single-use io-wq per io_uring, this is suboptimal as we may have multiple enters of the ring. For sharing the io-wq backend, it doesn't work at all. Switch to passing in the creds and mm when the work item is set up. This means that async work is no longer deferred to the io_uring mm and creds, it is done with the current mm and creds. Flag this behavior with IORING_FEAT_CUR_PERSONALITY, so applications know they can rely on the current personality (mm and creds) being the same for direct issue and async issue.
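Applications can detect the new behavior at setup time by checking the features mask. A minimal sketch, assuming 5.6-era headers and the raw io_uring_setup(2) syscall number:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

int main(void)
{
        struct io_uring_params p;
        int fd;

        memset(&p, 0, sizeof(p));
        fd = syscall(__NR_io_uring_setup, 8, &p);
        if (fd < 0)
                return 1;

        if (p.features & IORING_FEAT_CUR_PERSONALITY)
                printf("async work uses the submitter's creds/mm\n");

        close(fd);
        return 0;
}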
Reviewed-by: Stefan Metzmacher Signed-off-by: Jens Axboe --- fs/io_uring.c | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 1dd20305c664..0ea36911745d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -875,6 +875,29 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) } } +static inline void io_req_work_grab_env(struct io_kiocb *req, + const struct io_op_def *def) +{ + if (!req->work.mm && def->needs_mm) { + mmgrab(current->mm); + req->work.mm = current->mm; + } + if (!req->work.creds) + req->work.creds = get_current_cred(); +} + +static inline void io_req_work_drop_env(struct io_kiocb *req) +{ + if (req->work.mm) { + mmdrop(req->work.mm); + req->work.mm = NULL; + } + if (req->work.creds) { + put_cred(req->work.creds); + req->work.creds = NULL; + } +} + static inline bool io_prep_async_work(struct io_kiocb *req, struct io_kiocb **link) { @@ -888,8 +911,8 @@ static inline bool io_prep_async_work(struct io_kiocb *req, if (def->unbound_nonreg_file) req->work.flags |= IO_WQ_WORK_UNBOUND; } - if (def->needs_mm) - req->work.flags |= IO_WQ_WORK_NEEDS_USER; + + io_req_work_grab_env(req, def); *link = io_prep_linked_timeout(req); return do_hashed; @@ -1180,6 +1203,8 @@ static void __io_req_aux_free(struct io_kiocb *req) else fput(req->file); } + + io_req_work_drop_env(req); } static void __io_free_req(struct io_kiocb *req) @@ -3963,6 +3988,8 @@ static int io_req_defer_prep(struct io_kiocb *req, { ssize_t ret = 0; + io_req_work_grab_env(req, &io_op_defs[req->opcode]); + switch (req->opcode) { case IORING_OP_NOP: break; @@ -5725,9 +5752,7 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, goto err; } - data.mm = ctx->sqo_mm; data.user = ctx->user; - data.creds = ctx->creds; data.get_work = io_get_work; data.put_work = io_put_work; @@ -6535,7 +6560,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) goto err; p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | - IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS; + IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | + IORING_FEAT_CUR_PERSONALITY; trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); return ret; err: -- cgit From 24369c2e3bb06d8c4e71fd6ceaf4f8a01ae79b7c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 28 Jan 2020 03:15:48 +0300 Subject: io_uring: add io-wq workqueue sharing If IORING_SETUP_ATTACH_WQ is set, it expects wq_fd in io_uring_params to be a valid io_uring fd io-wq of which will be shared with the newly created io_uring instance. If the flag is set but it can't share io-wq, it fails. This allows creation of "sibling" io_urings, where we prefer to keep the SQ/CQ private, but want to share the async backend to minimize the amount of overhead associated with having multiple rings that belong to the same backend. 
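From userspace the sharing is requested at ring creation time. A hedged sketch (raw syscall wrapper assumed, error handling trimmed), where wq_fd is the io_uring_params field this patch reads:

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

/* Create a sibling ring that borrows the async backend of existing_fd. */
static int setup_sibling_ring(int existing_fd, struct io_uring_params *p)
{
        memset(p, 0, sizeof(*p));
        p->flags = IORING_SETUP_ATTACH_WQ;
        p->wq_fd = existing_fd;         /* must be a live io_uring fd */

        return syscall(__NR_io_uring_setup, 64, p);
}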
Reported-by: Jens Axboe Reported-by: Daurnimator Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 50 insertions(+), 14 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 0ea36911745d..275355bd3a64 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5704,11 +5704,56 @@ static void io_get_work(struct io_wq_work *work) refcount_inc(&req->refs); } +static int io_init_wq_offload(struct io_ring_ctx *ctx, + struct io_uring_params *p) +{ + struct io_wq_data data; + struct fd f; + struct io_ring_ctx *ctx_attach; + unsigned int concurrency; + int ret = 0; + + data.user = ctx->user; + data.get_work = io_get_work; + data.put_work = io_put_work; + + if (!(p->flags & IORING_SETUP_ATTACH_WQ)) { + /* Do QD, or 4 * CPUS, whatever is smallest */ + concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); + + ctx->io_wq = io_wq_create(concurrency, &data); + if (IS_ERR(ctx->io_wq)) { + ret = PTR_ERR(ctx->io_wq); + ctx->io_wq = NULL; + } + return ret; + } + + f = fdget(p->wq_fd); + if (!f.file) + return -EBADF; + + if (f.file->f_op != &io_uring_fops) { + ret = -EINVAL; + goto out_fput; + } + + ctx_attach = f.file->private_data; + /* @io_wq is protected by holding the fd */ + if (!io_wq_get(ctx_attach->io_wq, &data)) { + ret = -EINVAL; + goto out_fput; + } + + ctx->io_wq = ctx_attach->io_wq; +out_fput: + fdput(f); + return ret; +} + static int io_sq_offload_start(struct io_ring_ctx *ctx, struct io_uring_params *p) { - struct io_wq_data data; - unsigned concurrency; int ret; init_waitqueue_head(&ctx->sqo_wait); @@ -5752,18 +5797,9 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, goto err; } - data.user = ctx->user; - data.get_work = io_get_work; - data.put_work = io_put_work; - - /* Do QD, or 4 * CPUS, whatever is smallest */ - concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); - ctx->io_wq = io_wq_create(concurrency, &data); - if (IS_ERR(ctx->io_wq)) { - ret = PTR_ERR(ctx->io_wq); - ctx->io_wq = NULL; + ret = io_init_wq_offload(ctx, p); + if (ret) goto err; - } return 0; err: @@ -6589,7 +6625,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | - IORING_SETUP_CLAMP)) + IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ)) return -EINVAL; ret = io_uring_create(entries, &p); -- cgit From 071698e13ac6ba786dfa22349a7b62deb5a9464d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 28 Jan 2020 10:04:42 -0700 Subject: io_uring: allow registering credentials If an application wants to use a ring with different kinds of credentials, it can register them upfront. We don't lookup credentials, the credentials of the task calling IORING_REGISTER_PERSONALITY is used. An 'id' is returned for the application to use in subsequent personality support. 
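A minimal sketch of the registration calls from userspace (raw io_uring_register(2) assumed; the returned id is what later patches let an individual request reference):

#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

/* Capture the caller's current creds; returns a personality id on success. */
static int register_personality(int ring_fd)
{
        return syscall(__NR_io_uring_register, ring_fd,
                       IORING_REGISTER_PERSONALITY, NULL, 0);
}

/* Drop a previously registered personality; the id is passed as nr_args. */
static int unregister_personality(int ring_fd, int id)
{
        return syscall(__NR_io_uring_register, ring_fd,
                       IORING_UNREGISTER_PERSONALITY, NULL, id);
}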
Reviewed-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 68 insertions(+), 7 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 275355bd3a64..d74567fc9628 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -273,6 +273,8 @@ struct io_ring_ctx { struct socket *ring_sock; #endif + struct idr personality_idr; + struct { unsigned cached_cq_tail; unsigned cq_entries; @@ -796,6 +798,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->cq_overflow_list); init_completion(&ctx->completions[0]); init_completion(&ctx->completions[1]); + idr_init(&ctx->personality_idr); mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->wait); spin_lock_init(&ctx->completion_lock); @@ -6177,6 +6180,17 @@ static int io_uring_fasync(int fd, struct file *file, int on) return fasync_helper(fd, file, on, &ctx->cq_fasync); } +static int io_remove_personalities(int id, void *p, void *data) +{ + struct io_ring_ctx *ctx = data; + const struct cred *cred; + + cred = idr_remove(&ctx->personality_idr, id); + if (cred) + put_cred(cred); + return 0; +} + static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) { mutex_lock(&ctx->uring_lock); @@ -6193,6 +6207,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) /* if we failed setting up the ctx, we might not have any rings */ if (ctx->rings) io_cqring_overflow_flush(ctx, true); + idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx); wait_for_completion(&ctx->completions[0]); io_ring_ctx_free(ctx); } @@ -6683,6 +6698,45 @@ out: return ret; } +static int io_register_personality(struct io_ring_ctx *ctx) +{ + const struct cred *creds = get_current_cred(); + int id; + + id = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1, + USHRT_MAX, GFP_KERNEL); + if (id < 0) + put_cred(creds); + return id; +} + +static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) +{ + const struct cred *old_creds; + + old_creds = idr_remove(&ctx->personality_idr, id); + if (old_creds) { + put_cred(old_creds); + return 0; + } + + return -EINVAL; +} + +static bool io_register_op_must_quiesce(int op) +{ + switch (op) { + case IORING_UNREGISTER_FILES: + case IORING_REGISTER_FILES_UPDATE: + case IORING_REGISTER_PROBE: + case IORING_REGISTER_PERSONALITY: + case IORING_UNREGISTER_PERSONALITY: + return false; + default: + return true; + } +} + static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) __releases(ctx->uring_lock) @@ -6698,9 +6752,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, if (percpu_ref_is_dying(&ctx->refs)) return -ENXIO; - if (opcode != IORING_UNREGISTER_FILES && - opcode != IORING_REGISTER_FILES_UPDATE && - opcode != IORING_REGISTER_PROBE) { + if (io_register_op_must_quiesce(opcode)) { percpu_ref_kill(&ctx->refs); /* @@ -6768,15 +6820,24 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_probe(ctx, arg, nr_args); break; + case IORING_REGISTER_PERSONALITY: + ret = -EINVAL; + if (arg || nr_args) + break; + ret = io_register_personality(ctx); + break; + case IORING_UNREGISTER_PERSONALITY: + ret = -EINVAL; + if (arg) + break; + ret = io_unregister_personality(ctx, nr_args); + break; default: ret = -EINVAL; break; } - - if (opcode != IORING_UNREGISTER_FILES && - opcode != IORING_REGISTER_FILES_UPDATE && - opcode != IORING_REGISTER_PROBE) { 
+ if (io_register_op_must_quiesce(opcode)) { /* bring the ctx back to life */ percpu_ref_reinit(&ctx->refs); out: -- cgit From 75c6a03904e0dd414a4d99a3072075cb5117e5bc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 28 Jan 2020 10:15:23 -0700 Subject: io_uring: support using a registered personality for commands For personalities previously registered via IORING_REGISTER_PERSONALITY, allow any command to select them. This is done through setting sqe->personality to the id returned from registration, and then flagging sqe->flags with IOSQE_PERSONALITY. Reviewed-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index d74567fc9628..8bcf0538e2e1 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4626,9 +4626,10 @@ static inline void io_queue_link_head(struct io_kiocb *req) static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, struct io_submit_state *state, struct io_kiocb **link) { + const struct cred *old_creds = NULL; struct io_ring_ctx *ctx = req->ctx; unsigned int sqe_flags; - int ret; + int ret, id; sqe_flags = READ_ONCE(sqe->flags); @@ -4637,6 +4638,19 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, ret = -EINVAL; goto err_req; } + + id = READ_ONCE(sqe->personality); + if (id) { + const struct cred *personality_creds; + + personality_creds = idr_find(&ctx->personality_idr, id); + if (unlikely(!personality_creds)) { + ret = -EINVAL; + goto err_req; + } + old_creds = override_creds(personality_creds); + } + /* same numerical values with corresponding REQ_F_*, safe to copy */ req->flags |= sqe_flags & (IOSQE_IO_DRAIN|IOSQE_IO_HARDLINK| IOSQE_ASYNC); @@ -4646,6 +4660,8 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, err_req: io_cqring_add_event(req, ret); io_double_put_req(req); + if (old_creds) + revert_creds(old_creds); return false; } @@ -4706,6 +4722,8 @@ err_req: } } + if (old_creds) + revert_creds(old_creds); return true; } -- cgit From f86cd20c9454847a524ddbdcdec32c0380ed7c9b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 29 Jan 2020 13:46:44 -0700 Subject: io_uring: fix linked command file table usage We're not consistent in how the file table is grabbed and assigned if we have a command linked that requires the use of it. Add ->file_table to the io_op_defs[] array, and use that to determine when to grab the table instead of having the handlers set it if they need to defer. This also means we can kill the IO_WQ_WORK_NEEDS_FILES flag. We always initialize work->files, so io-wq can just check for that. 
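Tying the two personality patches above together on the userspace side: the fs/io_uring.c diff keys off a non-zero sqe->personality, so, assuming the matching uapi change adds a __u16 personality field to the SQE, a request opts into a registered identity roughly like this (illustrative fragment only):

#include <linux/io_uring.h>

static void use_personality(struct io_uring_sqe *sqe, unsigned short id)
{
        sqe->personality = id;  /* 0 keeps the submitter's current creds */
}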
Signed-off-by: Jens Axboe --- fs/io_uring.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 8bcf0538e2e1..0d8d0e217847 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -603,6 +603,8 @@ struct io_op_def { unsigned unbound_nonreg_file : 1; /* opcode is not supported by this kernel */ unsigned not_supported : 1; + /* needs file table */ + unsigned file_table : 1; }; static const struct io_op_def io_op_defs[] = { @@ -661,6 +663,7 @@ static const struct io_op_def io_op_defs[] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, + .file_table = 1, }, [IORING_OP_ASYNC_CANCEL] = {}, [IORING_OP_LINK_TIMEOUT] = { @@ -679,12 +682,15 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_OPENAT] = { .needs_file = 1, .fd_non_neg = 1, + .file_table = 1, }, [IORING_OP_CLOSE] = { .needs_file = 1, + .file_table = 1, }, [IORING_OP_FILES_UPDATE] = { .needs_mm = 1, + .file_table = 1, }, [IORING_OP_STATX] = { .needs_mm = 1, @@ -720,6 +726,7 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_OPENAT2] = { .needs_file = 1, .fd_non_neg = 1, + .file_table = 1, }, }; @@ -732,6 +739,7 @@ static void io_queue_linked_timeout(struct io_kiocb *req); static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct io_uring_files_update *ip, unsigned nr_args); +static int io_grab_files(struct io_kiocb *req); static struct kmem_cache *req_cachep; @@ -2568,10 +2576,8 @@ static int io_openat2(struct io_kiocb *req, struct io_kiocb **nxt, struct file *file; int ret; - if (force_nonblock) { - req->work.flags |= IO_WQ_WORK_NEEDS_FILES; + if (force_nonblock) return -EAGAIN; - } ret = build_open_flags(&req->open.how, &op); if (ret) @@ -2797,10 +2803,8 @@ static int io_close(struct io_kiocb *req, struct io_kiocb **nxt, return ret; /* if the file has a flush method, be safe and punt to async */ - if (req->close.put_file->f_op->flush && !io_wq_current_is_worker()) { - req->work.flags |= IO_WQ_WORK_NEEDS_FILES; + if (req->close.put_file->f_op->flush && !io_wq_current_is_worker()) goto eagain; - } /* * No ->flush(), safely close from here and just punt the @@ -3244,7 +3248,6 @@ static int io_accept(struct io_kiocb *req, struct io_kiocb **nxt, ret = __io_accept(req, nxt, force_nonblock); if (ret == -EAGAIN && force_nonblock) { req->work.func = io_accept_finish; - req->work.flags |= IO_WQ_WORK_NEEDS_FILES; io_put_req(req); return -EAGAIN; } @@ -3967,10 +3970,8 @@ static int io_files_update(struct io_kiocb *req, bool force_nonblock) struct io_uring_files_update up; int ret; - if (force_nonblock) { - req->work.flags |= IO_WQ_WORK_NEEDS_FILES; + if (force_nonblock) return -EAGAIN; - } up.offset = req->files_update.offset; up.fds = req->files_update.arg; @@ -3991,6 +3992,12 @@ static int io_req_defer_prep(struct io_kiocb *req, { ssize_t ret = 0; + if (io_op_defs[req->opcode].file_table) { + ret = io_grab_files(req); + if (unlikely(ret)) + return ret; + } + io_req_work_grab_env(req, &io_op_defs[req->opcode]); switch (req->opcode) { @@ -4424,6 +4431,8 @@ static int io_grab_files(struct io_kiocb *req) int ret = -EBADF; struct io_ring_ctx *ctx = req->ctx; + if (req->work.files) + return 0; if (!ctx->ring_file) return -EBADF; @@ -4542,7 +4551,7 @@ again: if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) || (req->flags & REQ_F_MUST_PUNT))) { punt: - if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) { + if (io_op_defs[req->opcode].file_table) { ret = io_grab_files(req); if (ret) goto err; -- cgit From 
3e4827b05d2ac2d377ed136a52829ec46787bf4b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Jan 2020 15:18:09 -0700 Subject: io_uring: add support for epoll_ctl(2) This adds IORING_OP_EPOLL_CTL, which can perform the same work as the epoll_ctl(2) system call. Signed-off-by: Jens Axboe --- fs/io_uring.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 0d8d0e217847..c5ca84a305d3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -74,6 +74,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -423,6 +424,14 @@ struct io_madvise { u32 advice; }; +struct io_epoll { + struct file *file; + int epfd; + int op; + int fd; + struct epoll_event event; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -536,6 +545,7 @@ struct io_kiocb { struct io_files_update files_update; struct io_fadvise fadvise; struct io_madvise madvise; + struct io_epoll epoll; }; struct io_async_ctx *io; @@ -728,6 +738,10 @@ static const struct io_op_def io_op_defs[] = { .fd_non_neg = 1, .file_table = 1, }, + [IORING_OP_EPOLL_CTL] = { + .unbound_nonreg_file = 1, + .file_table = 1, + }, }; static void io_wq_submit_work(struct io_wq_work **workptr); @@ -2611,6 +2625,52 @@ static int io_openat(struct io_kiocb *req, struct io_kiocb **nxt, return io_openat2(req, nxt, force_nonblock); } +static int io_epoll_ctl_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ +#if defined(CONFIG_EPOLL) + if (sqe->ioprio || sqe->buf_index) + return -EINVAL; + + req->epoll.epfd = READ_ONCE(sqe->fd); + req->epoll.op = READ_ONCE(sqe->len); + req->epoll.fd = READ_ONCE(sqe->off); + + if (ep_op_has_event(req->epoll.op)) { + struct epoll_event __user *ev; + + ev = u64_to_user_ptr(READ_ONCE(sqe->addr)); + if (copy_from_user(&req->epoll.event, ev, sizeof(*ev))) + return -EFAULT; + } + + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +static int io_epoll_ctl(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ +#if defined(CONFIG_EPOLL) + struct io_epoll *ie = &req->epoll; + int ret; + + ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock); + if (force_nonblock && ret == -EAGAIN) + return -EAGAIN; + + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, nxt); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) @@ -4075,6 +4135,9 @@ static int io_req_defer_prep(struct io_kiocb *req, case IORING_OP_OPENAT2: ret = io_openat2_prep(req, sqe); break; + case IORING_OP_EPOLL_CTL: + ret = io_epoll_ctl_prep(req, sqe); + break; default: printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", req->opcode); @@ -4303,6 +4366,14 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } ret = io_openat2(req, nxt, force_nonblock); break; + case IORING_OP_EPOLL_CTL: + if (sqe) { + ret = io_epoll_ctl_prep(req, sqe); + if (ret) + break; + } + ret = io_epoll_ctl(req, nxt, force_nonblock); + break; default: ret = -EINVAL; break; -- cgit
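Closing out the section, the userspace mapping for the new opcode mirrors io_epoll_ctl_prep() above: sqe->fd carries the epoll instance, sqe->len the op, sqe->off the target fd and sqe->addr the event pointer. A hedged helper (ring setup elided, name ours):

#include <string.h>
#include <sys/epoll.h>
#include <linux/io_uring.h>

/* Async equivalent of epoll_ctl(epfd, op, fd, ev). */
static void prep_epoll_ctl(struct io_uring_sqe *sqe, int epfd, int op,
                           int fd, struct epoll_event *ev)
{
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_EPOLL_CTL;
        sqe->fd = epfd;                         /* epoll instance */
        sqe->len = op;                          /* EPOLL_CTL_ADD/MOD/DEL */
        sqe->off = fd;                          /* fd being added/modified */
        sqe->addr = (unsigned long) ev;         /* NULL is fine for DEL */
}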