summary | refs | log | tree | commit | diff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r-- fs/afs/write.c 9
-rw-r--r-- fs/autofs/waitq.c 2
-rw-r--r-- fs/btrfs/compression.c 3
-rw-r--r-- fs/btrfs/inode.c 3
-rw-r--r-- fs/btrfs/lzo.c 36
-rw-r--r-- fs/btrfs/zlib.c 36
-rw-r--r-- fs/btrfs/zstd.c 27
-rw-r--r-- fs/cachefiles/rdwr.c 16
-rw-r--r-- fs/ceph/caps.c 12
-rw-r--r-- fs/ceph/file.c 1
-rw-r--r-- fs/ceph/inode.c 2
-rw-r--r-- fs/ceph/locks.c 3
-rw-r--r-- fs/ceph/mds_client.c 17
-rw-r--r-- fs/ceph/super.c 17
-rw-r--r-- fs/ceph/super.h 3
-rw-r--r-- fs/fs-writeback.c 5
-rw-r--r-- fs/fuse/fuse_i.h 3
-rw-r--r-- fs/fuse/inode.c 87
-rw-r--r-- fs/fuse/virtio_fs.c 12
-rw-r--r-- fs/gfs2/file.c 2
-rw-r--r-- fs/io-wq.c 65
-rw-r--r-- fs/io-wq.h 59
-rw-r--r-- fs/io_uring.c 1768
-rw-r--r-- fs/jfs/jfs_metapage.c 1
-rw-r--r-- fs/kernel_read_file.c 2
-rw-r--r-- fs/ksmbd/auth.c 16
-rw-r--r-- fs/ksmbd/connection.c 2
-rw-r--r-- fs/ksmbd/ksmbd_netlink.h 2
-rw-r--r-- fs/ksmbd/mgmt/user_config.c 2
-rw-r--r-- fs/ksmbd/mgmt/user_config.h 1
-rw-r--r-- fs/ksmbd/smb2misc.c 55
-rw-r--r-- fs/ksmbd/smb2ops.c 3
-rw-r--r-- fs/ksmbd/smb2pdu.c 346
-rw-r--r-- fs/ksmbd/smb2pdu.h 2
-rw-r--r-- fs/ksmbd/transport_ipc.c 3
-rw-r--r-- fs/ksmbd/transport_ipc.h 2
-rw-r--r-- fs/ksmbd/transport_rdma.c 21
-rw-r--r-- fs/ksmbd/vfs.c 2
-rw-r--r-- fs/ksmbd/vfs.h 2
-rw-r--r-- fs/locks.c 161
-rw-r--r-- fs/namei.c 4
-rw-r--r-- fs/nfs/file.c 9
-rw-r--r-- fs/ocfs2/alloc.c 46
-rw-r--r-- fs/ocfs2/suballoc.c 22
-rw-r--r-- fs/ocfs2/super.c 14
-rw-r--r-- fs/read_write.c 4
-rw-r--r-- fs/userfaultfd.c 12
47 files changed, 1566 insertions, 1356 deletions
diff --git a/fs/afs/write.c b/fs/afs/write.c
index f24370f5c774..8b1d9c2f6bec 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -861,7 +861,8 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
*/
vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
{
- struct page *page = thp_head(vmf->page);
+ struct folio *folio = page_folio(vmf->page);
+ struct page *page = &folio->page;
struct file *file = vmf->vma->vm_file;
struct inode *inode = file_inode(file);
struct afs_vnode *vnode = AFS_FS_I(inode);
@@ -884,7 +885,7 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
goto out;
#endif
- if (wait_on_page_writeback_killable(page))
+ if (folio_wait_writeback_killable(folio))
goto out;
if (lock_page_killable(page) < 0)
@@ -894,8 +895,8 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
* details the portion of the page we need to write back and we might
* need to redirty the page if there's a problem.
*/
- if (wait_on_page_writeback_killable(page) < 0) {
- unlock_page(page);
+ if (folio_wait_writeback_killable(folio) < 0) {
+ folio_unlock(folio);
goto out;
}
diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c
index 16b5fca0626e..54c1f8b8b075 100644
--- a/fs/autofs/waitq.c
+++ b/fs/autofs/waitq.c
@@ -358,7 +358,7 @@ int autofs_wait(struct autofs_sb_info *sbi,
qstr.len = strlen(p);
offset = p - name;
}
- qstr.hash = full_name_hash(dentry, name, qstr.len);
+ qstr.hash = full_name_hash(dentry, qstr.name, qstr.len);
if (mutex_lock_interruptible(&sbi->wq_mutex)) {
kfree(name);
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index ddc4f5436cc9..6c7eb80220ca 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -173,9 +173,10 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
/* Hash through the page sector by sector */
for (pg_offset = 0; pg_offset < bytes_left;
pg_offset += sectorsize) {
- kaddr = page_address(page);
+ kaddr = kmap_atomic(page);
crypto_shash_digest(shash, kaddr + pg_offset,
sectorsize, csum);
+ kunmap_atomic(kaddr);
if (memcmp(&csum, cb_sum, csum_size) != 0) {
btrfs_print_data_csum_error(inode, disk_start,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 04090ba0ef73..954b53a90f04 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -288,8 +288,9 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
cur_size = min_t(unsigned long, compressed_size,
PAGE_SIZE);
- kaddr = page_address(cpage);
+ kaddr = kmap_atomic(cpage);
write_extent_buffer(leaf, kaddr, ptr, cur_size);
+ kunmap_atomic(kaddr);
i++;
ptr += cur_size;
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index c25dfd1a8a54..3dbe6eb5fda7 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -141,7 +141,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
*total_in = 0;
in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- data_in = page_address(in_page);
+ data_in = kmap(in_page);
/*
* store the size of all chunks of compressed data in
@@ -152,7 +152,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
ret = -ENOMEM;
goto out;
}
- cpage_out = page_address(out_page);
+ cpage_out = kmap(out_page);
out_offset = LZO_LEN;
tot_out = LZO_LEN;
pages[0] = out_page;
@@ -210,6 +210,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
if (out_len == 0 && tot_in >= len)
break;
+ kunmap(out_page);
if (nr_pages == nr_dest_pages) {
out_page = NULL;
ret = -E2BIG;
@@ -221,7 +222,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
ret = -ENOMEM;
goto out;
}
- cpage_out = page_address(out_page);
+ cpage_out = kmap(out_page);
pages[nr_pages++] = out_page;
pg_bytes_left = PAGE_SIZE;
@@ -243,11 +244,12 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
break;
bytes_left = len - tot_in;
+ kunmap(in_page);
put_page(in_page);
start += PAGE_SIZE;
in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- data_in = page_address(in_page);
+ data_in = kmap(in_page);
in_len = min(bytes_left, PAGE_SIZE);
}
@@ -257,17 +259,22 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
}
/* store the size of all chunks of compressed data */
- sizes_ptr = page_address(pages[0]);
+ sizes_ptr = kmap_local_page(pages[0]);
write_compress_length(sizes_ptr, tot_out);
+ kunmap_local(sizes_ptr);
ret = 0;
*total_out = tot_out;
*total_in = tot_in;
out:
*out_pages = nr_pages;
+ if (out_page)
+ kunmap(out_page);
- if (in_page)
+ if (in_page) {
+ kunmap(in_page);
put_page(in_page);
+ }
return ret;
}
@@ -283,6 +290,7 @@ static void copy_compressed_segment(struct compressed_bio *cb,
u32 orig_in = *cur_in;
while (*cur_in < orig_in + len) {
+ char *kaddr;
struct page *cur_page;
u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in),
orig_in + len - *cur_in);
@@ -290,9 +298,11 @@ static void copy_compressed_segment(struct compressed_bio *cb,
ASSERT(copy_len);
cur_page = cb->compressed_pages[*cur_in / PAGE_SIZE];
+ kaddr = kmap(cur_page);
memcpy(dest + *cur_in - orig_in,
- page_address(cur_page) + offset_in_page(*cur_in),
+ kaddr + offset_in_page(*cur_in),
copy_len);
+ kunmap(cur_page);
*cur_in += copy_len;
}
@@ -303,6 +313,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
struct workspace *workspace = list_entry(ws, struct workspace, list);
const struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
const u32 sectorsize = fs_info->sectorsize;
+ char *kaddr;
int ret;
/* Compressed data length, can be unaligned */
u32 len_in;
@@ -311,7 +322,9 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
/* Bytes decompressed so far */
u32 cur_out = 0;
- len_in = read_compress_length(page_address(cb->compressed_pages[0]));
+ kaddr = kmap(cb->compressed_pages[0]);
+ len_in = read_compress_length(kaddr);
+ kunmap(cb->compressed_pages[0]);
cur_in += LZO_LEN;
/*
@@ -344,9 +357,9 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
ASSERT(cur_in / sectorsize ==
(cur_in + LZO_LEN - 1) / sectorsize);
cur_page = cb->compressed_pages[cur_in / PAGE_SIZE];
+ kaddr = kmap(cur_page);
ASSERT(cur_page);
- seg_len = read_compress_length(page_address(cur_page) +
- offset_in_page(cur_in));
+ seg_len = read_compress_length(kaddr + offset_in_page(cur_in));
cur_in += LZO_LEN;
/* Copy the compressed segment payload into workspace */
@@ -431,7 +444,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in,
destlen = min_t(unsigned long, destlen, PAGE_SIZE);
bytes = min_t(unsigned long, destlen, out_len - start_byte);
- kaddr = page_address(dest_page);
+ kaddr = kmap_local_page(dest_page);
memcpy(kaddr, workspace->buf + start_byte, bytes);
/*
@@ -441,6 +454,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in,
*/
if (bytes < destlen)
memset(kaddr+bytes, 0, destlen-bytes);
+ kunmap_local(kaddr);
out:
return ret;
}
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 8afa90074891..767a0c6c9694 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -126,7 +126,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
ret = -ENOMEM;
goto out;
}
- cpage_out = page_address(out_page);
+ cpage_out = kmap(out_page);
pages[0] = out_page;
nr_pages = 1;
@@ -148,22 +148,26 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
int i;
for (i = 0; i < in_buf_pages; i++) {
- if (in_page)
+ if (in_page) {
+ kunmap(in_page);
put_page(in_page);
+ }
in_page = find_get_page(mapping,
start >> PAGE_SHIFT);
- data_in = page_address(in_page);
+ data_in = kmap(in_page);
memcpy(workspace->buf + i * PAGE_SIZE,
data_in, PAGE_SIZE);
start += PAGE_SIZE;
}
workspace->strm.next_in = workspace->buf;
} else {
- if (in_page)
+ if (in_page) {
+ kunmap(in_page);
put_page(in_page);
+ }
in_page = find_get_page(mapping,
start >> PAGE_SHIFT);
- data_in = page_address(in_page);
+ data_in = kmap(in_page);
start += PAGE_SIZE;
workspace->strm.next_in = data_in;
}
@@ -192,6 +196,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
* the stream end if required
*/
if (workspace->strm.avail_out == 0) {
+ kunmap(out_page);
if (nr_pages == nr_dest_pages) {
out_page = NULL;
ret = -E2BIG;
@@ -202,7 +207,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
ret = -ENOMEM;
goto out;
}
- cpage_out = page_address(out_page);
+ cpage_out = kmap(out_page);
pages[nr_pages] = out_page;
nr_pages++;
workspace->strm.avail_out = PAGE_SIZE;
@@ -229,6 +234,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
goto out;
} else if (workspace->strm.avail_out == 0) {
/* get another page for the stream end */
+ kunmap(out_page);
if (nr_pages == nr_dest_pages) {
out_page = NULL;
ret = -E2BIG;
@@ -239,7 +245,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
ret = -ENOMEM;
goto out;
}
- cpage_out = page_address(out_page);
+ cpage_out = kmap(out_page);
pages[nr_pages] = out_page;
nr_pages++;
workspace->strm.avail_out = PAGE_SIZE;
@@ -258,8 +264,13 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
*total_in = workspace->strm.total_in;
out:
*out_pages = nr_pages;
- if (in_page)
+ if (out_page)
+ kunmap(out_page);
+
+ if (in_page) {
+ kunmap(in_page);
put_page(in_page);
+ }
return ret;
}
@@ -276,7 +287,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
unsigned long buf_start;
struct page **pages_in = cb->compressed_pages;
- data_in = page_address(pages_in[page_in_index]);
+ data_in = kmap(pages_in[page_in_index]);
workspace->strm.next_in = data_in;
workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE);
workspace->strm.total_in = 0;
@@ -298,6 +309,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) {
pr_warn("BTRFS: inflateInit failed\n");
+ kunmap(pages_in[page_in_index]);
return -EIO;
}
while (workspace->strm.total_in < srclen) {
@@ -324,13 +336,13 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
if (workspace->strm.avail_in == 0) {
unsigned long tmp;
-
+ kunmap(pages_in[page_in_index]);
page_in_index++;
if (page_in_index >= total_pages_in) {
data_in = NULL;
break;
}
- data_in = page_address(pages_in[page_in_index]);
+ data_in = kmap(pages_in[page_in_index]);
workspace->strm.next_in = data_in;
tmp = srclen - workspace->strm.total_in;
workspace->strm.avail_in = min(tmp, PAGE_SIZE);
@@ -342,6 +354,8 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
ret = 0;
done:
zlib_inflateEnd(&workspace->strm);
+ if (data_in)
+ kunmap(pages_in[page_in_index]);
if (!ret)
zero_fill_bio(cb->orig_bio);
return ret;
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 56dce9f00988..f06b68040352 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -399,7 +399,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
/* map in the first page of input data */
in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- workspace->in_buf.src = page_address(in_page);
+ workspace->in_buf.src = kmap(in_page);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
@@ -411,7 +411,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
goto out;
}
pages[nr_pages++] = out_page;
- workspace->out_buf.dst = page_address(out_page);
+ workspace->out_buf.dst = kmap(out_page);
workspace->out_buf.pos = 0;
workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
@@ -446,6 +446,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
if (workspace->out_buf.pos == workspace->out_buf.size) {
tot_out += PAGE_SIZE;
max_out -= PAGE_SIZE;
+ kunmap(out_page);
if (nr_pages == nr_dest_pages) {
out_page = NULL;
ret = -E2BIG;
@@ -457,7 +458,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
goto out;
}
pages[nr_pages++] = out_page;
- workspace->out_buf.dst = page_address(out_page);
+ workspace->out_buf.dst = kmap(out_page);
workspace->out_buf.pos = 0;
workspace->out_buf.size = min_t(size_t, max_out,
PAGE_SIZE);
@@ -472,12 +473,13 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
/* Check if we need more input */
if (workspace->in_buf.pos == workspace->in_buf.size) {
tot_in += PAGE_SIZE;
+ kunmap(in_page);
put_page(in_page);
start += PAGE_SIZE;
len -= PAGE_SIZE;
in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- workspace->in_buf.src = page_address(in_page);
+ workspace->in_buf.src = kmap(in_page);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
}
@@ -504,6 +506,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
tot_out += PAGE_SIZE;
max_out -= PAGE_SIZE;
+ kunmap(out_page);
if (nr_pages == nr_dest_pages) {
out_page = NULL;
ret = -E2BIG;
@@ -515,7 +518,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
goto out;
}
pages[nr_pages++] = out_page;
- workspace->out_buf.dst = page_address(out_page);
+ workspace->out_buf.dst = kmap(out_page);
workspace->out_buf.pos = 0;
workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
}
@@ -531,8 +534,12 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
out:
*out_pages = nr_pages;
/* Cleanup */
- if (in_page)
+ if (in_page) {
+ kunmap(in_page);
put_page(in_page);
+ }
+ if (out_page)
+ kunmap(out_page);
return ret;
}
@@ -556,7 +563,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
goto done;
}
- workspace->in_buf.src = page_address(pages_in[page_in_index]);
+ workspace->in_buf.src = kmap(pages_in[page_in_index]);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
@@ -592,14 +599,14 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
break;
if (workspace->in_buf.pos == workspace->in_buf.size) {
- page_in_index++;
+ kunmap(pages_in[page_in_index++]);
if (page_in_index >= total_pages_in) {
workspace->in_buf.src = NULL;
ret = -EIO;
goto done;
}
srclen -= PAGE_SIZE;
- workspace->in_buf.src = page_address(pages_in[page_in_index]);
+ workspace->in_buf.src = kmap(pages_in[page_in_index]);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
}
@@ -607,6 +614,8 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
ret = 0;
zero_fill_bio(cb->orig_bio);
done:
+ if (workspace->in_buf.src)
+ kunmap(pages_in[page_in_index]);
return ret;
}
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 8ffc40e84a59..fcf4f3b72923 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -25,20 +25,20 @@ static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode,
struct cachefiles_object *object;
struct fscache_retrieval *op = monitor->op;
struct wait_page_key *key = _key;
- struct page *page = wait->private;
+ struct folio *folio = wait->private;
ASSERT(key);
_enter("{%lu},%u,%d,{%p,%u}",
monitor->netfs_page->index, mode, sync,
- key->page, key->bit_nr);
+ key->folio, key->bit_nr);
- if (key->page != page || key->bit_nr != PG_locked)
+ if (key->folio != folio || key->bit_nr != PG_locked)
return 0;
- _debug("--- monitor %p %lx ---", page, page->flags);
+ _debug("--- monitor %p %lx ---", folio, folio->flags);
- if (!PageUptodate(page) && !PageError(page)) {
+ if (!folio_test_uptodate(folio) && !folio_test_error(folio)) {
/* unlocked, not uptodate and not erronous? */
_debug("page probably truncated");
}
@@ -107,7 +107,7 @@ static int cachefiles_read_reissue(struct cachefiles_object *object,
put_page(backpage2);
INIT_LIST_HEAD(&monitor->op_link);
- add_page_wait_queue(backpage, &monitor->monitor);
+ folio_add_wait_queue(page_folio(backpage), &monitor->monitor);
if (trylock_page(backpage)) {
ret = -EIO;
@@ -294,7 +294,7 @@ monitor_backing_page:
get_page(backpage);
monitor->back_page = backpage;
monitor->monitor.private = backpage;
- add_page_wait_queue(backpage, &monitor->monitor);
+ folio_add_wait_queue(page_folio(backpage), &monitor->monitor);
monitor = NULL;
/* but the page may have been read before the monitor was installed, so
@@ -548,7 +548,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
get_page(backpage);
monitor->back_page = backpage;
monitor->monitor.private = backpage;
- add_page_wait_queue(backpage, &monitor->monitor);
+ folio_add_wait_queue(page_folio(backpage), &monitor->monitor);
monitor = NULL;
/* but the page may have been read before the monitor was
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 3e42d0466521..8f537f1d9d1d 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2330,7 +2330,6 @@ retry:
int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
- struct ceph_file_info *fi = file->private_data;
struct inode *inode = file->f_mapping->host;
struct ceph_inode_info *ci = ceph_inode(inode);
u64 flush_tid;
@@ -2365,14 +2364,9 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
if (err < 0)
ret = err;
- if (errseq_check(&ci->i_meta_err, READ_ONCE(fi->meta_err))) {
- spin_lock(&file->f_lock);
- err = errseq_check_and_advance(&ci->i_meta_err,
- &fi->meta_err);
- spin_unlock(&file->f_lock);
- if (err < 0)
- ret = err;
- }
+ err = file_check_and_advance_wb_err(file);
+ if (err < 0)
+ ret = err;
out:
dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret);
return ret;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index d16fd2d5fd42..e61018d9764e 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -233,7 +233,6 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
spin_lock_init(&fi->rw_contexts_lock);
INIT_LIST_HEAD(&fi->rw_contexts);
- fi->meta_err = errseq_sample(&ci->i_meta_err);
fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen);
return 0;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 2df1e1284451..1c7574105478 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -541,8 +541,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ceph_fscache_inode_init(ci);
- ci->i_meta_err = 0;
-
return &ci->vfs_inode;
}
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index bdeb271f47d9..d8c31069fbf2 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -302,9 +302,6 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
if (!(fl->fl_flags & FL_FLOCK))
return -ENOLCK;
- /* No mandatory locks */
- if (fl->fl_type & LOCK_MAND)
- return -EOPNOTSUPP;
dout("ceph_flock, fl_file: %p\n", fl->fl_file);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 7cad180d6deb..d64413adc0fd 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1493,7 +1493,6 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc,
{
struct ceph_mds_request *req;
struct rb_node *p;
- struct ceph_inode_info *ci;
dout("cleanup_session_requests mds%d\n", session->s_mds);
mutex_lock(&mdsc->mutex);
@@ -1502,16 +1501,10 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc,
struct ceph_mds_request, r_unsafe_item);
pr_warn_ratelimited(" dropping unsafe request %llu\n",
req->r_tid);
- if (req->r_target_inode) {
- /* dropping unsafe change of inode's attributes */
- ci = ceph_inode(req->r_target_inode);
- errseq_set(&ci->i_meta_err, -EIO);
- }
- if (req->r_unsafe_dir) {
- /* dropping unsafe directory operation */
- ci = ceph_inode(req->r_unsafe_dir);
- errseq_set(&ci->i_meta_err, -EIO);
- }
+ if (req->r_target_inode)
+ mapping_set_error(req->r_target_inode->i_mapping, -EIO);
+ if (req->r_unsafe_dir)
+ mapping_set_error(req->r_unsafe_dir->i_mapping, -EIO);
__unregister_request(mdsc, req);
}
/* zero r_attempts, so kick_requests() will re-send requests */
@@ -1678,7 +1671,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
spin_unlock(&mdsc->cap_dirty_lock);
if (dirty_dropped) {
- errseq_set(&ci->i_meta_err, -EIO);
+ mapping_set_error(inode->i_mapping, -EIO);
if (ci->i_wrbuffer_ref_head == 0 &&
ci->i_wr_ref == 0 &&
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 9b1b7f4cfdd4..fd8742bae847 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1002,16 +1002,16 @@ static int ceph_compare_super(struct super_block *sb, struct fs_context *fc)
struct ceph_fs_client *new = fc->s_fs_info;
struct ceph_mount_options *fsopt = new->mount_options;
struct ceph_options *opt = new->client->options;
- struct ceph_fs_client *other = ceph_sb_to_client(sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
dout("ceph_compare_super %p\n", sb);
- if (compare_mount_options(fsopt, opt, other)) {
+ if (compare_mount_options(fsopt, opt, fsc)) {
dout("monitor(s)/mount options don't match\n");
return 0;
}
if ((opt->flags & CEPH_OPT_FSID) &&
- ceph_fsid_compare(&opt->fsid, &other->client->fsid)) {
+ ceph_fsid_compare(&opt->fsid, &fsc->client->fsid)) {
dout("fsid doesn't match\n");
return 0;
}
@@ -1019,6 +1019,17 @@ static int ceph_compare_super(struct super_block *sb, struct fs_context *fc)
dout("flags differ\n");
return 0;
}
+
+ if (fsc->blocklisted && !ceph_test_mount_opt(fsc, CLEANRECOVER)) {
+ dout("client is blocklisted (and CLEANRECOVER is not set)\n");
+ return 0;
+ }
+
+ if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
+ dout("client has been forcibly unmounted\n");
+ return 0;
+ }
+
return 1;
}
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index a40eb14c282a..14f951cd5b61 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -429,8 +429,6 @@ struct ceph_inode_info {
#ifdef CONFIG_CEPH_FSCACHE
struct fscache_cookie *fscache;
#endif
- errseq_t i_meta_err;
-
struct inode vfs_inode; /* at end */
};
@@ -774,7 +772,6 @@ struct ceph_file_info {
spinlock_t rw_contexts_lock;
struct list_head rw_contexts;
- errseq_t meta_err;
u32 filp_gen;
atomic_t num_locks;
};
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 81ec192ce067..4124a89a1a5d 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1893,7 +1893,8 @@ static long writeback_sb_inodes(struct super_block *sb,
* unplug, so get our IOs out the door before we
* give up the CPU.
*/
- blk_flush_plug(current);
+ if (current->plug)
+ blk_flush_plug(current->plug, false);
cond_resched();
}
@@ -2291,7 +2292,7 @@ void wakeup_flusher_threads(enum wb_reason reason)
* If we are expecting writeback progress we must submit plugged IO.
*/
if (blk_needs_flush_plug(current))
- blk_schedule_flush_plug(current);
+ blk_flush_plug(current->plug, true);
rcu_read_lock();
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 319596df5dc6..f55f9f94b1a4 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1121,6 +1121,9 @@ int fuse_init_fs_context_submount(struct fs_context *fsc);
*/
void fuse_conn_destroy(struct fuse_mount *fm);
+/* Drop the connection and free the fuse mount */
+void fuse_mount_destroy(struct fuse_mount *fm);
+
/**
* Add connection to control filesystem
*/
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 36cd03114b6d..12d49a1914e8 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -457,14 +457,6 @@ static void fuse_send_destroy(struct fuse_mount *fm)
}
}
-static void fuse_put_super(struct super_block *sb)
-{
- struct fuse_mount *fm = get_fuse_mount_super(sb);
-
- fuse_conn_put(fm->fc);
- kfree(fm);
-}
-
static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr)
{
stbuf->f_type = FUSE_SUPER_MAGIC;
@@ -1003,7 +995,6 @@ static const struct super_operations fuse_super_operations = {
.evict_inode = fuse_evict_inode,
.write_inode = fuse_write_inode,
.drop_inode = generic_delete_inode,
- .put_super = fuse_put_super,
.umount_begin = fuse_umount_begin,
.statfs = fuse_statfs,
.sync_fs = fuse_sync_fs,
@@ -1424,20 +1415,17 @@ static int fuse_get_tree_submount(struct fs_context *fsc)
if (!fm)
return -ENOMEM;
+ fm->fc = fuse_conn_get(fc);
fsc->s_fs_info = fm;
sb = sget_fc(fsc, NULL, set_anon_super_fc);
- if (IS_ERR(sb)) {
- kfree(fm);
+ if (fsc->s_fs_info)
+ fuse_mount_destroy(fm);
+ if (IS_ERR(sb))
return PTR_ERR(sb);
- }
- fm->fc = fuse_conn_get(fc);
/* Initialize superblock, making @mp_fi its root */
err = fuse_fill_super_submount(sb, mp_fi);
if (err) {
- fuse_conn_put(fc);
- kfree(fm);
- sb->s_fs_info = NULL;
deactivate_locked_super(sb);
return err;
}
@@ -1569,8 +1557,6 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
{
struct fuse_fs_context *ctx = fsc->fs_private;
int err;
- struct fuse_conn *fc;
- struct fuse_mount *fm;
if (!ctx->file || !ctx->rootmode_present ||
!ctx->user_id_present || !ctx->group_id_present)
@@ -1580,42 +1566,18 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
* Require mount to happen from the same user namespace which
* opened /dev/fuse to prevent potential attacks.
*/
- err = -EINVAL;
if ((ctx->file->f_op != &fuse_dev_operations) ||
(ctx->file->f_cred->user_ns != sb->s_user_ns))
- goto err;
+ return -EINVAL;
ctx->fudptr = &ctx->file->private_data;
- fc = kmalloc(sizeof(*fc), GFP_KERNEL);
- err = -ENOMEM;
- if (!fc)
- goto err;
-
- fm = kzalloc(sizeof(*fm), GFP_KERNEL);
- if (!fm) {
- kfree(fc);
- goto err;
- }
-
- fuse_conn_init(fc, fm, sb->s_user_ns, &fuse_dev_fiq_ops, NULL);
- fc->release = fuse_free_conn;
-
- sb->s_fs_info = fm;
-
err = fuse_fill_super_common(sb, ctx);
if (err)
- goto err_put_conn;
+ return err;
/* file->private_data shall be visible on all CPUs after this */
smp_mb();
fuse_send_init(get_fuse_mount_super(sb));
return 0;
-
- err_put_conn:
- fuse_conn_put(fc);
- kfree(fm);
- sb->s_fs_info = NULL;
- err:
- return err;
}
/*
@@ -1637,22 +1599,40 @@ static int fuse_get_tree(struct fs_context *fsc)
{
struct fuse_fs_context *ctx = fsc->fs_private;
struct fuse_dev *fud;
+ struct fuse_conn *fc;
+ struct fuse_mount *fm;
struct super_block *sb;
int err;
+ fc = kmalloc(sizeof(*fc), GFP_KERNEL);
+ if (!fc)
+ return -ENOMEM;
+
+ fm = kzalloc(sizeof(*fm), GFP_KERNEL);
+ if (!fm) {
+ kfree(fc);
+ return -ENOMEM;
+ }
+
+ fuse_conn_init(fc, fm, fsc->user_ns, &fuse_dev_fiq_ops, NULL);
+ fc->release = fuse_free_conn;
+
+ fsc->s_fs_info = fm;
+
if (ctx->fd_present)
ctx->file = fget(ctx->fd);
if (IS_ENABLED(CONFIG_BLOCK) && ctx->is_bdev) {
err = get_tree_bdev(fsc, fuse_fill_super);
- goto out_fput;
+ goto out;
}
/*
* While block dev mount can be initialized with a dummy device fd
* (found by device name), normal fuse mounts can't
*/
+ err = -EINVAL;
if (!ctx->file)
- return -EINVAL;
+ goto out;
/*
* Allow creating a fuse mount with an already initialized fuse
@@ -1668,7 +1648,9 @@ static int fuse_get_tree(struct fs_context *fsc)
} else {
err = get_tree_nodev(fsc, fuse_fill_super);
}
-out_fput:
+out:
+ if (fsc->s_fs_info)
+ fuse_mount_destroy(fm);
if (ctx->file)
fput(ctx->file);
return err;
@@ -1747,17 +1729,25 @@ static void fuse_sb_destroy(struct super_block *sb)
struct fuse_mount *fm = get_fuse_mount_super(sb);
bool last;
- if (fm) {
+ if (sb->s_root) {
last = fuse_mount_remove(fm);
if (last)
fuse_conn_destroy(fm);
}
}
+void fuse_mount_destroy(struct fuse_mount *fm)
+{
+ fuse_conn_put(fm->fc);
+ kfree(fm);
+}
+EXPORT_SYMBOL(fuse_mount_destroy);
+
static void fuse_kill_sb_anon(struct super_block *sb)
{
fuse_sb_destroy(sb);
kill_anon_super(sb);
+ fuse_mount_destroy(get_fuse_mount_super(sb));
}
static struct file_system_type fuse_fs_type = {
@@ -1775,6 +1765,7 @@ static void fuse_kill_sb_blk(struct super_block *sb)
{
fuse_sb_destroy(sb);
kill_block_super(sb);
+ fuse_mount_destroy(get_fuse_mount_super(sb));
}
static struct file_system_type fuseblk_fs_type = {
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 0ad89c6629d7..94fc874f5de7 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -1394,12 +1394,13 @@ static void virtio_kill_sb(struct super_block *sb)
bool last;
/* If mount failed, we can still be called without any fc */
- if (fm) {
+ if (sb->s_root) {
last = fuse_mount_remove(fm);
if (last)
virtio_fs_conn_destroy(fm);
}
kill_anon_super(sb);
+ fuse_mount_destroy(fm);
}
static int virtio_fs_test_super(struct super_block *sb,
@@ -1455,19 +1456,14 @@ static int virtio_fs_get_tree(struct fs_context *fsc)
fsc->s_fs_info = fm;
sb = sget_fc(fsc, virtio_fs_test_super, set_anon_super_fc);
- if (fsc->s_fs_info) {
- fuse_conn_put(fc);
- kfree(fm);
- }
+ if (fsc->s_fs_info)
+ fuse_mount_destroy(fm);
if (IS_ERR(sb))
return PTR_ERR(sb);
if (!sb->s_root) {
err = virtio_fs_fill_super(sb, fsc);
if (err) {
- fuse_conn_put(fc);
- kfree(fm);
- sb->s_fs_info = NULL;
deactivate_locked_super(sb);
return err;
}
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 635f0e3f10ec..5436a688157a 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -1338,8 +1338,6 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
{
if (!(fl->fl_flags & FL_FLOCK))
return -ENOLCK;
- if (fl->fl_type & LOCK_MAND)
- return -EOPNOTSUPP;
if (fl->fl_type == F_UNLCK) {
do_unflock(file, fl);
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 5bf8aa81715e..38b33ad9e8cf 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -140,6 +140,7 @@ static void io_wqe_dec_running(struct io_worker *worker);
static bool io_acct_cancel_pending_work(struct io_wqe *wqe,
struct io_wqe_acct *acct,
struct io_cb_cancel_data *match);
+static void create_worker_cb(struct callback_head *cb);
static bool io_worker_get(struct io_worker *worker)
{
@@ -174,12 +175,46 @@ static void io_worker_ref_put(struct io_wq *wq)
complete(&wq->worker_done);
}
+static void io_worker_cancel_cb(struct io_worker *worker)
+{
+ struct io_wqe_acct *acct = io_wqe_get_acct(worker);
+ struct io_wqe *wqe = worker->wqe;
+ struct io_wq *wq = wqe->wq;
+
+ atomic_dec(&acct->nr_running);
+ raw_spin_lock(&worker->wqe->lock);
+ acct->nr_workers--;
+ raw_spin_unlock(&worker->wqe->lock);
+ io_worker_ref_put(wq);
+ clear_bit_unlock(0, &worker->create_state);
+ io_worker_release(worker);
+}
+
+static bool io_task_worker_match(struct callback_head *cb, void *data)
+{
+ struct io_worker *worker;
+
+ if (cb->func != create_worker_cb)
+ return false;
+ worker = container_of(cb, struct io_worker, create_work);
+ return worker == data;
+}
+
static void io_worker_exit(struct io_worker *worker)
{
struct io_wqe *wqe = worker->wqe;
+ struct io_wq *wq = wqe->wq;
- if (refcount_dec_and_test(&worker->ref))
- complete(&worker->ref_done);
+ while (1) {
+ struct callback_head *cb = task_work_cancel_match(wq->task,
+ io_task_worker_match, worker);
+
+ if (!cb)
+ break;
+ io_worker_cancel_cb(worker);
+ }
+
+ io_worker_release(worker);
wait_for_completion(&worker->ref_done);
raw_spin_lock(&wqe->lock);
@@ -253,7 +288,7 @@ static bool io_wqe_create_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
pr_warn_once("io-wq is not configured for unbound workers");
raw_spin_lock(&wqe->lock);
- if (acct->nr_workers == acct->max_workers) {
+ if (acct->nr_workers >= acct->max_workers) {
raw_spin_unlock(&wqe->lock);
return true;
}
@@ -323,8 +358,10 @@ static bool io_queue_worker_create(struct io_worker *worker,
init_task_work(&worker->create_work, func);
worker->create_index = acct->index;
- if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL))
+ if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL)) {
+ clear_bit_unlock(0, &worker->create_state);
return true;
+ }
clear_bit_unlock(0, &worker->create_state);
fail_release:
io_worker_release(worker);
@@ -716,11 +753,8 @@ static void io_workqueue_create(struct work_struct *work)
struct io_worker *worker = container_of(work, struct io_worker, work);
struct io_wqe_acct *acct = io_wqe_get_acct(worker);
- if (!io_queue_worker_create(worker, acct, create_worker_cont)) {
- clear_bit_unlock(0, &worker->create_state);
- io_worker_release(worker);
+ if (!io_queue_worker_create(worker, acct, create_worker_cont))
kfree(worker);
- }
}
static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
@@ -1150,17 +1184,9 @@ static void io_wq_exit_workers(struct io_wq *wq)
while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) {
struct io_worker *worker;
- struct io_wqe_acct *acct;
worker = container_of(cb, struct io_worker, create_work);
- acct = io_wqe_get_acct(worker);
- atomic_dec(&acct->nr_running);
- raw_spin_lock(&worker->wqe->lock);
- acct->nr_workers--;
- raw_spin_unlock(&worker->wqe->lock);
- io_worker_ref_put(wq);
- clear_bit_unlock(0, &worker->create_state);
- io_worker_release(worker);
+ io_worker_cancel_cb(worker);
}
rcu_read_lock();
@@ -1291,15 +1317,18 @@ int io_wq_max_workers(struct io_wq *wq, int *new_count)
rcu_read_lock();
for_each_node(node) {
+ struct io_wqe *wqe = wq->wqes[node];
struct io_wqe_acct *acct;
+ raw_spin_lock(&wqe->lock);
for (i = 0; i < IO_WQ_ACCT_NR; i++) {
- acct = &wq->wqes[node]->acct[i];
+ acct = &wqe->acct[i];
prev = max_t(int, acct->max_workers, prev);
if (new_count[i])
acct->max_workers = new_count[i];
new_count[i] = prev;
}
+ raw_spin_unlock(&wqe->lock);
}
rcu_read_unlock();
return 0;
diff --git a/fs/io-wq.h b/fs/io-wq.h
index bf5c4c533760..41bf37674a49 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -29,6 +29,17 @@ struct io_wq_work_list {
struct io_wq_work_node *last;
};
+#define wq_list_for_each(pos, prv, head) \
+ for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next)
+
+#define wq_list_for_each_resume(pos, prv) \
+ for (; pos; prv = pos, pos = (pos)->next)
+
+#define wq_list_empty(list) (READ_ONCE((list)->first) == NULL)
+#define INIT_WQ_LIST(list) do { \
+ (list)->first = NULL; \
+} while (0)
+
static inline void wq_list_add_after(struct io_wq_work_node *node,
struct io_wq_work_node *pos,
struct io_wq_work_list *list)
@@ -54,6 +65,15 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node,
}
}
+static inline void wq_list_add_head(struct io_wq_work_node *node,
+ struct io_wq_work_list *list)
+{
+ node->next = list->first;
+ if (!node->next)
+ list->last = node;
+ WRITE_ONCE(list->first, node);
+}
+
static inline void wq_list_cut(struct io_wq_work_list *list,
struct io_wq_work_node *last,
struct io_wq_work_node *prev)
@@ -69,6 +89,31 @@ static inline void wq_list_cut(struct io_wq_work_list *list,
last->next = NULL;
}
+static inline void __wq_list_splice(struct io_wq_work_list *list,
+ struct io_wq_work_node *to)
+{
+ list->last->next = to->next;
+ to->next = list->first;
+ INIT_WQ_LIST(list);
+}
+
+static inline bool wq_list_splice(struct io_wq_work_list *list,
+ struct io_wq_work_node *to)
+{
+ if (!wq_list_empty(list)) {
+ __wq_list_splice(list, to);
+ return true;
+ }
+ return false;
+}
+
+static inline void wq_stack_add_head(struct io_wq_work_node *node,
+ struct io_wq_work_node *stack)
+{
+ node->next = stack->next;
+ stack->next = node;
+}
+
static inline void wq_list_del(struct io_wq_work_list *list,
struct io_wq_work_node *node,
struct io_wq_work_node *prev)
@@ -76,14 +121,14 @@ static inline void wq_list_del(struct io_wq_work_list *list,
wq_list_cut(list, node, prev);
}
-#define wq_list_for_each(pos, prv, head) \
- for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next)
+static inline
+struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack)
+{
+ struct io_wq_work_node *node = stack->next;
-#define wq_list_empty(list) (READ_ONCE((list)->first) == NULL)
-#define INIT_WQ_LIST(list) do { \
- (list)->first = NULL; \
- (list)->last = NULL; \
-} while (0)
+ stack->next = node->next;
+ return node;
+}
struct io_wq_work {
struct io_wq_work_node list;
diff --git a/fs/io_uring.c b/fs/io_uring.c
index d4631a55a692..ca10dbb01201 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -103,11 +103,14 @@
#define IORING_MAX_REG_BUFFERS (1U << 14)
-#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
- IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
- IOSQE_BUFFER_SELECT)
+#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
+ IOSQE_IO_HARDLINK | IOSQE_ASYNC)
+
+#define SQE_VALID_FLAGS (SQE_COMMON_FLAGS|IOSQE_BUFFER_SELECT|IOSQE_IO_DRAIN)
+
#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
- REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS)
+ REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
+ REQ_F_ASYNC_DATA)
#define IO_TCTX_REFS_CACHE_NR (1U << 10)
@@ -195,8 +198,10 @@ struct io_rings {
};
enum io_uring_cmd_flags {
- IO_URING_F_NONBLOCK = 1,
- IO_URING_F_COMPLETE_DEFER = 2,
+ IO_URING_F_COMPLETE_DEFER = 1,
+ IO_URING_F_UNLOCKED = 2,
+ /* int's last bit, sign checks are usually faster than a bit test */
+ IO_URING_F_NONBLOCK = INT_MIN,
};
struct io_mapped_ubuf {
@@ -305,26 +310,16 @@ struct io_submit_link {
};
struct io_submit_state {
- struct blk_plug plug;
+ /* inline/task_work completion list, under ->uring_lock */
+ struct io_wq_work_node free_list;
+ /* batch completion logic */
+ struct io_wq_work_list compl_reqs;
struct io_submit_link link;
- /*
- * io_kiocb alloc cache
- */
- void *reqs[IO_REQ_CACHE_SIZE];
- unsigned int free_reqs;
-
bool plug_started;
-
- /*
- * Batch completion logic
- */
- struct io_kiocb *compl_reqs[IO_COMPL_BATCH];
- unsigned int compl_nr;
- /* inline/task_work completion list, under ->uring_lock */
- struct list_head free_list;
-
- unsigned int ios_left;
+ bool need_plug;
+ unsigned short submit_nr;
+ struct blk_plug plug;
};
struct io_ring_ctx {
@@ -368,6 +363,7 @@ struct io_ring_ctx {
* uring_lock, and updated through io_uring_register(2)
*/
struct io_rsrc_node *rsrc_node;
+ int rsrc_cached_refs;
struct io_file_table file_table;
unsigned nr_user_files;
unsigned nr_user_bufs;
@@ -384,7 +380,7 @@ struct io_ring_ctx {
} ____cacheline_aligned_in_smp;
/* IRQ completion list, under ->completion_lock */
- struct list_head locked_free_list;
+ struct io_wq_work_list locked_free_list;
unsigned int locked_free_nr;
const struct cred *sq_creds; /* cred used for __io_sq_thread() */
@@ -399,7 +395,6 @@ struct io_ring_ctx {
unsigned cached_cq_tail;
unsigned cq_entries;
struct eventfd_ctx *cq_ev_fd;
- struct wait_queue_head poll_wait;
struct wait_queue_head cq_wait;
unsigned cq_extra;
atomic_t cq_timeouts;
@@ -417,7 +412,7 @@ struct io_ring_ctx {
* For SQPOLL, only the single threaded io_sq_thread() will
* manipulate the list, hence no extra locking is needed there.
*/
- struct list_head iopoll_list;
+ struct io_wq_work_list iopoll_list;
struct hlist_head *cancel_hash;
unsigned cancel_hash_bits;
bool poll_multi_queue;
@@ -456,6 +451,8 @@ struct io_ring_ctx {
struct work_struct exit_work;
struct list_head tctx_list;
struct completion ref_comp;
+ u32 iowq_limits[2];
+ bool iowq_limits_set;
};
};
@@ -578,7 +575,6 @@ struct io_sr_msg {
int msg_flags;
int bgid;
size_t len;
- struct io_buffer *kbuf;
};
struct io_open {
@@ -690,11 +686,6 @@ struct io_hardlink {
int flags;
};
-struct io_completion {
- struct file *file;
- u32 cflags;
-};
-
struct io_async_connect {
struct sockaddr_storage address;
};
@@ -708,11 +699,15 @@ struct io_async_msghdr {
struct sockaddr_storage addr;
};
-struct io_async_rw {
- struct iovec fast_iov[UIO_FASTIOV];
- const struct iovec *free_iovec;
+struct io_rw_state {
struct iov_iter iter;
struct iov_iter_state iter_state;
+ struct iovec fast_iov[UIO_FASTIOV];
+};
+
+struct io_async_rw {
+ struct io_rw_state s;
+ const struct iovec *free_iovec;
size_t bytes_done;
struct wait_page_queue wpq;
};
@@ -739,9 +734,9 @@ enum {
REQ_F_CREDS_BIT,
REQ_F_REFCOUNT_BIT,
REQ_F_ARM_LTIMEOUT_BIT,
+ REQ_F_ASYNC_DATA_BIT,
/* keep async read/write and isreg together and in order */
- REQ_F_NOWAIT_READ_BIT,
- REQ_F_NOWAIT_WRITE_BIT,
+ REQ_F_SUPPORT_NOWAIT_BIT,
REQ_F_ISREG_BIT,
/* not a real bit, just to check we're not overflowing the space */
@@ -782,10 +777,8 @@ enum {
REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT),
/* caller should reissue async */
REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT),
- /* supports async reads */
- REQ_F_NOWAIT_READ = BIT(REQ_F_NOWAIT_READ_BIT),
- /* supports async writes */
- REQ_F_NOWAIT_WRITE = BIT(REQ_F_NOWAIT_WRITE_BIT),
+ /* supports async reads/writes */
+ REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT),
/* regular file */
REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
/* has creds assigned */
@@ -794,6 +787,8 @@ enum {
REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT),
/* there is a linked timeout that has to be armed */
REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT),
+ /* ->async_data allocated */
+ REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT),
};
struct async_poll {
@@ -850,39 +845,41 @@ struct io_kiocb {
struct io_mkdir mkdir;
struct io_symlink symlink;
struct io_hardlink hardlink;
- /* use only after cleaning per-op data, see io_clean_op() */
- struct io_completion compl;
};
- /* opcode allocated if it needs to store data for async defer */
- void *async_data;
u8 opcode;
/* polled IO has completed */
u8 iopoll_completed;
-
u16 buf_index;
+ unsigned int flags;
+
+ u64 user_data;
u32 result;
+ u32 cflags;
struct io_ring_ctx *ctx;
- unsigned int flags;
- atomic_t refs;
struct task_struct *task;
- u64 user_data;
- struct io_kiocb *link;
struct percpu_ref *fixed_rsrc_refs;
+ /* store used ubuf, so we can prevent reloading */
+ struct io_mapped_ubuf *imu;
- /* used with ctx->iopoll_list with reads/writes */
- struct list_head inflight_entry;
+ /* used by request caches, completion batching and iopoll */
+ struct io_wq_work_node comp_list;
+ atomic_t refs;
+ struct io_kiocb *link;
struct io_task_work io_task_work;
/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
struct hlist_node hash_node;
+ /* internal polling, see IORING_FEAT_FAST_POLL */
struct async_poll *apoll;
+ /* opcode allocated if it needs to store data for async defer */
+ void *async_data;
struct io_wq_work work;
+ /* custom credentials, valid IFF REQ_F_CREDS is set */
const struct cred *creds;
-
- /* store used ubuf, so we can prevent reloading */
- struct io_mapped_ubuf *imu;
+ /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
+ struct io_buffer *kbuf;
};
struct io_tctx_node {
@@ -900,12 +897,12 @@ struct io_defer_entry {
struct io_op_def {
/* needs req->file assigned */
unsigned needs_file : 1;
+ /* should block plug */
+ unsigned plug : 1;
/* hash wq insertion if file is a regular file */
unsigned hash_reg_file : 1;
/* unbound wq insertion if file is a non-regular file */
unsigned unbound_nonreg_file : 1;
- /* opcode is not supported by this kernel */
- unsigned not_supported : 1;
/* set if opcode supports polled "wait" */
unsigned pollin : 1;
unsigned pollout : 1;
@@ -913,8 +910,8 @@ struct io_op_def {
unsigned buffer_select : 1;
/* do prep async if is going to be punted */
unsigned needs_async_setup : 1;
- /* should block plug */
- unsigned plug : 1;
+ /* opcode is not supported by this kernel */
+ unsigned not_supported : 1;
/* size of async data needed, if any */
unsigned short async_size;
};
@@ -1078,7 +1075,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
- long res, unsigned int cflags);
+ s32 res, u32 cflags);
static void io_put_req(struct io_kiocb *req);
static void io_put_req_deferred(struct io_kiocb *req);
static void io_dismantle_req(struct io_kiocb *req);
@@ -1093,7 +1090,7 @@ static void __io_queue_sqe(struct io_kiocb *req);
static void io_rsrc_put_work(struct work_struct *work);
static void io_req_task_queue(struct io_kiocb *req);
-static void io_submit_flush_completions(struct io_ring_ctx *ctx);
+static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
static int io_req_prep_async(struct io_kiocb *req);
static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
@@ -1165,6 +1162,12 @@ static inline void req_ref_get(struct io_kiocb *req)
atomic_inc(&req->refs);
}
+static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
+{
+ if (!wq_list_empty(&ctx->submit_state.compl_reqs))
+ __io_submit_flush_completions(ctx);
+}
+
static inline void __io_req_set_refcount(struct io_kiocb *req, int nr)
{
if (!(req->flags & REQ_F_REFCOUNT)) {
@@ -1178,13 +1181,52 @@ static inline void io_req_set_refcount(struct io_kiocb *req)
__io_req_set_refcount(req, 1);
}
-static inline void io_req_set_rsrc_node(struct io_kiocb *req)
+#define IO_RSRC_REF_BATCH 100
+
+static inline void io_req_put_rsrc_locked(struct io_kiocb *req,
+ struct io_ring_ctx *ctx)
+ __must_hold(&ctx->uring_lock)
{
- struct io_ring_ctx *ctx = req->ctx;
+ struct percpu_ref *ref = req->fixed_rsrc_refs;
+
+ if (ref) {
+ if (ref == &ctx->rsrc_node->refs)
+ ctx->rsrc_cached_refs++;
+ else
+ percpu_ref_put(ref);
+ }
+}
+
+static inline void io_req_put_rsrc(struct io_kiocb *req, struct io_ring_ctx *ctx)
+{
+ if (req->fixed_rsrc_refs)
+ percpu_ref_put(req->fixed_rsrc_refs);
+}
+
+static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
+ __must_hold(&ctx->uring_lock)
+{
+ if (ctx->rsrc_cached_refs) {
+ percpu_ref_put_many(&ctx->rsrc_node->refs, ctx->rsrc_cached_refs);
+ ctx->rsrc_cached_refs = 0;
+ }
+}
+
+static void io_rsrc_refs_refill(struct io_ring_ctx *ctx)
+ __must_hold(&ctx->uring_lock)
+{
+ ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH;
+ percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH);
+}
+static inline void io_req_set_rsrc_node(struct io_kiocb *req,
+ struct io_ring_ctx *ctx)
+{
if (!req->fixed_rsrc_refs) {
req->fixed_rsrc_refs = &ctx->rsrc_node->refs;
- percpu_ref_get(req->fixed_rsrc_refs);
+ ctx->rsrc_cached_refs--;
+ if (unlikely(ctx->rsrc_cached_refs < 0))
+ io_rsrc_refs_refill(ctx);
}
}
@@ -1217,6 +1259,11 @@ static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
return false;
}
+static inline bool req_has_async_data(struct io_kiocb *req)
+{
+ return req->flags & REQ_F_ASYNC_DATA;
+}
+
static inline void req_set_fail(struct io_kiocb *req)
{
req->flags |= REQ_F_FAIL;
@@ -1228,7 +1275,7 @@ static inline void req_fail_link_node(struct io_kiocb *req, int res)
req->result = res;
}
-static void io_ring_ctx_ref_free(struct percpu_ref *ref)
+static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
{
struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
@@ -1240,7 +1287,7 @@ static inline bool io_is_timeout_noseq(struct io_kiocb *req)
return !req->timeout.off;
}
-static void io_fallback_req_func(struct work_struct *work)
+static __cold void io_fallback_req_func(struct work_struct *work)
{
struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
fallback_work.work);
@@ -1253,15 +1300,13 @@ static void io_fallback_req_func(struct work_struct *work)
req->io_task_work.func(req, &locked);
if (locked) {
- if (ctx->submit_state.compl_nr)
- io_submit_flush_completions(ctx);
+ io_submit_flush_completions(ctx);
mutex_unlock(&ctx->uring_lock);
}
percpu_ref_put(&ctx->refs);
-
}
-static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
+static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
struct io_ring_ctx *ctx;
int hash_bits;
@@ -1298,7 +1343,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
ctx->flags = p->flags;
init_waitqueue_head(&ctx->sqo_sq_wait);
INIT_LIST_HEAD(&ctx->sqd_list);
- init_waitqueue_head(&ctx->poll_wait);
INIT_LIST_HEAD(&ctx->cq_overflow_list);
init_completion(&ctx->ref_comp);
xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1);
@@ -1307,7 +1351,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
init_waitqueue_head(&ctx->cq_wait);
spin_lock_init(&ctx->completion_lock);
spin_lock_init(&ctx->timeout_lock);
- INIT_LIST_HEAD(&ctx->iopoll_list);
+ INIT_WQ_LIST(&ctx->iopoll_list);
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
INIT_LIST_HEAD(&ctx->ltimeout_list);
@@ -1316,9 +1360,10 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
init_llist_head(&ctx->rsrc_put_llist);
INIT_LIST_HEAD(&ctx->tctx_list);
- INIT_LIST_HEAD(&ctx->submit_state.free_list);
- INIT_LIST_HEAD(&ctx->locked_free_list);
+ ctx->submit_state.free_list.next = NULL;
+ INIT_WQ_LIST(&ctx->locked_free_list);
INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
+ INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
return ctx;
err:
kfree(ctx->dummy_ubuf);
@@ -1346,21 +1391,16 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq)
return false;
}
-#define FFS_ASYNC_READ 0x1UL
-#define FFS_ASYNC_WRITE 0x2UL
-#ifdef CONFIG_64BIT
-#define FFS_ISREG 0x4UL
-#else
-#define FFS_ISREG 0x0UL
-#endif
-#define FFS_MASK ~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG)
+#define FFS_NOWAIT 0x1UL
+#define FFS_ISREG 0x2UL
+#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG)
static inline bool io_req_ffs_set(struct io_kiocb *req)
{
- return IS_ENABLED(CONFIG_64BIT) && (req->flags & REQ_F_FIXED_FILE);
+ return req->flags & REQ_F_FIXED_FILE;
}
-static void io_req_track_inflight(struct io_kiocb *req)
+static inline void io_req_track_inflight(struct io_kiocb *req)
{
if (!(req->flags & REQ_F_INFLIGHT)) {
req->flags |= REQ_F_INFLIGHT;
@@ -1368,11 +1408,6 @@ static void io_req_track_inflight(struct io_kiocb *req)
}
}
-static inline void io_unprep_linked_timeout(struct io_kiocb *req)
-{
- req->flags &= ~REQ_F_LINK_TIMEOUT;
-}
-
static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
{
if (WARN_ON_ONCE(!req->link))
@@ -1443,15 +1478,19 @@ static void io_prep_async_link(struct io_kiocb *req)
}
}
-static void io_queue_async_work(struct io_kiocb *req, bool *locked)
+static inline void io_req_add_compl_list(struct io_kiocb *req)
+{
+ struct io_submit_state *state = &req->ctx->submit_state;
+
+ wq_list_add_tail(&req->comp_list, &state->compl_reqs);
+}
+
+static void io_queue_async_work(struct io_kiocb *req, bool *dont_use)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *link = io_prep_linked_timeout(req);
struct io_uring_task *tctx = req->task->io_uring;
- /* must not take the lock, NULL it as a precaution */
- locked = NULL;
-
BUG_ON(!tctx);
BUG_ON(!tctx->io_wq);
@@ -1492,7 +1531,7 @@ static void io_kill_timeout(struct io_kiocb *req, int status)
}
}
-static void io_queue_deferred(struct io_ring_ctx *ctx)
+static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
{
while (!list_empty(&ctx->defer_list)) {
struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
@@ -1506,7 +1545,7 @@ static void io_queue_deferred(struct io_ring_ctx *ctx)
}
}
-static void io_flush_timeouts(struct io_ring_ctx *ctx)
+static __cold void io_flush_timeouts(struct io_ring_ctx *ctx)
__must_hold(&ctx->completion_lock)
{
u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
@@ -1539,7 +1578,7 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx)
spin_unlock_irq(&ctx->timeout_lock);
}
-static void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
+static __cold void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
if (ctx->off_timeout_used)
io_flush_timeouts(ctx);
@@ -1609,12 +1648,8 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
*/
if (wq_has_sleeper(&ctx->cq_wait))
wake_up_all(&ctx->cq_wait);
- if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait))
- wake_up(&ctx->sq_data->wait);
if (io_should_trigger_evfd(ctx))
eventfd_signal(ctx->cq_ev_fd, 1);
- if (waitqueue_active(&ctx->poll_wait))
- wake_up_interruptible(&ctx->poll_wait);
}
static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
@@ -1628,8 +1663,6 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
}
if (io_should_trigger_evfd(ctx))
eventfd_signal(ctx->cq_ev_fd, 1);
- if (waitqueue_active(&ctx->poll_wait))
- wake_up_interruptible(&ctx->poll_wait);
}
/* Returns true if there are no backlogged entries after the flush */
@@ -1725,7 +1758,7 @@ static inline void io_get_task_refs(int nr)
}
static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
- long res, unsigned int cflags)
+ s32 res, u32 cflags)
{
struct io_overflow_cqe *ocqe;
@@ -1753,7 +1786,7 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
}
static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
- long res, unsigned int cflags)
+ s32 res, u32 cflags)
{
struct io_uring_cqe *cqe;
@@ -1776,13 +1809,13 @@ static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data
/* not as hot to bloat with inlining */
static noinline bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
- long res, unsigned int cflags)
+ s32 res, u32 cflags)
{
return __io_cqring_fill_event(ctx, user_data, res, cflags);
}
-static void io_req_complete_post(struct io_kiocb *req, long res,
- unsigned int cflags)
+static void io_req_complete_post(struct io_kiocb *req, s32 res,
+ u32 cflags)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -1801,40 +1834,27 @@ static void io_req_complete_post(struct io_kiocb *req, long res,
req->link = NULL;
}
}
+ io_req_put_rsrc(req, ctx);
io_dismantle_req(req);
io_put_task(req->task, 1);
- list_add(&req->inflight_entry, &ctx->locked_free_list);
+ wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
ctx->locked_free_nr++;
- } else {
- if (!percpu_ref_tryget(&ctx->refs))
- req = NULL;
}
io_commit_cqring(ctx);
spin_unlock(&ctx->completion_lock);
-
- if (req) {
- io_cqring_ev_posted(ctx);
- percpu_ref_put(&ctx->refs);
- }
-}
-
-static inline bool io_req_needs_clean(struct io_kiocb *req)
-{
- return req->flags & IO_REQ_CLEAN_FLAGS;
+ io_cqring_ev_posted(ctx);
}
-static void io_req_complete_state(struct io_kiocb *req, long res,
- unsigned int cflags)
+static inline void io_req_complete_state(struct io_kiocb *req, s32 res,
+ u32 cflags)
{
- if (io_req_needs_clean(req))
- io_clean_op(req);
req->result = res;
- req->compl.cflags = cflags;
+ req->cflags = cflags;
req->flags |= REQ_F_COMPLETE_INLINE;
}
static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
- long res, unsigned cflags)
+ s32 res, u32 cflags)
{
if (issue_flags & IO_URING_F_COMPLETE_DEFER)
io_req_complete_state(req, res, cflags);
@@ -1842,12 +1862,12 @@ static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
io_req_complete_post(req, res, cflags);
}
-static inline void io_req_complete(struct io_kiocb *req, long res)
+static inline void io_req_complete(struct io_kiocb *req, s32 res)
{
__io_req_complete(req, 0, res, 0);
}
-static void io_req_complete_failed(struct io_kiocb *req, long res)
+static void io_req_complete_failed(struct io_kiocb *req, s32 res)
{
req_set_fail(req);
io_req_complete_post(req, res, 0);
@@ -1881,7 +1901,7 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
struct io_submit_state *state)
{
spin_lock(&ctx->completion_lock);
- list_splice_init(&ctx->locked_free_list, &state->free_list);
+ wq_list_splice(&ctx->locked_free_list, &state->free_list);
ctx->locked_free_nr = 0;
spin_unlock(&ctx->completion_lock);
}
@@ -1890,7 +1910,6 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
{
struct io_submit_state *state = &ctx->submit_state;
- int nr;
/*
* If we have more than a batch's worth of requests in our IRQ side
@@ -1899,20 +1918,7 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
*/
if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH)
io_flush_cached_locked_reqs(ctx, state);
-
- nr = state->free_reqs;
- while (!list_empty(&state->free_list)) {
- struct io_kiocb *req = list_first_entry(&state->free_list,
- struct io_kiocb, inflight_entry);
-
- list_del(&req->inflight_entry);
- state->reqs[nr++] = req;
- if (nr == ARRAY_SIZE(state->reqs))
- break;
- }
-
- state->free_reqs = nr;
- return nr != 0;
+ return !!state->free_list.next;
}
/*
@@ -1921,38 +1927,54 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
* Because of that, io_alloc_req() should be called only under ->uring_lock
* and with extra caution to not get a request that is still worked on.
*/
-static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
+static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
__must_hold(&ctx->uring_lock)
{
struct io_submit_state *state = &ctx->submit_state;
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
+ void *reqs[IO_REQ_ALLOC_BATCH];
+ struct io_kiocb *req;
int ret, i;
- BUILD_BUG_ON(ARRAY_SIZE(state->reqs) < IO_REQ_ALLOC_BATCH);
-
- if (likely(state->free_reqs || io_flush_cached_reqs(ctx)))
- goto got_req;
+ if (likely(state->free_list.next || io_flush_cached_reqs(ctx)))
+ return true;
- ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH,
- state->reqs);
+ ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);
/*
* Bulk alloc is all-or-nothing. If we fail to get a batch,
* retry single alloc to be on the safe side.
*/
if (unlikely(ret <= 0)) {
- state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
- if (!state->reqs[0])
- return NULL;
+ reqs[0] = kmem_cache_alloc(req_cachep, gfp);
+ if (!reqs[0])
+ return false;
ret = 1;
}
- for (i = 0; i < ret; i++)
- io_preinit_req(state->reqs[i], ctx);
- state->free_reqs = ret;
-got_req:
- state->free_reqs--;
- return state->reqs[state->free_reqs];
+ percpu_ref_get_many(&ctx->refs, ret);
+ for (i = 0; i < ret; i++) {
+ req = reqs[i];
+
+ io_preinit_req(req, ctx);
+ wq_stack_add_head(&req->comp_list, &state->free_list);
+ }
+ return true;
+}
+
+static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx)
+{
+ if (unlikely(!ctx->submit_state.free_list.next))
+ return __io_alloc_req_refill(ctx);
+ return true;
+}
+
+static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
+{
+ struct io_wq_work_node *node;
+
+ node = wq_stack_extract(&ctx->submit_state.free_list);
+ return container_of(node, struct io_kiocb, comp_list);
}
static inline void io_put_file(struct file *file)
@@ -1961,35 +1983,28 @@ static inline void io_put_file(struct file *file)
fput(file);
}
-static void io_dismantle_req(struct io_kiocb *req)
+static inline void io_dismantle_req(struct io_kiocb *req)
{
unsigned int flags = req->flags;
- if (io_req_needs_clean(req))
+ if (unlikely(flags & IO_REQ_CLEAN_FLAGS))
io_clean_op(req);
if (!(flags & REQ_F_FIXED_FILE))
io_put_file(req->file);
- if (req->fixed_rsrc_refs)
- percpu_ref_put(req->fixed_rsrc_refs);
- if (req->async_data) {
- kfree(req->async_data);
- req->async_data = NULL;
- }
}
-static void __io_free_req(struct io_kiocb *req)
+static __cold void __io_free_req(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
+ io_req_put_rsrc(req, ctx);
io_dismantle_req(req);
io_put_task(req->task, 1);
spin_lock(&ctx->completion_lock);
- list_add(&req->inflight_entry, &ctx->locked_free_list);
+ wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
ctx->locked_free_nr++;
spin_unlock(&ctx->completion_lock);
-
- percpu_ref_put(&ctx->refs);
}
static inline void io_remove_next_linked(struct io_kiocb *req)
@@ -2075,47 +2090,45 @@ static bool io_disarm_next(struct io_kiocb *req)
return posted;
}
-static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
+static void __io_req_find_next_prep(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ bool posted;
+
+ spin_lock(&ctx->completion_lock);
+ posted = io_disarm_next(req);
+ if (posted)
+ io_commit_cqring(req->ctx);
+ spin_unlock(&ctx->completion_lock);
+ if (posted)
+ io_cqring_ev_posted(ctx);
+}
+
+static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
{
struct io_kiocb *nxt;
+ if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
+ return NULL;
/*
* If LINK is set, we have dependent requests in this chain. If we
* didn't fail this request, queue the first one up, moving any other
* dependencies to the next request. In case of failure, fail the rest
* of the chain.
*/
- if (req->flags & IO_DISARM_MASK) {
- struct io_ring_ctx *ctx = req->ctx;
- bool posted;
-
- spin_lock(&ctx->completion_lock);
- posted = io_disarm_next(req);
- if (posted)
- io_commit_cqring(req->ctx);
- spin_unlock(&ctx->completion_lock);
- if (posted)
- io_cqring_ev_posted(ctx);
- }
+ if (unlikely(req->flags & IO_DISARM_MASK))
+ __io_req_find_next_prep(req);
nxt = req->link;
req->link = NULL;
return nxt;
}
-static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
-{
- if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
- return NULL;
- return __io_req_find_next(req);
-}
-
static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
{
if (!ctx)
return;
if (*locked) {
- if (ctx->submit_state.compl_nr)
- io_submit_flush_completions(ctx);
+ io_submit_flush_completions(ctx);
mutex_unlock(&ctx->uring_lock);
*locked = false;
}
@@ -2132,7 +2145,7 @@ static void tctx_task_work(struct callback_head *cb)
while (1) {
struct io_wq_work_node *node;
- if (!tctx->task_list.first && locked && ctx->submit_state.compl_nr)
+ if (!tctx->task_list.first && locked)
io_submit_flush_completions(ctx);
spin_lock_irq(&tctx->task_lock);
@@ -2195,8 +2208,9 @@ static void io_req_task_work_add(struct io_kiocb *req)
* will do the job.
*/
notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL;
- if (!task_work_add(tsk, &tctx->task_work, notify)) {
- wake_up_process(tsk);
+ if (likely(!task_work_add(tsk, &tctx->task_work, notify))) {
+ if (notify == TWA_NONE)
+ wake_up_process(tsk);
return;
}
@@ -2274,77 +2288,62 @@ static void io_free_req_work(struct io_kiocb *req, bool *locked)
io_free_req(req);
}
-struct req_batch {
- struct task_struct *task;
- int task_refs;
- int ctx_refs;
-};
-
-static inline void io_init_req_batch(struct req_batch *rb)
+static void io_free_batch_list(struct io_ring_ctx *ctx,
+ struct io_wq_work_node *node)
+ __must_hold(&ctx->uring_lock)
{
- rb->task_refs = 0;
- rb->ctx_refs = 0;
- rb->task = NULL;
-}
+ struct task_struct *task = NULL;
+ int task_refs = 0;
-static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
- struct req_batch *rb)
-{
- if (rb->ctx_refs)
- percpu_ref_put_many(&ctx->refs, rb->ctx_refs);
- if (rb->task)
- io_put_task(rb->task, rb->task_refs);
-}
+ do {
+ struct io_kiocb *req = container_of(node, struct io_kiocb,
+ comp_list);
-static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req,
- struct io_submit_state *state)
-{
- io_queue_next(req);
- io_dismantle_req(req);
+ if (unlikely(req->flags & REQ_F_REFCOUNT)) {
+ node = req->comp_list.next;
+ if (!req_ref_put_and_test(req))
+ continue;
+ }
- if (req->task != rb->task) {
- if (rb->task)
- io_put_task(rb->task, rb->task_refs);
- rb->task = req->task;
- rb->task_refs = 0;
- }
- rb->task_refs++;
- rb->ctx_refs++;
+ io_req_put_rsrc_locked(req, ctx);
+ io_queue_next(req);
+ io_dismantle_req(req);
- if (state->free_reqs != ARRAY_SIZE(state->reqs))
- state->reqs[state->free_reqs++] = req;
- else
- list_add(&req->inflight_entry, &state->free_list);
+ if (req->task != task) {
+ if (task)
+ io_put_task(task, task_refs);
+ task = req->task;
+ task_refs = 0;
+ }
+ task_refs++;
+ node = req->comp_list.next;
+ wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
+ } while (node);
+
+ if (task)
+ io_put_task(task, task_refs);
}
-static void io_submit_flush_completions(struct io_ring_ctx *ctx)
+static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
__must_hold(&ctx->uring_lock)
{
+ struct io_wq_work_node *node, *prev;
struct io_submit_state *state = &ctx->submit_state;
- int i, nr = state->compl_nr;
- struct req_batch rb;
spin_lock(&ctx->completion_lock);
- for (i = 0; i < nr; i++) {
- struct io_kiocb *req = state->compl_reqs[i];
+ wq_list_for_each(node, prev, &state->compl_reqs) {
+ struct io_kiocb *req = container_of(node, struct io_kiocb,
+ comp_list);
__io_cqring_fill_event(ctx, req->user_data, req->result,
- req->compl.cflags);
+ req->cflags);
}
io_commit_cqring(ctx);
spin_unlock(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
- io_init_req_batch(&rb);
- for (i = 0; i < nr; i++) {
- struct io_kiocb *req = state->compl_reqs[i];
-
- if (req_ref_put_and_test(req))
- io_req_free_batch(&rb, req, &ctx->submit_state);
- }
-
- io_req_free_batch_finish(ctx, &rb);
- state->compl_nr = 0;
+ io_free_batch_list(ctx, state->compl_reqs.first);
+ INIT_WQ_LIST(&state->compl_reqs);
}
/*
@@ -2404,12 +2403,9 @@ static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf)
static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
{
- struct io_buffer *kbuf;
-
if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
return 0;
- kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
- return io_put_kbuf(req, kbuf);
+ return io_put_kbuf(req, req->kbuf);
}
static inline bool io_run_task_work(void)
@@ -2423,52 +2419,22 @@ static inline bool io_run_task_work(void)
return false;
}
-/*
- * Find and free completed poll iocbs
- */
-static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
- struct list_head *done)
+static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
{
- struct req_batch rb;
- struct io_kiocb *req;
-
- /* order with ->result store in io_complete_rw_iopoll() */
- smp_rmb();
-
- io_init_req_batch(&rb);
- while (!list_empty(done)) {
- req = list_first_entry(done, struct io_kiocb, inflight_entry);
- list_del(&req->inflight_entry);
-
- __io_cqring_fill_event(ctx, req->user_data, req->result,
- io_put_rw_kbuf(req));
- (*nr_events)++;
-
- if (req_ref_put_and_test(req))
- io_req_free_batch(&rb, req, &ctx->submit_state);
- }
-
- io_commit_cqring(ctx);
- io_cqring_ev_posted_iopoll(ctx);
- io_req_free_batch_finish(ctx, &rb);
-}
-
-static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
- long min)
-{
- struct io_kiocb *req, *tmp;
+ struct io_wq_work_node *pos, *start, *prev;
unsigned int poll_flags = BLK_POLL_NOSLEEP;
DEFINE_IO_COMP_BATCH(iob);
- LIST_HEAD(done);
+ int nr_events = 0;
/*
* Only spin for completions if we don't have multiple devices hanging
- * off our complete list, and we're under the requested amount.
+ * off our complete list.
*/
- if (ctx->poll_multi_queue || *nr_events >= min)
+ if (ctx->poll_multi_queue || force_nonspin)
poll_flags |= BLK_POLL_ONESHOT;
- list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) {
+ wq_list_for_each(pos, start, &ctx->iopoll_list) {
+ struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
struct kiocb *kiocb = &req->rw.kiocb;
int ret;
@@ -2477,11 +2443,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
* If we find a request that requires polling, break out
* and complete those lists first, if we have entries there.
*/
- if (READ_ONCE(req->iopoll_completed)) {
- list_move_tail(&req->inflight_entry, &done);
- continue;
- }
- if (!list_empty(&done))
+ if (READ_ONCE(req->iopoll_completed))
break;
ret = kiocb->ki_filp->f_op->iopoll(kiocb, &iob, poll_flags);
@@ -2493,34 +2455,50 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
/* iopoll may have completed current req */
if (!rq_list_empty(iob.req_list) ||
READ_ONCE(req->iopoll_completed))
- list_move_tail(&req->inflight_entry, &done);
+ break;
}
if (!rq_list_empty(iob.req_list))
iob.complete(&iob);
- if (!list_empty(&done))
- io_iopoll_complete(ctx, nr_events, &done);
+ else if (!pos)
+ return 0;
- return 0;
+ prev = start;
+ wq_list_for_each_resume(pos, prev) {
+ struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
+
+ /* order with io_complete_rw_iopoll(), e.g. ->result updates */
+ if (!smp_load_acquire(&req->iopoll_completed))
+ break;
+ __io_cqring_fill_event(ctx, req->user_data, req->result,
+ io_put_rw_kbuf(req));
+ nr_events++;
+ }
+
+ if (unlikely(!nr_events))
+ return 0;
+
+ io_commit_cqring(ctx);
+ io_cqring_ev_posted_iopoll(ctx);
+ pos = start ? start->next : ctx->iopoll_list.first;
+ wq_list_cut(&ctx->iopoll_list, prev, start);
+ io_free_batch_list(ctx, pos);
+ return nr_events;
}
/*
* We can't just wait for polled events to come to us, we have to actively
* find and complete them.
*/
-static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
+static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
{
if (!(ctx->flags & IORING_SETUP_IOPOLL))
return;
mutex_lock(&ctx->uring_lock);
- while (!list_empty(&ctx->iopoll_list)) {
- unsigned int nr_events = 0;
-
- io_do_iopoll(ctx, &nr_events, 0);
-
+ while (!wq_list_empty(&ctx->iopoll_list)) {
/* let it sleep and repeat later if can't complete a request */
- if (nr_events == 0)
+ if (io_do_iopoll(ctx, true) == 0)
break;
/*
* Ensure we allow local-to-the-cpu processing to take place,
@@ -2567,7 +2545,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
* forever, while the workqueue is stuck trying to acquire the
* very same mutex.
*/
- if (list_empty(&ctx->iopoll_list)) {
+ if (wq_list_empty(&ctx->iopoll_list)) {
u32 tail = ctx->cached_cq_tail;
mutex_unlock(&ctx->uring_lock);
@@ -2576,11 +2554,15 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
/* some requests don't go through iopoll_list */
if (tail != ctx->cached_cq_tail ||
- list_empty(&ctx->iopoll_list))
+ wq_list_empty(&ctx->iopoll_list))
break;
}
- ret = io_do_iopoll(ctx, &nr_events, min);
- } while (!ret && nr_events < min && !need_resched());
+ ret = io_do_iopoll(ctx, !min);
+ if (ret < 0)
+ break;
+ nr_events += ret;
+ ret = 0;
+ } while (nr_events < min && !need_resched());
out:
mutex_unlock(&ctx->uring_lock);
return ret;
@@ -2605,9 +2587,9 @@ static bool io_resubmit_prep(struct io_kiocb *req)
{
struct io_async_rw *rw = req->async_data;
- if (!rw)
+ if (!req_has_async_data(req))
return !io_req_prep_async(req);
- iov_iter_restore(&rw->iter, &rw->iter_state);
+ iov_iter_restore(&rw->s.iter, &rw->s.iter_state);
return true;
}
@@ -2651,7 +2633,7 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res)
{
if (req->rw.kiocb.ki_flags & IOCB_WRITE)
kiocb_end_write(req);
- if (res != req->result) {
+ if (unlikely(res != req->result)) {
if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
io_rw_should_reissue(req)) {
req->flags |= REQ_F_REISSUE;
@@ -2666,16 +2648,11 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res)
static void io_req_task_complete(struct io_kiocb *req, bool *locked)
{
unsigned int cflags = io_put_rw_kbuf(req);
- long res = req->result;
+ int res = req->result;
if (*locked) {
- struct io_ring_ctx *ctx = req->ctx;
- struct io_submit_state *state = &ctx->submit_state;
-
io_req_complete_state(req, res, cflags);
- state->compl_reqs[state->compl_nr++] = req;
- if (state->compl_nr == ARRAY_SIZE(state->compl_reqs))
- io_submit_flush_completions(ctx);
+ io_req_add_compl_list(req);
} else {
io_req_complete_post(req, res, cflags);
}
@@ -2711,12 +2688,11 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
req->flags |= REQ_F_REISSUE;
return;
}
+ req->result = res;
}
- WRITE_ONCE(req->result, res);
- /* order with io_iopoll_complete() checking ->result */
- smp_wmb();
- WRITE_ONCE(req->iopoll_completed, 1);
+ /* order with io_iopoll_complete() checking ->iopoll_completed */
+ smp_store_release(&req->iopoll_completed, 1);
}
/*
@@ -2725,13 +2701,13 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
* find it from a io_do_iopoll() thread before the issuer is done
* accessing the kiocb cookie.
*/
-static void io_iopoll_req_issued(struct io_kiocb *req)
+static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
- const bool in_async = io_wq_current_is_worker();
+ const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
/* workqueue context doesn't hold uring_lock, grab it now */
- if (unlikely(in_async))
+ if (unlikely(needs_lock))
mutex_lock(&ctx->uring_lock);
/*
@@ -2739,14 +2715,13 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
* how we do polling eventually, not spinning if we're on potentially
* different devices.
*/
- if (list_empty(&ctx->iopoll_list)) {
+ if (wq_list_empty(&ctx->iopoll_list)) {
ctx->poll_multi_queue = false;
} else if (!ctx->poll_multi_queue) {
struct io_kiocb *list_req;
- list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb,
- inflight_entry);
-
+ list_req = container_of(ctx->iopoll_list.first, struct io_kiocb,
+ comp_list);
if (list_req->file != req->file)
ctx->poll_multi_queue = true;
}
@@ -2756,11 +2731,11 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
* it to the front so we find it first.
*/
if (READ_ONCE(req->iopoll_completed))
- list_add(&req->inflight_entry, &ctx->iopoll_list);
+ wq_list_add_head(&req->comp_list, &ctx->iopoll_list);
else
- list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
+ wq_list_add_tail(&req->comp_list, &ctx->iopoll_list);
- if (unlikely(in_async)) {
+ if (unlikely(needs_lock)) {
/*
* If IORING_SETUP_SQPOLL is enabled, sqes are either handle
* in sq thread task context or in io worker task context. If
@@ -2785,10 +2760,8 @@ static bool io_bdev_nowait(struct block_device *bdev)
* any file. For now, just ensure that anything potentially problematic is done
* inline.
*/
-static bool __io_file_supports_nowait(struct file *file, int rw)
+static bool __io_file_supports_nowait(struct file *file, umode_t mode)
{
- umode_t mode = file_inode(file)->i_mode;
-
if (S_ISBLK(mode)) {
if (IS_ENABLED(CONFIG_BLOCK) &&
io_bdev_nowait(I_BDEV(file->f_mapping->host)))
@@ -2808,28 +2781,32 @@ static bool __io_file_supports_nowait(struct file *file, int rw)
/* any ->read/write should understand O_NONBLOCK */
if (file->f_flags & O_NONBLOCK)
return true;
+ return file->f_mode & FMODE_NOWAIT;
+}
- if (!(file->f_mode & FMODE_NOWAIT))
- return false;
-
- if (rw == READ)
- return file->f_op->read_iter != NULL;
+/*
+ * If we tracked the file through the SCM inflight mechanism, we could support
+ * any file. For now, just ensure that anything potentially problematic is done
+ * inline.
+ */
+static unsigned int io_file_get_flags(struct file *file)
+{
+ umode_t mode = file_inode(file)->i_mode;
+ unsigned int res = 0;
- return file->f_op->write_iter != NULL;
+ if (S_ISREG(mode))
+ res |= FFS_ISREG;
+ if (__io_file_supports_nowait(file, mode))
+ res |= FFS_NOWAIT;
+ return res;
}
-static bool io_file_supports_nowait(struct io_kiocb *req, int rw)
+static inline bool io_file_supports_nowait(struct io_kiocb *req)
{
- if (rw == READ && (req->flags & REQ_F_NOWAIT_READ))
- return true;
- else if (rw == WRITE && (req->flags & REQ_F_NOWAIT_WRITE))
- return true;
-
- return __io_file_supports_nowait(req->file, rw);
+ return req->flags & REQ_F_SUPPORT_NOWAIT;
}
-static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- int rw)
+static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_ring_ctx *ctx = req->ctx;
struct kiocb *kiocb = &req->rw.kiocb;
@@ -2837,16 +2814,15 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
unsigned ioprio;
int ret;
- if (!io_req_ffs_set(req) && S_ISREG(file_inode(file)->i_mode))
- req->flags |= REQ_F_ISREG;
+ if (!io_req_ffs_set(req))
+ req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT;
kiocb->ki_pos = READ_ONCE(sqe->off);
if (kiocb->ki_pos == -1 && !(file->f_mode & FMODE_STREAM)) {
req->flags |= REQ_F_CUR_POS;
kiocb->ki_pos = file->f_pos;
}
- kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
- kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
+ kiocb->ki_flags = iocb_flags(file);
ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
if (unlikely(ret))
return ret;
@@ -2857,22 +2833,11 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
* reliably. If not, or it IOCB_NOWAIT is set, don't retry.
*/
if ((kiocb->ki_flags & IOCB_NOWAIT) ||
- ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req, rw)))
+ ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req)))
req->flags |= REQ_F_NOWAIT;
- ioprio = READ_ONCE(sqe->ioprio);
- if (ioprio) {
- ret = ioprio_check_cap(ioprio);
- if (ret)
- return ret;
-
- kiocb->ki_ioprio = ioprio;
- } else
- kiocb->ki_ioprio = get_current_ioprio();
-
if (ctx->flags & IORING_SETUP_IOPOLL) {
- if (!(kiocb->ki_flags & IOCB_DIRECT) ||
- !kiocb->ki_filp->f_op->iopoll)
+ if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
return -EOPNOTSUPP;
kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE;
@@ -2884,12 +2849,18 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
kiocb->ki_complete = io_complete_rw;
}
- if (req->opcode == IORING_OP_READ_FIXED ||
- req->opcode == IORING_OP_WRITE_FIXED) {
- req->imu = NULL;
- io_req_set_rsrc_node(req);
+ ioprio = READ_ONCE(sqe->ioprio);
+ if (ioprio) {
+ ret = ioprio_check_cap(ioprio);
+ if (ret)
+ return ret;
+
+ kiocb->ki_ioprio = ioprio;
+ } else {
+ kiocb->ki_ioprio = get_current_ioprio();
}
+ req->imu = NULL;
req->rw.addr = READ_ONCE(sqe->addr);
req->rw.len = READ_ONCE(sqe->len);
req->buf_index = READ_ONCE(sqe->buf_index);
@@ -2924,7 +2895,7 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
struct io_async_rw *io = req->async_data;
/* add previously done IO, if any */
- if (io && io->bytes_done > 0) {
+ if (req_has_async_data(req) && io->bytes_done > 0) {
if (ret < 0)
ret = io->bytes_done;
else
@@ -2947,7 +2918,7 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
struct io_ring_ctx *ctx = req->ctx;
req_set_fail(req);
- if (!(issue_flags & IO_URING_F_NONBLOCK)) {
+ if (issue_flags & IO_URING_F_UNLOCKED) {
mutex_lock(&ctx->uring_lock);
__io_req_complete(req, issue_flags, ret, cflags);
mutex_unlock(&ctx->uring_lock);
@@ -3018,13 +2989,15 @@ static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter
static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
{
- struct io_ring_ctx *ctx = req->ctx;
struct io_mapped_ubuf *imu = req->imu;
u16 index, buf_index = req->buf_index;
if (likely(!imu)) {
+ struct io_ring_ctx *ctx = req->ctx;
+
if (unlikely(buf_index >= ctx->nr_user_bufs))
return -EFAULT;
+ io_req_set_rsrc_node(req, ctx);
index = array_index_nospec(buf_index, ctx->nr_user_bufs);
imu = READ_ONCE(ctx->user_bufs[index]);
req->imu = imu;
@@ -3051,10 +3024,11 @@ static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
}
static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
- int bgid, struct io_buffer *kbuf,
- bool needs_lock)
+ int bgid, unsigned int issue_flags)
{
+ struct io_buffer *kbuf = req->kbuf;
struct io_buffer *head;
+ bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
if (req->flags & REQ_F_BUFFER_SELECTED)
return kbuf;
@@ -3075,34 +3049,32 @@ static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
}
if (*len > kbuf->len)
*len = kbuf->len;
+ req->flags |= REQ_F_BUFFER_SELECTED;
+ req->kbuf = kbuf;
} else {
kbuf = ERR_PTR(-ENOBUFS);
}
io_ring_submit_unlock(req->ctx, needs_lock);
-
return kbuf;
}
static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
- bool needs_lock)
+ unsigned int issue_flags)
{
struct io_buffer *kbuf;
u16 bgid;
- kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
bgid = req->buf_index;
- kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock);
+ kbuf = io_buffer_select(req, len, bgid, issue_flags);
if (IS_ERR(kbuf))
return kbuf;
- req->rw.addr = (u64) (unsigned long) kbuf;
- req->flags |= REQ_F_BUFFER_SELECTED;
return u64_to_user_ptr(kbuf->addr);
}
#ifdef CONFIG_COMPAT
static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
- bool needs_lock)
+ unsigned int issue_flags)
{
struct compat_iovec __user *uiov;
compat_ssize_t clen;
@@ -3118,7 +3090,7 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
return -EINVAL;
len = clen;
- buf = io_rw_buffer_select(req, &len, needs_lock);
+ buf = io_rw_buffer_select(req, &len, issue_flags);
if (IS_ERR(buf))
return PTR_ERR(buf);
iov[0].iov_base = buf;
@@ -3128,7 +3100,7 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
#endif
static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
- bool needs_lock)
+ unsigned int issue_flags)
{
struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
void __user *buf;
@@ -3140,7 +3112,7 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
len = iov[0].iov_len;
if (len < 0)
return -EINVAL;
- buf = io_rw_buffer_select(req, &len, needs_lock);
+ buf = io_rw_buffer_select(req, &len, issue_flags);
if (IS_ERR(buf))
return PTR_ERR(buf);
iov[0].iov_base = buf;
@@ -3149,12 +3121,11 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
}
static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
- bool needs_lock)
+ unsigned int issue_flags)
{
if (req->flags & REQ_F_BUFFER_SELECTED) {
- struct io_buffer *kbuf;
+ struct io_buffer *kbuf = req->kbuf;
- kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
iov[0].iov_len = kbuf->len;
return 0;
@@ -3164,52 +3135,72 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
#ifdef CONFIG_COMPAT
if (req->ctx->compat)
- return io_compat_import(req, iov, needs_lock);
+ return io_compat_import(req, iov, issue_flags);
#endif
- return __io_iov_buffer_select(req, iov, needs_lock);
+ return __io_iov_buffer_select(req, iov, issue_flags);
}
-static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
- struct iov_iter *iter, bool needs_lock)
+static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req,
+ struct io_rw_state *s,
+ unsigned int issue_flags)
{
- void __user *buf = u64_to_user_ptr(req->rw.addr);
- size_t sqe_len = req->rw.len;
+ struct iov_iter *iter = &s->iter;
u8 opcode = req->opcode;
+ struct iovec *iovec;
+ void __user *buf;
+ size_t sqe_len;
ssize_t ret;
- if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
- *iovec = NULL;
- return io_import_fixed(req, rw, iter);
- }
+ BUILD_BUG_ON(ERR_PTR(0) != NULL);
+
+ if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED)
+ return ERR_PTR(io_import_fixed(req, rw, iter));
/* buffer index only valid with fixed read/write, or buffer select */
- if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
- return -EINVAL;
+ if (unlikely(req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT)))
+ return ERR_PTR(-EINVAL);
+
+ buf = u64_to_user_ptr(req->rw.addr);
+ sqe_len = req->rw.len;
if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
if (req->flags & REQ_F_BUFFER_SELECT) {
- buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
+ buf = io_rw_buffer_select(req, &sqe_len, issue_flags);
if (IS_ERR(buf))
- return PTR_ERR(buf);
+ return ERR_CAST(buf);
req->rw.len = sqe_len;
}
- ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
- *iovec = NULL;
- return ret;
+ ret = import_single_range(rw, buf, sqe_len, s->fast_iov, iter);
+ return ERR_PTR(ret);
}
+ iovec = s->fast_iov;
if (req->flags & REQ_F_BUFFER_SELECT) {
- ret = io_iov_buffer_select(req, *iovec, needs_lock);
+ ret = io_iov_buffer_select(req, iovec, issue_flags);
if (!ret)
- iov_iter_init(iter, rw, *iovec, 1, (*iovec)->iov_len);
- *iovec = NULL;
- return ret;
+ iov_iter_init(iter, rw, iovec, 1, iovec->iov_len);
+ return ERR_PTR(ret);
}
- return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter,
+ ret = __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, &iovec, iter,
req->ctx->compat);
+ if (unlikely(ret < 0))
+ return ERR_PTR(ret);
+ return iovec;
+}
+
+static inline int io_import_iovec(int rw, struct io_kiocb *req,
+ struct iovec **iovec, struct io_rw_state *s,
+ unsigned int issue_flags)
+{
+ *iovec = __io_import_iovec(rw, req, s, issue_flags);
+ if (unlikely(IS_ERR(*iovec)))
+ return PTR_ERR(*iovec);
+
+ iov_iter_save_state(&s->iter, &s->iter_state);
+ return 0;
}
static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
@@ -3234,7 +3225,8 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
*/
if (kiocb->ki_flags & IOCB_HIPRI)
return -EOPNOTSUPP;
- if (kiocb->ki_flags & IOCB_NOWAIT)
+ if ((kiocb->ki_flags & IOCB_NOWAIT) &&
+ !(kiocb->ki_filp->f_flags & O_NONBLOCK))
return -EAGAIN;
while (iov_iter_count(iter)) {
@@ -3280,7 +3272,7 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
{
struct io_async_rw *rw = req->async_data;
- memcpy(&rw->iter, iter, sizeof(*iter));
+ memcpy(&rw->s.iter, iter, sizeof(*iter));
rw->free_iovec = iovec;
rw->bytes_done = 0;
/* can only be fixed buffers, no need to do anything */
@@ -3289,33 +3281,36 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
if (!iovec) {
unsigned iov_off = 0;
- rw->iter.iov = rw->fast_iov;
+ rw->s.iter.iov = rw->s.fast_iov;
if (iter->iov != fast_iov) {
iov_off = iter->iov - fast_iov;
- rw->iter.iov += iov_off;
+ rw->s.iter.iov += iov_off;
}
- if (rw->fast_iov != fast_iov)
- memcpy(rw->fast_iov + iov_off, fast_iov + iov_off,
+ if (rw->s.fast_iov != fast_iov)
+ memcpy(rw->s.fast_iov + iov_off, fast_iov + iov_off,
sizeof(struct iovec) * iter->nr_segs);
} else {
req->flags |= REQ_F_NEED_CLEANUP;
}
}
-static inline int io_alloc_async_data(struct io_kiocb *req)
+static inline bool io_alloc_async_data(struct io_kiocb *req)
{
WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
- return req->async_data == NULL;
+ if (req->async_data) {
+ req->flags |= REQ_F_ASYNC_DATA;
+ return false;
+ }
+ return true;
}
static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
- const struct iovec *fast_iov,
- struct iov_iter *iter, bool force)
+ struct io_rw_state *s, bool force)
{
if (!force && !io_op_defs[req->opcode].needs_async_setup)
return 0;
- if (!req->async_data) {
+ if (!req_has_async_data(req)) {
struct io_async_rw *iorw;
if (io_alloc_async_data(req)) {
@@ -3323,10 +3318,10 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
return -ENOMEM;
}
- io_req_map_rw(req, iovec, fast_iov, iter);
+ io_req_map_rw(req, iovec, s->fast_iov, &s->iter);
iorw = req->async_data;
/* we've copied and mapped the iter, ensure state is saved */
- iov_iter_save_state(&iorw->iter, &iorw->iter_state);
+ iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state);
}
return 0;
}
@@ -3334,10 +3329,11 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
{
struct io_async_rw *iorw = req->async_data;
- struct iovec *iov = iorw->fast_iov;
+ struct iovec *iov;
int ret;
- ret = io_import_iovec(rw, req, &iov, &iorw->iter, false);
+ /* submission path, ->uring_lock should already be taken */
+ ret = io_import_iovec(rw, req, &iov, &iorw->s, 0);
if (unlikely(ret < 0))
return ret;
@@ -3345,7 +3341,6 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
iorw->free_iovec = iov;
if (iov)
req->flags |= REQ_F_NEED_CLEANUP;
- iov_iter_save_state(&iorw->iter, &iorw->iter_state);
return 0;
}
@@ -3353,11 +3348,11 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
if (unlikely(!(req->file->f_mode & FMODE_READ)))
return -EBADF;
- return io_prep_rw(req, sqe, READ);
+ return io_prep_rw(req, sqe);
}
/*
- * This is our waitqueue callback handler, registered through lock_page_async()
+ * This is our waitqueue callback handler, registered through __folio_lock_async()
* when we initially tried to do the IO with the iocb armed our waitqueue.
* This gets called when the page is unlocked, and we generally expect that to
* happen when the page IO is completed and the page is now uptodate. This will
@@ -3429,7 +3424,7 @@ static bool io_rw_should_retry(struct io_kiocb *req)
static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
{
- if (req->file->f_op->read_iter)
+ if (likely(req->file->f_op->read_iter))
return call_read_iter(req->file, &req->rw.kiocb, iter);
else if (req->file->f_op->read)
return loop_rw_iter(READ, req, iter);
@@ -3445,43 +3440,40 @@ static bool need_read_all(struct io_kiocb *req)
static int io_read(struct io_kiocb *req, unsigned int issue_flags)
{
- struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
+ struct io_rw_state __s, *s = &__s;
+ struct iovec *iovec;
struct kiocb *kiocb = &req->rw.kiocb;
- struct iov_iter __iter, *iter = &__iter;
- struct io_async_rw *rw = req->async_data;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
- struct iov_iter_state __state, *state;
+ struct io_async_rw *rw;
ssize_t ret, ret2;
- if (rw) {
- iter = &rw->iter;
- state = &rw->iter_state;
+ if (!req_has_async_data(req)) {
+ ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
+ if (unlikely(ret < 0))
+ return ret;
+ } else {
+ rw = req->async_data;
+ s = &rw->s;
/*
* We come here from an earlier attempt, restore our state to
* match in case it doesn't. It's cheap enough that we don't
* need to make this conditional.
*/
- iov_iter_restore(iter, state);
+ iov_iter_restore(&s->iter, &s->iter_state);
iovec = NULL;
- } else {
- ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
- if (ret < 0)
- return ret;
- state = &__state;
- iov_iter_save_state(iter, state);
}
- req->result = iov_iter_count(iter);
+ req->result = iov_iter_count(&s->iter);
- /* Ensure we clear previously set non-block flag */
- if (!force_nonblock)
- kiocb->ki_flags &= ~IOCB_NOWAIT;
- else
+ if (force_nonblock) {
+ /* If the file doesn't support async, just async punt */
+ if (unlikely(!io_file_supports_nowait(req))) {
+ ret = io_setup_async_rw(req, iovec, s, true);
+ return ret ?: -EAGAIN;
+ }
kiocb->ki_flags |= IOCB_NOWAIT;
-
- /* If the file doesn't support async, just async punt */
- if (force_nonblock && !io_file_supports_nowait(req, READ)) {
- ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
- return ret ?: -EAGAIN;
+ } else {
+ /* Ensure we clear previously set non-block flag */
+ kiocb->ki_flags &= ~IOCB_NOWAIT;
}
ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), req->result);
@@ -3490,7 +3482,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
return ret;
}
- ret = io_iter_do_read(req, iter);
+ ret = io_iter_do_read(req, &s->iter);
if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
req->flags &= ~REQ_F_REISSUE;
@@ -3503,7 +3495,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
ret = 0;
} else if (ret == -EIOCBQUEUED) {
goto out_free;
- } else if (ret <= 0 || ret == req->result || !force_nonblock ||
+ } else if (ret == req->result || ret <= 0 || !force_nonblock ||
(req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {
/* read all, failed, already did sync or don't want to retry */
goto done;
@@ -3514,22 +3506,19 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
* untouched in case of error. Restore it and we'll advance it
* manually if we need to.
*/
- iov_iter_restore(iter, state);
+ iov_iter_restore(&s->iter, &s->iter_state);
- ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
+ ret2 = io_setup_async_rw(req, iovec, s, true);
if (ret2)
return ret2;
iovec = NULL;
rw = req->async_data;
+ s = &rw->s;
/*
* Now use our persistent iterator and state, if we aren't already.
* We've restored and mapped the iter to match.
*/
- if (iter != &rw->iter) {
- iter = &rw->iter;
- state = &rw->iter_state;
- }
do {
/*
@@ -3537,11 +3526,11 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
* above or inside this loop. Advance the iter by the bytes
* that were consumed.
*/
- iov_iter_advance(iter, ret);
- if (!iov_iter_count(iter))
+ iov_iter_advance(&s->iter, ret);
+ if (!iov_iter_count(&s->iter))
break;
rw->bytes_done += ret;
- iov_iter_save_state(iter, state);
+ iov_iter_save_state(&s->iter, &s->iter_state);
/* if we can retry, do so with the callbacks armed */
if (!io_rw_should_retry(req)) {
@@ -3555,12 +3544,12 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
* desired page gets unlocked. We can also get a partial read
* here, and if we do, then just retry at the new offset.
*/
- ret = io_iter_do_read(req, iter);
+ ret = io_iter_do_read(req, &s->iter);
if (ret == -EIOCBQUEUED)
return 0;
/* we got some bytes, but not all. retry. */
kiocb->ki_flags &= ~IOCB_WAITQ;
- iov_iter_restore(iter, state);
+ iov_iter_restore(&s->iter, &s->iter_state);
} while (ret > 0);
done:
kiocb_done(kiocb, ret, issue_flags);
@@ -3575,47 +3564,46 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
return -EBADF;
- return io_prep_rw(req, sqe, WRITE);
+ req->rw.kiocb.ki_hint = ki_hint_validate(file_write_hint(req->file));
+ return io_prep_rw(req, sqe);
}
static int io_write(struct io_kiocb *req, unsigned int issue_flags)
{
- struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
+ struct io_rw_state __s, *s = &__s;
+ struct iovec *iovec;
struct kiocb *kiocb = &req->rw.kiocb;
- struct iov_iter __iter, *iter = &__iter;
- struct io_async_rw *rw = req->async_data;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
- struct iov_iter_state __state, *state;
ssize_t ret, ret2;
- if (rw) {
- iter = &rw->iter;
- state = &rw->iter_state;
- iov_iter_restore(iter, state);
- iovec = NULL;
- } else {
- ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
- if (ret < 0)
+ if (!req_has_async_data(req)) {
+ ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags);
+ if (unlikely(ret < 0))
return ret;
- state = &__state;
- iov_iter_save_state(iter, state);
+ } else {
+ struct io_async_rw *rw = req->async_data;
+
+ s = &rw->s;
+ iov_iter_restore(&s->iter, &s->iter_state);
+ iovec = NULL;
}
- req->result = iov_iter_count(iter);
+ req->result = iov_iter_count(&s->iter);
- /* Ensure we clear previously set non-block flag */
- if (!force_nonblock)
- kiocb->ki_flags &= ~IOCB_NOWAIT;
- else
- kiocb->ki_flags |= IOCB_NOWAIT;
+ if (force_nonblock) {
+ /* If the file doesn't support async, just async punt */
+ if (unlikely(!io_file_supports_nowait(req)))
+ goto copy_iov;
- /* If the file doesn't support async, just async punt */
- if (force_nonblock && !io_file_supports_nowait(req, WRITE))
- goto copy_iov;
+ /* file path doesn't support NOWAIT for non-direct_IO */
+ if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
+ (req->flags & REQ_F_ISREG))
+ goto copy_iov;
- /* file path doesn't support NOWAIT for non-direct_IO */
- if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
- (req->flags & REQ_F_ISREG))
- goto copy_iov;
+ kiocb->ki_flags |= IOCB_NOWAIT;
+ } else {
+ /* Ensure we clear previously set non-block flag */
+ kiocb->ki_flags &= ~IOCB_NOWAIT;
+ }
ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), req->result);
if (unlikely(ret))
@@ -3635,10 +3623,10 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
}
kiocb->ki_flags |= IOCB_WRITE;
- if (req->file->f_op->write_iter)
- ret2 = call_write_iter(req->file, kiocb, iter);
+ if (likely(req->file->f_op->write_iter))
+ ret2 = call_write_iter(req->file, kiocb, &s->iter);
else if (req->file->f_op->write)
- ret2 = loop_rw_iter(WRITE, req, iter);
+ ret2 = loop_rw_iter(WRITE, req, &s->iter);
else
ret2 = -EINVAL;
@@ -3658,14 +3646,14 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
goto done;
if (!force_nonblock || ret2 != -EAGAIN) {
/* IOPOLL retry should happen for io-wq threads */
- if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
+ if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
goto copy_iov;
done:
kiocb_done(kiocb, ret2, issue_flags);
} else {
copy_iov:
- iov_iter_restore(iter, state);
- ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
+ iov_iter_restore(&s->iter, &s->iter_state);
+ ret = io_setup_async_rw(req, iovec, s, false);
return ret ?: -EAGAIN;
}
out_free:
@@ -3801,7 +3789,7 @@ static int io_mkdirat_prep(struct io_kiocb *req,
return 0;
}
-static int io_mkdirat(struct io_kiocb *req, int issue_flags)
+static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_mkdir *mkd = &req->mkdir;
int ret;
@@ -3850,7 +3838,7 @@ static int io_symlinkat_prep(struct io_kiocb *req,
return 0;
}
-static int io_symlinkat(struct io_kiocb *req, int issue_flags)
+static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_symlink *sl = &req->symlink;
int ret;
@@ -3900,7 +3888,7 @@ static int io_linkat_prep(struct io_kiocb *req,
return 0;
}
-static int io_linkat(struct io_kiocb *req, int issue_flags)
+static int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_hardlink *lnk = &req->hardlink;
int ret;
@@ -4319,9 +4307,9 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
struct io_ring_ctx *ctx = req->ctx;
struct io_buffer *head;
int ret = 0;
- bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+ bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
- io_ring_submit_lock(ctx, !force_nonblock);
+ io_ring_submit_lock(ctx, needs_lock);
lockdep_assert_held(&ctx->uring_lock);
@@ -4334,7 +4322,7 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
/* complete before unlock, IOPOLL may need the lock */
__io_req_complete(req, issue_flags, ret, 0);
- io_ring_submit_unlock(ctx, !force_nonblock);
+ io_ring_submit_unlock(ctx, needs_lock);
return 0;
}
@@ -4406,9 +4394,9 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
struct io_ring_ctx *ctx = req->ctx;
struct io_buffer *head, *list;
int ret = 0;
- bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+ bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
- io_ring_submit_lock(ctx, !force_nonblock);
+ io_ring_submit_lock(ctx, needs_lock);
lockdep_assert_held(&ctx->uring_lock);
@@ -4424,7 +4412,7 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
req_set_fail(req);
/* complete before unlock, IOPOLL may need the lock */
__io_req_complete(req, issue_flags, ret, 0);
- io_ring_submit_unlock(ctx, !force_nonblock);
+ io_ring_submit_unlock(ctx, needs_lock);
return 0;
}
@@ -4757,8 +4745,9 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
if (unlikely(!sock))
return -ENOTSOCK;
- kmsg = req->async_data;
- if (!kmsg) {
+ if (req_has_async_data(req)) {
+ kmsg = req->async_data;
+ } else {
ret = io_sendmsg_copy_hdr(req, &iomsg);
if (ret)
return ret;
@@ -4917,23 +4906,16 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req,
}
static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
- bool needs_lock)
+ unsigned int issue_flags)
{
struct io_sr_msg *sr = &req->sr_msg;
- struct io_buffer *kbuf;
- kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock);
- if (IS_ERR(kbuf))
- return kbuf;
-
- sr->kbuf = kbuf;
- req->flags |= REQ_F_BUFFER_SELECTED;
- return kbuf;
+ return io_buffer_select(req, &sr->len, sr->bgid, issue_flags);
}
static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req)
{
- return io_put_kbuf(req, req->sr_msg.kbuf);
+ return io_put_kbuf(req, req->kbuf);
}
static int io_recvmsg_prep_async(struct io_kiocb *req)
@@ -4981,8 +4963,9 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
if (unlikely(!sock))
return -ENOTSOCK;
- kmsg = req->async_data;
- if (!kmsg) {
+ if (req_has_async_data(req)) {
+ kmsg = req->async_data;
+ } else {
ret = io_recvmsg_copy_hdr(req, &iomsg);
if (ret)
return ret;
@@ -4990,7 +4973,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
}
if (req->flags & REQ_F_BUFFER_SELECT) {
- kbuf = io_recv_buffer_select(req, !force_nonblock);
+ kbuf = io_recv_buffer_select(req, issue_flags);
if (IS_ERR(kbuf))
return PTR_ERR(kbuf);
kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
@@ -5042,7 +5025,7 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
return -ENOTSOCK;
if (req->flags & REQ_F_BUFFER_SELECT) {
- kbuf = io_recv_buffer_select(req, !force_nonblock);
+ kbuf = io_recv_buffer_select(req, issue_flags);
if (IS_ERR(kbuf))
return PTR_ERR(kbuf);
buf = u64_to_user_ptr(kbuf->addr);
@@ -5173,7 +5156,7 @@ static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
int ret;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
- if (req->async_data) {
+ if (req_has_async_data(req)) {
io = req->async_data;
} else {
ret = move_addr_to_kernel(req->connect.addr,
@@ -5189,7 +5172,7 @@ static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
ret = __sys_connect_file(req->file, &io->address,
req->connect.addr_len, file_flags);
if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
- if (req->async_data)
+ if (req_has_async_data(req))
return -EAGAIN;
if (io_alloc_async_data(req)) {
ret = -ENOMEM;
@@ -5349,16 +5332,6 @@ static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask)
return !(flags & IORING_CQE_F_MORE);
}
-static inline bool io_poll_complete(struct io_kiocb *req, __poll_t mask)
- __must_hold(&req->ctx->completion_lock)
-{
- bool done;
-
- done = __io_poll_complete(req, mask);
- io_commit_cqring(req->ctx);
- return done;
-}
-
static void io_poll_task_func(struct io_kiocb *req, bool *locked)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -5480,7 +5453,10 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake);
req_ref_get(req);
poll->wait.private = req;
+
*poll_ptr = poll;
+ if (req->opcode == IORING_OP_POLL_ADD)
+ req->flags |= REQ_F_ASYNC_DATA;
}
pt->nr_entries++;
@@ -5604,17 +5580,13 @@ static int io_arm_poll_handler(struct io_kiocb *req)
struct async_poll *apoll;
struct io_poll_table ipt;
__poll_t ret, mask = EPOLLONESHOT | POLLERR | POLLPRI;
- int rw;
- if (!req->file || !file_can_poll(req->file))
- return IO_APOLL_ABORTED;
- if (req->flags & REQ_F_POLLED)
- return IO_APOLL_ABORTED;
if (!def->pollin && !def->pollout)
return IO_APOLL_ABORTED;
+ if (!file_can_poll(req->file) || (req->flags & REQ_F_POLLED))
+ return IO_APOLL_ABORTED;
if (def->pollin) {
- rw = READ;
mask |= POLLIN | POLLRDNORM;
/* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
@@ -5622,14 +5594,9 @@ static int io_arm_poll_handler(struct io_kiocb *req)
(req->sr_msg.msg_flags & MSG_ERRQUEUE))
mask &= ~POLLIN;
} else {
- rw = WRITE;
mask |= POLLOUT | POLLWRNORM;
}
- /* if we can't nonblock try, then no point in arming a poll handler */
- if (!io_file_supports_nowait(req, rw))
- return IO_APOLL_ABORTED;
-
apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
if (unlikely(!apoll))
return IO_APOLL_ABORTED;
@@ -5690,8 +5657,8 @@ static bool io_poll_remove_one(struct io_kiocb *req)
/*
* Returns true if we found and killed one or more poll requests
*/
-static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
- bool cancel_all)
+static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx,
+ struct task_struct *tsk, bool cancel_all)
{
struct hlist_node *tmp;
struct io_kiocb *req;
@@ -5845,7 +5812,8 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
if (mask) { /* no async, we'd stolen it */
ipt.error = 0;
- done = io_poll_complete(req, mask);
+ done = __io_poll_complete(req, mask);
+ io_commit_cqring(req->ctx);
}
spin_unlock(&ctx->completion_lock);
@@ -5921,7 +5889,10 @@ err:
static void io_req_task_timeout(struct io_kiocb *req, bool *locked)
{
- req_set_fail(req);
+ struct io_timeout_data *data = req->async_data;
+
+ if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS))
+ req_set_fail(req);
io_req_complete_post(req, -ETIME, 0);
}
@@ -6127,7 +6098,8 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (off && is_timeout_link)
return -EINVAL;
flags = READ_ONCE(sqe->timeout_flags);
- if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK))
+ if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK |
+ IORING_TIMEOUT_ETIME_SUCCESS))
return -EINVAL;
/* more than one clock specified is invalid, obviously */
if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
@@ -6138,7 +6110,9 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (unlikely(off && !req->ctx->off_timeout_used))
req->ctx->off_timeout_used = true;
- if (!req->async_data && io_alloc_async_data(req))
+ if (WARN_ON_ONCE(req_has_async_data(req)))
+ return -EFAULT;
+ if (io_alloc_async_data(req))
return -ENOMEM;
data = req->async_data;
@@ -6295,6 +6269,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
u64 sqe_addr = req->cancel.addr;
+ bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
struct io_tctx_node *node;
int ret;
@@ -6303,7 +6278,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
goto done;
/* slow path, try all io-wq's */
- io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+ io_ring_submit_lock(ctx, needs_lock);
ret = -ENOENT;
list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
struct io_uring_task *tctx = node->task->io_uring;
@@ -6312,7 +6287,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
if (ret != -ENOENT)
break;
}
- io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+ io_ring_submit_unlock(ctx, needs_lock);
done:
if (ret < 0)
req_set_fail(req);
@@ -6339,6 +6314,7 @@ static int io_rsrc_update_prep(struct io_kiocb *req,
static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
+ bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
struct io_uring_rsrc_update2 up;
int ret;
@@ -6348,10 +6324,10 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
up.tags = 0;
up.resv = 0;
- io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+ io_ring_submit_lock(ctx, needs_lock);
ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
&up, req->rsrc_update.nr_args);
- io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+ io_ring_submit_unlock(ctx, needs_lock);
if (ret < 0)
req_set_fail(req);
@@ -6447,7 +6423,7 @@ static int io_req_prep_async(struct io_kiocb *req)
{
if (!io_op_defs[req->opcode].needs_async_setup)
return 0;
- if (WARN_ON_ONCE(req->async_data))
+ if (WARN_ON_ONCE(req_has_async_data(req)))
return -EFAULT;
if (io_alloc_async_data(req))
return -EAGAIN;
@@ -6479,68 +6455,39 @@ static u32 io_get_sequence(struct io_kiocb *req)
return seq;
}
-static bool io_drain_req(struct io_kiocb *req)
+static __cold void io_drain_req(struct io_kiocb *req)
{
- struct io_kiocb *pos;
struct io_ring_ctx *ctx = req->ctx;
struct io_defer_entry *de;
int ret;
- u32 seq;
-
- if (req->flags & REQ_F_FAIL) {
- io_req_complete_fail_submit(req);
- return true;
- }
-
- /*
- * If we need to drain a request in the middle of a link, drain the
- * head request and the next request/link after the current link.
- * Considering sequential execution of links, IOSQE_IO_DRAIN will be
- * maintained for every request of our link.
- */
- if (ctx->drain_next) {
- req->flags |= REQ_F_IO_DRAIN;
- ctx->drain_next = false;
- }
- /* not interested in head, start from the first linked */
- io_for_each_link(pos, req->link) {
- if (pos->flags & REQ_F_IO_DRAIN) {
- ctx->drain_next = true;
- req->flags |= REQ_F_IO_DRAIN;
- break;
- }
- }
+ u32 seq = io_get_sequence(req);
/* Still need defer if there is pending req in defer list. */
- if (likely(list_empty_careful(&ctx->defer_list) &&
- !(req->flags & REQ_F_IO_DRAIN))) {
+ if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) {
+queue:
ctx->drain_active = false;
- return false;
+ io_req_task_queue(req);
+ return;
}
- seq = io_get_sequence(req);
- /* Still a chance to pass the sequence check */
- if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list))
- return false;
-
ret = io_req_prep_async(req);
- if (ret)
- goto fail;
+ if (ret) {
+fail:
+ io_req_complete_failed(req, ret);
+ return;
+ }
io_prep_async_link(req);
de = kmalloc(sizeof(*de), GFP_KERNEL);
if (!de) {
ret = -ENOMEM;
-fail:
- io_req_complete_failed(req, ret);
- return true;
+ goto fail;
}
spin_lock(&ctx->completion_lock);
if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
spin_unlock(&ctx->completion_lock);
kfree(de);
- io_queue_async_work(req, NULL);
- return true;
+ goto queue;
}
trace_io_uring_defer(ctx, req, req->user_data);
@@ -6548,23 +6495,13 @@ fail:
de->seq = seq;
list_add_tail(&de->list, &ctx->defer_list);
spin_unlock(&ctx->completion_lock);
- return true;
}
static void io_clean_op(struct io_kiocb *req)
{
if (req->flags & REQ_F_BUFFER_SELECTED) {
- switch (req->opcode) {
- case IORING_OP_READV:
- case IORING_OP_READ_FIXED:
- case IORING_OP_READ:
- kfree((void *)(unsigned long)req->rw.addr);
- break;
- case IORING_OP_RECVMSG:
- case IORING_OP_RECV:
- kfree(req->sr_msg.kbuf);
- break;
- }
+ kfree(req->kbuf);
+ req->kbuf = NULL;
}
if (req->flags & REQ_F_NEED_CLEANUP) {
@@ -6629,17 +6566,19 @@ static void io_clean_op(struct io_kiocb *req)
}
if (req->flags & REQ_F_CREDS)
put_cred(req->creds);
-
+ if (req->flags & REQ_F_ASYNC_DATA) {
+ kfree(req->async_data);
+ req->async_data = NULL;
+ }
req->flags &= ~IO_REQ_CLEAN_FLAGS;
}
static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
{
- struct io_ring_ctx *ctx = req->ctx;
const struct cred *creds = NULL;
int ret;
- if ((req->flags & REQ_F_CREDS) && req->creds != current_cred())
+ if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
creds = override_creds(req->creds);
switch (req->opcode) {
@@ -6762,8 +6701,8 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
if (ret)
return ret;
/* If the op doesn't have a file, we're not polling for it */
- if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file)
- io_iopoll_req_issued(req);
+ if ((req->ctx->flags & IORING_SETUP_IOPOLL) && req->file)
+ io_iopoll_req_issued(req, issue_flags);
return 0;
}
@@ -6779,6 +6718,8 @@ static struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
static void io_wq_submit_work(struct io_wq_work *work)
{
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+ unsigned int issue_flags = IO_URING_F_UNLOCKED;
+ bool needs_poll = false;
struct io_kiocb *timeout;
int ret = 0;
@@ -6793,23 +6734,42 @@ static void io_wq_submit_work(struct io_wq_work *work)
io_queue_linked_timeout(timeout);
/* either cancelled or io-wq is dying, so don't touch tctx->iowq */
- if (work->flags & IO_WQ_WORK_CANCEL)
- ret = -ECANCELED;
+ if (work->flags & IO_WQ_WORK_CANCEL) {
+ io_req_task_queue_fail(req, -ECANCELED);
+ return;
+ }
- if (!ret) {
- do {
- ret = io_issue_sqe(req, 0);
- /*
- * We can get EAGAIN for polled IO even though we're
- * forcing a sync submission from here, since we can't
- * wait for request slots on the block side.
- */
- if (ret != -EAGAIN)
- break;
- cond_resched();
- } while (1);
+ if (req->flags & REQ_F_FORCE_ASYNC) {
+ const struct io_op_def *def = &io_op_defs[req->opcode];
+ bool opcode_poll = def->pollin || def->pollout;
+
+ if (opcode_poll && file_can_poll(req->file)) {
+ needs_poll = true;
+ issue_flags |= IO_URING_F_NONBLOCK;
+ }
}
+ do {
+ ret = io_issue_sqe(req, issue_flags);
+ if (ret != -EAGAIN)
+ break;
+ /*
+ * We can get EAGAIN for iopolled IO even though we're
+ * forcing a sync submission from here, since we can't
+ * wait for request slots on the block side.
+ */
+ if (!needs_poll) {
+ cond_resched();
+ continue;
+ }
+
+ if (io_arm_poll_handler(req) == IO_APOLL_OK)
+ return;
+ /* aborted or ready, in either case retry blocking */
+ needs_poll = false;
+ issue_flags &= ~IO_URING_F_NONBLOCK;
+ } while (1);
+
/* avoid locking problems by failing it from a clean context */
if (ret)
io_req_task_queue_fail(req, ret);
@@ -6833,12 +6793,7 @@ static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file
{
unsigned long file_ptr = (unsigned long) file;
- if (__io_file_supports_nowait(file, READ))
- file_ptr |= FFS_ASYNC_READ;
- if (__io_file_supports_nowait(file, WRITE))
- file_ptr |= FFS_ASYNC_WRITE;
- if (S_ISREG(file_inode(file)->i_mode))
- file_ptr |= FFS_ISREG;
+ file_ptr |= io_file_get_flags(file);
file_slot->file_ptr = file_ptr;
}
@@ -6855,8 +6810,8 @@ static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx,
file = (struct file *) (file_ptr & FFS_MASK);
file_ptr &= ~FFS_MASK;
/* mask in overlapping REQ_F and FFS bits */
- req->flags |= (file_ptr << REQ_F_NOWAIT_READ_BIT);
- io_req_set_rsrc_node(req);
+ req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT);
+ io_req_set_rsrc_node(req, ctx);
return file;
}
@@ -6948,67 +6903,66 @@ static void io_queue_linked_timeout(struct io_kiocb *req)
io_put_req(req);
}
-static void __io_queue_sqe(struct io_kiocb *req)
+static void io_queue_sqe_arm_apoll(struct io_kiocb *req)
+ __must_hold(&req->ctx->uring_lock)
+{
+ struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
+
+ switch (io_arm_poll_handler(req)) {
+ case IO_APOLL_READY:
+ if (linked_timeout) {
+ io_queue_linked_timeout(linked_timeout);
+ linked_timeout = NULL;
+ }
+ io_req_task_queue(req);
+ break;
+ case IO_APOLL_ABORTED:
+ /*
+ * Queued up for async execution, worker will release
+ * submit reference when the iocb is actually submitted.
+ */
+ io_queue_async_work(req, NULL);
+ break;
+ }
+
+ if (linked_timeout)
+ io_queue_linked_timeout(linked_timeout);
+}
+
+static inline void __io_queue_sqe(struct io_kiocb *req)
__must_hold(&req->ctx->uring_lock)
{
struct io_kiocb *linked_timeout;
int ret;
-issue_sqe:
ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
+ if (req->flags & REQ_F_COMPLETE_INLINE) {
+ io_req_add_compl_list(req);
+ return;
+ }
/*
* We async punt it if the file wasn't marked NOWAIT, or if the file
* doesn't support non-blocking read/write attempts
*/
if (likely(!ret)) {
- if (req->flags & REQ_F_COMPLETE_INLINE) {
- struct io_ring_ctx *ctx = req->ctx;
- struct io_submit_state *state = &ctx->submit_state;
-
- state->compl_reqs[state->compl_nr++] = req;
- if (state->compl_nr == ARRAY_SIZE(state->compl_reqs))
- io_submit_flush_completions(ctx);
- return;
- }
-
linked_timeout = io_prep_linked_timeout(req);
if (linked_timeout)
io_queue_linked_timeout(linked_timeout);
} else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
- linked_timeout = io_prep_linked_timeout(req);
-
- switch (io_arm_poll_handler(req)) {
- case IO_APOLL_READY:
- if (linked_timeout)
- io_unprep_linked_timeout(req);
- goto issue_sqe;
- case IO_APOLL_ABORTED:
- /*
- * Queued up for async execution, worker will release
- * submit reference when the iocb is actually submitted.
- */
- io_queue_async_work(req, NULL);
- break;
- }
-
- if (linked_timeout)
- io_queue_linked_timeout(linked_timeout);
+ io_queue_sqe_arm_apoll(req);
} else {
io_req_complete_failed(req, ret);
}
}
-static inline void io_queue_sqe(struct io_kiocb *req)
+static void io_queue_sqe_fallback(struct io_kiocb *req)
__must_hold(&req->ctx->uring_lock)
{
- if (unlikely(req->ctx->drain_active) && io_drain_req(req))
- return;
-
- if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) {
- __io_queue_sqe(req);
- } else if (req->flags & REQ_F_FAIL) {
+ if (req->flags & REQ_F_FAIL) {
io_req_complete_fail_submit(req);
+ } else if (unlikely(req->ctx->drain_active)) {
+ io_drain_req(req);
} else {
int ret = io_req_prep_async(req);
@@ -7019,6 +6973,15 @@ static inline void io_queue_sqe(struct io_kiocb *req)
}
}
+static inline void io_queue_sqe(struct io_kiocb *req)
+ __must_hold(&req->ctx->uring_lock)
+{
+ if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))))
+ __io_queue_sqe(req);
+ else
+ io_queue_sqe_fallback(req);
+}
+
/*
* Check SQE restrictions (opcode and flags).
*
@@ -7028,9 +6991,6 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx,
struct io_kiocb *req,
unsigned int sqe_flags)
{
- if (likely(!ctx->restricted))
- return true;
-
if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
return false;
@@ -7045,16 +7005,35 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx,
return true;
}
+static void io_init_req_drain(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_kiocb *head = ctx->submit_state.link.head;
+
+ ctx->drain_active = true;
+ if (head) {
+ /*
+ * If we need to drain a request in the middle of a link, drain
+ * the head request and the next request/link after the current
+ * link. Considering sequential execution of links,
+ * IOSQE_IO_DRAIN will be maintained for every request of our
+ * link.
+ */
+ head->flags |= IOSQE_IO_DRAIN | REQ_F_FORCE_ASYNC;
+ ctx->drain_next = true;
+ }
+}
+
static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
const struct io_uring_sqe *sqe)
__must_hold(&ctx->uring_lock)
{
- struct io_submit_state *state;
unsigned int sqe_flags;
- int personality, ret = 0;
+ int personality;
+ u8 opcode;
/* req is partially pre-initialised, see io_preinit_req() */
- req->opcode = READ_ONCE(sqe->opcode);
+ req->opcode = opcode = READ_ONCE(sqe->opcode);
/* same numerical values with corresponding REQ_F_*, safe to copy */
req->flags = sqe_flags = READ_ONCE(sqe->flags);
req->user_data = READ_ONCE(sqe->user_data);
@@ -7062,19 +7041,52 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
req->fixed_rsrc_refs = NULL;
req->task = current;
- /* enforce forwards compatibility on users */
- if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
- return -EINVAL;
- if (unlikely(req->opcode >= IORING_OP_LAST))
+ if (unlikely(opcode >= IORING_OP_LAST)) {
+ req->opcode = 0;
return -EINVAL;
- if (!io_check_restriction(ctx, req, sqe_flags))
- return -EACCES;
+ }
+ if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
+ /* enforce forwards compatibility on users */
+ if (sqe_flags & ~SQE_VALID_FLAGS)
+ return -EINVAL;
+ if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
+ !io_op_defs[opcode].buffer_select)
+ return -EOPNOTSUPP;
+ if (sqe_flags & IOSQE_IO_DRAIN)
+ io_init_req_drain(req);
+ }
+ if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
+ if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
+ return -EACCES;
+ /* knock it to the slow queue path, will be drained there */
+ if (ctx->drain_active)
+ req->flags |= REQ_F_FORCE_ASYNC;
+ /* if there is no link, we're at "next" request and need to drain */
+ if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) {
+ ctx->drain_next = false;
+ ctx->drain_active = true;
+ req->flags |= IOSQE_IO_DRAIN | REQ_F_FORCE_ASYNC;
+ }
+ }
- if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
- !io_op_defs[req->opcode].buffer_select)
- return -EOPNOTSUPP;
- if (unlikely(sqe_flags & IOSQE_IO_DRAIN))
- ctx->drain_active = true;
+ if (io_op_defs[opcode].needs_file) {
+ struct io_submit_state *state = &ctx->submit_state;
+
+ /*
+ * Plug now if we have more than 2 IO left after this, and the
+ * target is potentially a read/write to block based storage.
+ */
+ if (state->need_plug && io_op_defs[opcode].plug) {
+ state->plug_started = true;
+ state->need_plug = false;
+ blk_start_plug_nr_ios(&state->plug, state->submit_nr);
+ }
+
+ req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd),
+ (sqe_flags & IOSQE_FIXED_FILE));
+ if (unlikely(!req->file))
+ return -EBADF;
+ }
personality = READ_ONCE(sqe->personality);
if (personality) {
@@ -7084,27 +7096,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
get_cred(req->creds);
req->flags |= REQ_F_CREDS;
}
- state = &ctx->submit_state;
- /*
- * Plug now if we have more than 1 IO left after this, and the target
- * is potentially a read/write to block based storage.
- */
- if (!state->plug_started && state->ios_left > 1 &&
- io_op_defs[req->opcode].plug) {
- blk_start_plug(&state->plug);
- state->plug_started = true;
- }
-
- if (io_op_defs[req->opcode].needs_file) {
- req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd),
- (sqe_flags & IOSQE_FIXED_FILE));
- if (unlikely(!req->file))
- ret = -EBADF;
- }
-
- state->ios_left--;
- return ret;
+ return io_req_prep(req, sqe);
}
static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
@@ -7116,7 +7109,8 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
ret = io_init_req(ctx, req, sqe);
if (unlikely(ret)) {
-fail_req:
+ trace_io_uring_req_failed(sqe, ret);
+
/* fail even hard links since we don't submit */
if (link->head) {
/*
@@ -7139,10 +7133,6 @@ fail_req:
return ret;
}
req_fail_link_node(req, ret);
- } else {
- ret = io_req_prep(req, sqe);
- if (unlikely(ret))
- goto fail_req;
}
/* don't need @sqe from now on */
@@ -7172,33 +7162,32 @@ fail_req:
link->last->link = req;
link->last = req;
+ if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK))
+ return 0;
/* last request of a link, enqueue the link */
- if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
- link->head = NULL;
- io_queue_sqe(head);
- }
- } else {
- if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
- link->head = req;
- link->last = req;
- } else {
- io_queue_sqe(req);
- }
+ link->head = NULL;
+ req = head;
+ } else if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
+ link->head = req;
+ link->last = req;
+ return 0;
}
+ io_queue_sqe(req);
return 0;
}
/*
* Batched submission is done, ensure local IO is flushed out.
*/
-static void io_submit_state_end(struct io_submit_state *state,
- struct io_ring_ctx *ctx)
+static void io_submit_state_end(struct io_ring_ctx *ctx)
{
+ struct io_submit_state *state = &ctx->submit_state;
+
if (state->link.head)
io_queue_sqe(state->link.head);
- if (state->compl_nr)
- io_submit_flush_completions(ctx);
+ /* flush only after queuing links as they can generate completions */
+ io_submit_flush_completions(ctx);
if (state->plug_started)
blk_finish_plug(&state->plug);
}
@@ -7210,7 +7199,8 @@ static void io_submit_state_start(struct io_submit_state *state,
unsigned int max_ios)
{
state->plug_started = false;
- state->ios_left = max_ios;
+ state->need_plug = max_ios > 2;
+ state->submit_nr = max_ios;
/* set only head, no need to init link_last in advance */
state->link.head = NULL;
}
@@ -7262,45 +7252,45 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
__must_hold(&ctx->uring_lock)
{
+ unsigned int entries = io_sqring_entries(ctx);
int submitted = 0;
+ if (unlikely(!entries))
+ return 0;
/* make sure SQ entry isn't read before tail */
- nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
- if (!percpu_ref_tryget_many(&ctx->refs, nr))
- return -EAGAIN;
+ nr = min3(nr, ctx->sq_entries, entries);
io_get_task_refs(nr);
io_submit_state_start(&ctx->submit_state, nr);
- while (submitted < nr) {
+ do {
const struct io_uring_sqe *sqe;
struct io_kiocb *req;
- req = io_alloc_req(ctx);
- if (unlikely(!req)) {
+ if (unlikely(!io_alloc_req_refill(ctx))) {
if (!submitted)
submitted = -EAGAIN;
break;
}
+ req = io_alloc_req(ctx);
sqe = io_get_sqe(ctx);
if (unlikely(!sqe)) {
- list_add(&req->inflight_entry, &ctx->submit_state.free_list);
+ wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
break;
}
/* will complete beyond this point, count as submitted */
submitted++;
if (io_submit_sqe(ctx, req, sqe))
break;
- }
+ } while (submitted < nr);
if (unlikely(submitted != nr)) {
int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
int unused = nr - ref_used;
current->io_uring->cached_refs += unused;
- percpu_ref_put_many(&ctx->refs, unused);
}
- io_submit_state_end(&ctx->submit_state, ctx);
+ io_submit_state_end(ctx);
/* Commit SQ ring head once we've consumed and submitted all SQEs */
io_commit_sqring(ctx);
@@ -7339,16 +7329,15 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE)
to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE;
- if (!list_empty(&ctx->iopoll_list) || to_submit) {
- unsigned nr_events = 0;
+ if (!wq_list_empty(&ctx->iopoll_list) || to_submit) {
const struct cred *creds = NULL;
if (ctx->sq_creds != current_cred())
creds = override_creds(ctx->sq_creds);
mutex_lock(&ctx->uring_lock);
- if (!list_empty(&ctx->iopoll_list))
- io_do_iopoll(ctx, &nr_events, 0);
+ if (!wq_list_empty(&ctx->iopoll_list))
+ io_do_iopoll(ctx, true);
/*
* Don't submit if refs are dying, good for io_uring_register(),
@@ -7368,7 +7357,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
return ret;
}
-static void io_sqd_update_thread_idle(struct io_sq_data *sqd)
+static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd)
{
struct io_ring_ctx *ctx;
unsigned sq_thread_idle = 0;
@@ -7425,7 +7414,7 @@ static int io_sq_thread(void *data)
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
int ret = __io_sq_thread(ctx, cap_entries);
- if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list)))
+ if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list)))
sqt_spin = true;
}
if (io_run_task_work())
@@ -7446,7 +7435,7 @@ static int io_sq_thread(void *data)
io_ring_set_wakeup_flag(ctx);
if ((ctx->flags & IORING_SETUP_IOPOLL) &&
- !list_empty_careful(&ctx->iopoll_list)) {
+ !wq_list_empty(&ctx->iopoll_list)) {
needs_sched = false;
break;
}
@@ -7622,7 +7611,7 @@ static void io_free_page_table(void **table, size_t size)
kfree(table);
}
-static void **io_alloc_page_table(size_t size)
+static __cold void **io_alloc_page_table(size_t size)
{
unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
size_t init_size = size;
@@ -7651,7 +7640,7 @@ static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
kfree(ref_node);
}
-static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
+static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)
{
struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
struct io_ring_ctx *ctx = node->rsrc_data->ctx;
@@ -7697,10 +7686,13 @@ static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
static void io_rsrc_node_switch(struct io_ring_ctx *ctx,
struct io_rsrc_data *data_to_kill)
+ __must_hold(&ctx->uring_lock)
{
WARN_ON_ONCE(!ctx->rsrc_backup_node);
WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);
+ io_rsrc_refs_drop(ctx);
+
if (data_to_kill) {
struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
@@ -7728,7 +7720,8 @@ static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
return ctx->rsrc_backup_node ? 0 : -ENOMEM;
}
-static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ctx)
+static __cold int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
+ struct io_ring_ctx *ctx)
{
int ret;
@@ -7784,9 +7777,9 @@ static void io_rsrc_data_free(struct io_rsrc_data *data)
kfree(data);
}
-static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
- u64 __user *utags, unsigned nr,
- struct io_rsrc_data **pdata)
+static __cold int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
+ u64 __user *utags, unsigned nr,
+ struct io_rsrc_data **pdata)
{
struct io_rsrc_data *data;
int ret = -ENOMEM;
@@ -8354,12 +8347,12 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
unsigned int issue_flags, u32 slot_index)
{
struct io_ring_ctx *ctx = req->ctx;
- bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+ bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
bool needs_switch = false;
struct io_fixed_file *file_slot;
int ret = -EBADF;
- io_ring_submit_lock(ctx, !force_nonblock);
+ io_ring_submit_lock(ctx, needs_lock);
if (file->f_op == &io_uring_fops)
goto err;
ret = -ENXIO;
@@ -8400,7 +8393,7 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
err:
if (needs_switch)
io_rsrc_node_switch(ctx, ctx->file_data);
- io_ring_submit_unlock(ctx, !force_nonblock);
+ io_ring_submit_unlock(ctx, needs_lock);
if (ret)
fput(file);
return ret;
@@ -8410,11 +8403,12 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
{
unsigned int offset = req->close.file_slot - 1;
struct io_ring_ctx *ctx = req->ctx;
+ bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
struct io_fixed_file *file_slot;
struct file *file;
int ret, i;
- io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+ io_ring_submit_lock(ctx, needs_lock);
ret = -ENXIO;
if (unlikely(!ctx->file_data))
goto out;
@@ -8440,7 +8434,7 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
io_rsrc_node_switch(ctx, ctx->file_data);
ret = 0;
out:
- io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+ io_ring_submit_unlock(ctx, needs_lock);
return ret;
}
@@ -8556,8 +8550,8 @@ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
return io_wq_create(concurrency, &data);
}
-static int io_uring_alloc_task_context(struct task_struct *task,
- struct io_ring_ctx *ctx)
+static __cold int io_uring_alloc_task_context(struct task_struct *task,
+ struct io_ring_ctx *ctx)
{
struct io_uring_task *tctx;
int ret;
@@ -8604,8 +8598,8 @@ void __io_uring_free(struct task_struct *tsk)
tsk->io_uring = NULL;
}
-static int io_sq_offload_create(struct io_ring_ctx *ctx,
- struct io_uring_params *p)
+static __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
+ struct io_uring_params *p)
{
int ret;
@@ -9216,29 +9210,25 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx)
}
}
-static void io_req_cache_free(struct list_head *list)
-{
- struct io_kiocb *req, *nxt;
-
- list_for_each_entry_safe(req, nxt, list, inflight_entry) {
- list_del(&req->inflight_entry);
- kmem_cache_free(req_cachep, req);
- }
-}
-
static void io_req_caches_free(struct io_ring_ctx *ctx)
{
struct io_submit_state *state = &ctx->submit_state;
+ int nr = 0;
mutex_lock(&ctx->uring_lock);
+ io_flush_cached_locked_reqs(ctx, state);
- if (state->free_reqs) {
- kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
- state->free_reqs = 0;
- }
+ while (state->free_list.next) {
+ struct io_wq_work_node *node;
+ struct io_kiocb *req;
- io_flush_cached_locked_reqs(ctx, state);
- io_req_cache_free(&state->free_list);
+ node = wq_stack_extract(&state->free_list);
+ req = container_of(node, struct io_kiocb, comp_list);
+ kmem_cache_free(req_cachep, req);
+ nr++;
+ }
+ if (nr)
+ percpu_ref_put_many(&ctx->refs, nr);
mutex_unlock(&ctx->uring_lock);
}
@@ -9248,7 +9238,7 @@ static void io_wait_rsrc_data(struct io_rsrc_data *data)
wait_for_completion(&data->done);
}
-static void io_ring_ctx_free(struct io_ring_ctx *ctx)
+static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
io_sq_thread_finish(ctx);
@@ -9257,6 +9247,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
ctx->mm_account = NULL;
}
+ io_rsrc_refs_drop(ctx);
/* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
io_wait_rsrc_data(ctx->buf_data);
io_wait_rsrc_data(ctx->file_data);
@@ -9280,6 +9271,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
if (ctx->rsrc_backup_node)
io_rsrc_node_destroy(ctx->rsrc_backup_node);
flush_delayed_work(&ctx->rsrc_put_work);
+ flush_delayed_work(&ctx->fallback_work);
WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));
@@ -9310,7 +9302,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
struct io_ring_ctx *ctx = file->private_data;
__poll_t mask = 0;
- poll_wait(file, &ctx->poll_wait, wait);
+ poll_wait(file, &ctx->cq_wait, wait);
/*
* synchronizes with barrier from wq_has_sleeper call in
* io_commit_cqring
@@ -9357,7 +9349,7 @@ struct io_tctx_exit {
struct io_ring_ctx *ctx;
};
-static void io_tctx_exit_cb(struct callback_head *cb)
+static __cold void io_tctx_exit_cb(struct callback_head *cb)
{
struct io_uring_task *tctx = current->io_uring;
struct io_tctx_exit *work;
@@ -9372,14 +9364,14 @@ static void io_tctx_exit_cb(struct callback_head *cb)
complete(&work->completion);
}
-static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
+static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
{
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
return req->ctx == data;
}
-static void io_ring_exit_work(struct work_struct *work)
+static __cold void io_ring_exit_work(struct work_struct *work)
{
struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
unsigned long timeout = jiffies + HZ * 60 * 5;
@@ -9408,6 +9400,8 @@ static void io_ring_exit_work(struct work_struct *work)
io_sq_thread_unpark(sqd);
}
+ io_req_caches_free(ctx);
+
if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
/* there is little hope left, don't run it too often */
interval = HZ * 60;
@@ -9434,7 +9428,6 @@ static void io_ring_exit_work(struct work_struct *work)
ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
if (WARN_ON_ONCE(ret))
continue;
- wake_up_process(node->task);
mutex_unlock(&ctx->uring_lock);
wait_for_completion(&exit.completion);
@@ -9448,8 +9441,8 @@ static void io_ring_exit_work(struct work_struct *work)
}
/* Returns true if we found and killed one or more timeouts */
-static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
- bool cancel_all)
+static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx,
+ struct task_struct *tsk, bool cancel_all)
{
struct io_kiocb *req, *tmp;
int canceled = 0;
@@ -9471,7 +9464,7 @@ static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
return canceled != 0;
}
-static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
+static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
{
unsigned long index;
struct creds *creds;
@@ -9533,8 +9526,9 @@ static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
return ret;
}
-static bool io_cancel_defer_files(struct io_ring_ctx *ctx,
- struct task_struct *task, bool cancel_all)
+static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
+ struct task_struct *task,
+ bool cancel_all)
{
struct io_defer_entry *de;
LIST_HEAD(list);
@@ -9559,7 +9553,7 @@ static bool io_cancel_defer_files(struct io_ring_ctx *ctx,
return true;
}
-static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
+static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
{
struct io_tctx_node *node;
enum io_wq_cancel cret;
@@ -9583,9 +9577,9 @@ static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
return ret;
}
-static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
- struct task_struct *task,
- bool cancel_all)
+static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
+ struct task_struct *task,
+ bool cancel_all)
{
struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
struct io_uring_task *tctx = task ? task->io_uring : NULL;
@@ -9609,7 +9603,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
/* SQPOLL thread does its own polling */
if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
(ctx->sq_data && ctx->sq_data->thread == current)) {
- while (!list_empty_careful(&ctx->iopoll_list)) {
+ while (!wq_list_empty(&ctx->iopoll_list)) {
io_iopoll_try_reap_events(ctx);
ret = true;
}
@@ -9636,7 +9630,16 @@ static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
ret = io_uring_alloc_task_context(current, ctx);
if (unlikely(ret))
return ret;
+
tctx = current->io_uring;
+ if (ctx->iowq_limits_set) {
+ unsigned int limits[2] = { ctx->iowq_limits[0],
+ ctx->iowq_limits[1], };
+
+ ret = io_wq_max_workers(tctx->io_wq, limits);
+ if (ret)
+ return ret;
+ }
}
if (!xa_load(&tctx->xa, (unsigned long)ctx)) {
node = kmalloc(sizeof(*node), GFP_KERNEL);
@@ -9675,7 +9678,7 @@ static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
/*
* Remove this io_uring_file -> task mapping.
*/
-static void io_uring_del_tctx_node(unsigned long index)
+static __cold void io_uring_del_tctx_node(unsigned long index)
{
struct io_uring_task *tctx = current->io_uring;
struct io_tctx_node *node;
@@ -9698,7 +9701,7 @@ static void io_uring_del_tctx_node(unsigned long index)
kfree(node);
}
-static void io_uring_clean_tctx(struct io_uring_task *tctx)
+static __cold void io_uring_clean_tctx(struct io_uring_task *tctx)
{
struct io_wq *wq = tctx->io_wq;
struct io_tctx_node *node;
@@ -9725,7 +9728,7 @@ static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
return percpu_counter_sum(&tctx->inflight);
}
-static void io_uring_drop_tctx_refs(struct task_struct *task)
+static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
{
struct io_uring_task *tctx = task->io_uring;
unsigned int refs = tctx->cached_refs;
@@ -9741,7 +9744,8 @@ static void io_uring_drop_tctx_refs(struct task_struct *task)
* Find any io_uring ctx that this task has registered or done IO on, and cancel
* requests. @sqd should be not-null IIF it's an SQPOLL thread cancellation.
*/
-static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
+static __cold void io_uring_cancel_generic(bool cancel_all,
+ struct io_sq_data *sqd)
{
struct io_uring_task *tctx = current->io_uring;
struct io_ring_ctx *ctx;
@@ -9834,7 +9838,7 @@ static void *io_uring_validate_mmap_request(struct file *file,
#ifdef CONFIG_MMU
-static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
+static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
size_t sz = vma->vm_end - vma->vm_start;
unsigned long pfn;
@@ -10019,7 +10023,7 @@ out_fput:
}
#ifdef CONFIG_PROC_FS
-static int io_uring_show_cred(struct seq_file *m, unsigned int id,
+static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
const struct cred *cred)
{
struct user_namespace *uns = seq_user_ns(m);
@@ -10051,11 +10055,59 @@ static int io_uring_show_cred(struct seq_file *m, unsigned int id,
return 0;
}
-static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
+static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
+ struct seq_file *m)
{
struct io_sq_data *sq = NULL;
+ struct io_overflow_cqe *ocqe;
+ struct io_rings *r = ctx->rings;
+ unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
+ unsigned int sq_head = READ_ONCE(r->sq.head);
+ unsigned int sq_tail = READ_ONCE(r->sq.tail);
+ unsigned int cq_head = READ_ONCE(r->cq.head);
+ unsigned int cq_tail = READ_ONCE(r->cq.tail);
+ unsigned int sq_entries, cq_entries;
bool has_lock;
- int i;
+ unsigned int i;
+
+ /*
+ * we may get imprecise sqe and cqe info if uring is actively running
+ * since we get cached_sq_head and cached_cq_tail without uring_lock
+ * and sq_tail and cq_head are changed by userspace. But it's ok since
+ * we usually use this info when the ring is stuck.
+ */
+ seq_printf(m, "SqMask:\t\t0x%x\n", sq_mask);
+ seq_printf(m, "SqHead:\t%u\n", sq_head);
+ seq_printf(m, "SqTail:\t%u\n", sq_tail);
+ seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head);
+ seq_printf(m, "CqMask:\t0x%x\n", cq_mask);
+ seq_printf(m, "CqHead:\t%u\n", cq_head);
+ seq_printf(m, "CqTail:\t%u\n", cq_tail);
+ seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail);
+ seq_printf(m, "SQEs:\t%u\n", sq_tail - ctx->cached_sq_head);
+ sq_entries = min(sq_tail - sq_head, ctx->sq_entries);
+ for (i = 0; i < sq_entries; i++) {
+ unsigned int entry = i + sq_head;
+ unsigned int sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
+ struct io_uring_sqe *sqe = &ctx->sq_sqes[sq_idx];
+
+ if (sq_idx > sq_mask)
+ continue;
+ sqe = &ctx->sq_sqes[sq_idx];
+ seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n",
+ sq_idx, sqe->opcode, sqe->fd, sqe->flags,
+ sqe->user_data);
+ }
+ seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head);
+ cq_entries = min(cq_tail - cq_head, ctx->cq_entries);
+ for (i = 0; i < cq_entries; i++) {
+ unsigned int entry = i + cq_head;
+ struct io_uring_cqe *cqe = &r->cqes[entry & cq_mask];
+
+ seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n",
+ entry & cq_mask, cqe->user_data, cqe->res,
+ cqe->flags);
+ }
/*
* Avoid ABBA deadlock between the seq lock and the io_uring mutex,
@@ -10097,7 +10149,10 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
xa_for_each(&ctx->personalities, index, cred)
io_uring_show_cred(m, index, cred);
}
- seq_printf(m, "PollList:\n");
+ if (has_lock)
+ mutex_unlock(&ctx->uring_lock);
+
+ seq_puts(m, "PollList:\n");
spin_lock(&ctx->completion_lock);
for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
struct hlist_head *list = &ctx->cancel_hash[i];
@@ -10107,12 +10162,20 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
seq_printf(m, " op=%d, task_works=%d\n", req->opcode,
req->task->task_works != NULL);
}
+
+ seq_puts(m, "CqOverflowList:\n");
+ list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) {
+ struct io_uring_cqe *cqe = &ocqe->cqe;
+
+ seq_printf(m, " user_data=%llu, res=%d, flags=%x\n",
+ cqe->user_data, cqe->res, cqe->flags);
+
+ }
+
spin_unlock(&ctx->completion_lock);
- if (has_lock)
- mutex_unlock(&ctx->uring_lock);
}
-static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
+static __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
{
struct io_ring_ctx *ctx = f->private_data;
@@ -10136,8 +10199,8 @@ static const struct file_operations io_uring_fops = {
#endif
};
-static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
- struct io_uring_params *p)
+static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
+ struct io_uring_params *p)
{
struct io_rings *rings;
size_t size, sq_array_offset;
@@ -10226,8 +10289,8 @@ static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
return file;
}
-static int io_uring_create(unsigned entries, struct io_uring_params *p,
- struct io_uring_params __user *params)
+static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
+ struct io_uring_params __user *params)
{
struct io_ring_ctx *ctx;
struct file *file;
@@ -10385,7 +10448,8 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries,
return io_uring_setup(entries, params);
}
-static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
+static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned nr_args)
{
struct io_uring_probe *p;
size_t size;
@@ -10441,8 +10505,8 @@ static int io_register_personality(struct io_ring_ctx *ctx)
return id;
}
-static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg,
- unsigned int nr_args)
+static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
+ void __user *arg, unsigned int nr_args)
{
struct io_uring_restriction *res;
size_t size;
@@ -10576,7 +10640,7 @@ static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
return __io_register_rsrc_update(ctx, type, &up, up.nr);
}
-static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
+static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
unsigned int size, unsigned int type)
{
struct io_uring_rsrc_register rr;
@@ -10602,8 +10666,8 @@ static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
return -EINVAL;
}
-static int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg,
- unsigned len)
+static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
+ void __user *arg, unsigned len)
{
struct io_uring_task *tctx = current->io_uring;
cpumask_var_t new_mask;
@@ -10629,7 +10693,7 @@ static int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg,
return ret;
}
-static int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
+static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
struct io_uring_task *tctx = current->io_uring;
@@ -10639,9 +10703,11 @@ static int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
return io_wq_cpu_affinity(tctx->io_wq, NULL);
}
-static int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
- void __user *arg)
+static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
+ void __user *arg)
+ __must_hold(&ctx->uring_lock)
{
+ struct io_tctx_node *node;
struct io_uring_task *tctx = NULL;
struct io_sq_data *sqd = NULL;
__u32 new_count[2];
@@ -10672,13 +10738,19 @@ static int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
tctx = current->io_uring;
}
- ret = -EINVAL;
- if (!tctx || !tctx->io_wq)
- goto err;
+ BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));
- ret = io_wq_max_workers(tctx->io_wq, new_count);
- if (ret)
- goto err;
+ memcpy(ctx->iowq_limits, new_count, sizeof(new_count));
+ ctx->iowq_limits_set = true;
+
+ ret = -EINVAL;
+ if (tctx && tctx->io_wq) {
+ ret = io_wq_max_workers(tctx->io_wq, new_count);
+ if (ret)
+ goto err;
+ } else {
+ memset(new_count, 0, sizeof(new_count));
+ }
if (sqd) {
mutex_unlock(&sqd->lock);
@@ -10688,6 +10760,22 @@ static int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
if (copy_to_user(arg, new_count, sizeof(new_count)))
return -EFAULT;
+ /* that's it for SQPOLL, only the SQPOLL task creates requests */
+ if (sqd)
+ return 0;
+
+ /* now propagate the restriction to all registered users */
+ list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
+ struct io_uring_task *tctx = node->task->io_uring;
+
+ if (WARN_ON_ONCE(!tctx->io_wq))
+ continue;
+
+ for (i = 0; i < ARRAY_SIZE(new_count); i++)
+ new_count[i] = ctx->iowq_limits[i];
+ /* ignore errors, it always returns zero anyway */
+ (void)io_wq_max_workers(tctx->io_wq, new_count);
+ }
return 0;
err:
if (sqd) {
@@ -10721,7 +10809,7 @@ static bool io_register_op_must_quiesce(int op)
}
}
-static int io_ctx_quiesce(struct io_ring_ctx *ctx)
+static __cold int io_ctx_quiesce(struct io_ring_ctx *ctx)
{
long ret;
@@ -10736,10 +10824,14 @@ static int io_ctx_quiesce(struct io_ring_ctx *ctx)
*/
mutex_unlock(&ctx->uring_lock);
do {
- ret = wait_for_completion_interruptible(&ctx->ref_comp);
- if (!ret)
+ ret = wait_for_completion_interruptible_timeout(&ctx->ref_comp, HZ);
+ if (ret) {
+ ret = min(0L, ret);
break;
+ }
+
ret = io_run_task_work_sig();
+ io_req_caches_free(ctx);
} while (ret >= 0);
mutex_lock(&ctx->uring_lock);
@@ -10970,6 +11062,8 @@ static int __init io_uring_init(void)
/* should fit into one byte */
BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
+ BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8));
+ BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS);
BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 176580f54af9..104ae698443e 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -13,6 +13,7 @@
#include <linux/buffer_head.h>
#include <linux/mempool.h>
#include <linux/seq_file.h>
+#include <linux/writeback.h>
#include "jfs_incore.h"
#include "jfs_superblock.h"
#include "jfs_filsys.h"
diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c
index 87aac4c72c37..1b07550485b9 100644
--- a/fs/kernel_read_file.c
+++ b/fs/kernel_read_file.c
@@ -178,7 +178,7 @@ int kernel_read_file_from_fd(int fd, loff_t offset, void **buf,
struct fd f = fdget(fd);
int ret = -EBADF;
- if (!f.file)
+ if (!f.file || !(f.file->f_mode & FMODE_READ))
goto out;
ret = kernel_read_file(f.file, offset, buf, buf_size, file_size, id);
diff --git a/fs/ksmbd/auth.c b/fs/ksmbd/auth.c
index 71c989f1568d..30a92ddc1817 100644
--- a/fs/ksmbd/auth.c
+++ b/fs/ksmbd/auth.c
@@ -298,8 +298,8 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob,
int blob_len, struct ksmbd_session *sess)
{
char *domain_name;
- unsigned int lm_off, nt_off;
- unsigned short nt_len;
+ unsigned int nt_off, dn_off;
+ unsigned short nt_len, dn_len;
int ret;
if (blob_len < sizeof(struct authenticate_message)) {
@@ -314,15 +314,17 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob,
return -EINVAL;
}
- lm_off = le32_to_cpu(authblob->LmChallengeResponse.BufferOffset);
nt_off = le32_to_cpu(authblob->NtChallengeResponse.BufferOffset);
nt_len = le16_to_cpu(authblob->NtChallengeResponse.Length);
+ dn_off = le32_to_cpu(authblob->DomainName.BufferOffset);
+ dn_len = le16_to_cpu(authblob->DomainName.Length);
+
+ if (blob_len < (u64)dn_off + dn_len || blob_len < (u64)nt_off + nt_len)
+ return -EINVAL;
/* TODO : use domain name that imported from configuration file */
- domain_name = smb_strndup_from_utf16((const char *)authblob +
- le32_to_cpu(authblob->DomainName.BufferOffset),
- le16_to_cpu(authblob->DomainName.Length), true,
- sess->conn->local_nls);
+ domain_name = smb_strndup_from_utf16((const char *)authblob + dn_off,
+ dn_len, true, sess->conn->local_nls);
if (IS_ERR(domain_name))
return PTR_ERR(domain_name);
diff --git a/fs/ksmbd/connection.c b/fs/ksmbd/connection.c
index 48b18b4ec117..b57a0d8a392f 100644
--- a/fs/ksmbd/connection.c
+++ b/fs/ksmbd/connection.c
@@ -61,6 +61,8 @@ struct ksmbd_conn *ksmbd_conn_alloc(void)
conn->local_nls = load_nls_default();
atomic_set(&conn->req_running, 0);
atomic_set(&conn->r_count, 0);
+ conn->total_credits = 1;
+
init_waitqueue_head(&conn->req_running_q);
INIT_LIST_HEAD(&conn->conns_list);
INIT_LIST_HEAD(&conn->sessions);
diff --git a/fs/ksmbd/ksmbd_netlink.h b/fs/ksmbd/ksmbd_netlink.h
index 2fbe2bc1e093..c6718a05d347 100644
--- a/fs/ksmbd/ksmbd_netlink.h
+++ b/fs/ksmbd/ksmbd_netlink.h
@@ -211,6 +211,7 @@ struct ksmbd_tree_disconnect_request {
*/
struct ksmbd_logout_request {
__s8 account[KSMBD_REQ_MAX_ACCOUNT_NAME_SZ]; /* user account name */
+ __u32 account_flags;
};
/*
@@ -317,6 +318,7 @@ enum KSMBD_TREE_CONN_STATUS {
#define KSMBD_USER_FLAG_BAD_UID BIT(2)
#define KSMBD_USER_FLAG_BAD_USER BIT(3)
#define KSMBD_USER_FLAG_GUEST_ACCOUNT BIT(4)
+#define KSMBD_USER_FLAG_DELAY_SESSION BIT(5)
/*
* Share config flags.
diff --git a/fs/ksmbd/mgmt/user_config.c b/fs/ksmbd/mgmt/user_config.c
index d21629ae5c89..1019d3677d55 100644
--- a/fs/ksmbd/mgmt/user_config.c
+++ b/fs/ksmbd/mgmt/user_config.c
@@ -55,7 +55,7 @@ struct ksmbd_user *ksmbd_alloc_user(struct ksmbd_login_response *resp)
void ksmbd_free_user(struct ksmbd_user *user)
{
- ksmbd_ipc_logout_request(user->name);
+ ksmbd_ipc_logout_request(user->name, user->flags);
kfree(user->name);
kfree(user->passkey);
kfree(user);
diff --git a/fs/ksmbd/mgmt/user_config.h b/fs/ksmbd/mgmt/user_config.h
index b2bb074a0150..aff80b029579 100644
--- a/fs/ksmbd/mgmt/user_config.h
+++ b/fs/ksmbd/mgmt/user_config.h
@@ -18,6 +18,7 @@ struct ksmbd_user {
size_t passkey_sz;
char *passkey;
+ unsigned int failed_login_count;
};
static inline bool user_guest(struct ksmbd_user *user)
diff --git a/fs/ksmbd/smb2misc.c b/fs/ksmbd/smb2misc.c
index 9edd9c161b27..030ca57c3784 100644
--- a/fs/ksmbd/smb2misc.c
+++ b/fs/ksmbd/smb2misc.c
@@ -284,11 +284,13 @@ static inline int smb2_ioctl_resp_len(struct smb2_ioctl_req *h)
le32_to_cpu(h->MaxOutputResponse);
}
-static int smb2_validate_credit_charge(struct smb2_hdr *hdr)
+static int smb2_validate_credit_charge(struct ksmbd_conn *conn,
+ struct smb2_hdr *hdr)
{
- int req_len = 0, expect_resp_len = 0, calc_credit_num, max_len;
- int credit_charge = le16_to_cpu(hdr->CreditCharge);
+ unsigned int req_len = 0, expect_resp_len = 0, calc_credit_num, max_len;
+ unsigned short credit_charge = le16_to_cpu(hdr->CreditCharge);
void *__hdr = hdr;
+ int ret;
switch (hdr->Command) {
case SMB2_QUERY_INFO:
@@ -310,21 +312,37 @@ static int smb2_validate_credit_charge(struct smb2_hdr *hdr)
req_len = smb2_ioctl_req_len(__hdr);
expect_resp_len = smb2_ioctl_resp_len(__hdr);
break;
- default:
+ case SMB2_CANCEL:
return 0;
+ default:
+ req_len = 1;
+ break;
}
- credit_charge = max(1, credit_charge);
- max_len = max(req_len, expect_resp_len);
+ credit_charge = max_t(unsigned short, credit_charge, 1);
+ max_len = max_t(unsigned int, req_len, expect_resp_len);
calc_credit_num = DIV_ROUND_UP(max_len, SMB2_MAX_BUFFER_SIZE);
if (credit_charge < calc_credit_num) {
- pr_err("Insufficient credit charge, given: %d, needed: %d\n",
- credit_charge, calc_credit_num);
+ ksmbd_debug(SMB, "Insufficient credit charge, given: %d, needed: %d\n",
+ credit_charge, calc_credit_num);
+ return 1;
+ } else if (credit_charge > conn->max_credits) {
+ ksmbd_debug(SMB, "Too large credit charge: %d\n", credit_charge);
return 1;
}
- return 0;
+ spin_lock(&conn->credits_lock);
+ if (credit_charge <= conn->total_credits) {
+ conn->total_credits -= credit_charge;
+ ret = 0;
+ } else {
+ ksmbd_debug(SMB, "Insufficient credits granted, given: %u, granted: %u\n",
+ credit_charge, conn->total_credits);
+ ret = 1;
+ }
+ spin_unlock(&conn->credits_lock);
+ return ret;
}
int ksmbd_smb2_check_message(struct ksmbd_work *work)
@@ -382,26 +400,20 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work)
}
}
- if ((work->conn->vals->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU) &&
- smb2_validate_credit_charge(hdr)) {
- work->conn->ops->set_rsp_status(work, STATUS_INVALID_PARAMETER);
- return 1;
- }
-
if (smb2_calc_size(hdr, &clc_len))
return 1;
if (len != clc_len) {
/* client can return one byte more due to implied bcc[0] */
if (clc_len == len + 1)
- return 0;
+ goto validate_credit;
/*
* Some windows servers (win2016) will pad also the final
* PDU in a compound to 8 bytes.
*/
if (ALIGN(clc_len, 8) == len)
- return 0;
+ goto validate_credit;
/*
* windows client also pad up to 8 bytes when compounding.
@@ -414,7 +426,7 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work)
"cli req padded more than expected. Length %d not %d for cmd:%d mid:%llu\n",
len, clc_len, command,
le64_to_cpu(hdr->MessageId));
- return 0;
+ goto validate_credit;
}
ksmbd_debug(SMB,
@@ -425,6 +437,13 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work)
return 1;
}
+validate_credit:
+ if ((work->conn->vals->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU) &&
+ smb2_validate_credit_charge(work->conn, hdr)) {
+ work->conn->ops->set_rsp_status(work, STATUS_INVALID_PARAMETER);
+ return 1;
+ }
+
return 0;
}
diff --git a/fs/ksmbd/smb2ops.c b/fs/ksmbd/smb2ops.c
index b06456eb587b..fb6a65d23139 100644
--- a/fs/ksmbd/smb2ops.c
+++ b/fs/ksmbd/smb2ops.c
@@ -284,6 +284,7 @@ int init_smb3_11_server(struct ksmbd_conn *conn)
void init_smb2_max_read_size(unsigned int sz)
{
+ sz = clamp_val(sz, SMB3_MIN_IOSIZE, SMB3_MAX_IOSIZE);
smb21_server_values.max_read_size = sz;
smb30_server_values.max_read_size = sz;
smb302_server_values.max_read_size = sz;
@@ -292,6 +293,7 @@ void init_smb2_max_read_size(unsigned int sz)
void init_smb2_max_write_size(unsigned int sz)
{
+ sz = clamp_val(sz, SMB3_MIN_IOSIZE, SMB3_MAX_IOSIZE);
smb21_server_values.max_write_size = sz;
smb30_server_values.max_write_size = sz;
smb302_server_values.max_write_size = sz;
@@ -300,6 +302,7 @@ void init_smb2_max_write_size(unsigned int sz)
void init_smb2_max_trans_size(unsigned int sz)
{
+ sz = clamp_val(sz, SMB3_MIN_IOSIZE, SMB3_MAX_IOSIZE);
smb21_server_values.max_trans_size = sz;
smb30_server_values.max_trans_size = sz;
smb302_server_values.max_trans_size = sz;
diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c
index 005aa93a49d6..7e448df3f847 100644
--- a/fs/ksmbd/smb2pdu.c
+++ b/fs/ksmbd/smb2pdu.c
@@ -292,22 +292,6 @@ int init_smb2_neg_rsp(struct ksmbd_work *work)
return 0;
}
-static int smb2_consume_credit_charge(struct ksmbd_work *work,
- unsigned short credit_charge)
-{
- struct ksmbd_conn *conn = work->conn;
- unsigned int rsp_credits = 1;
-
- if (!conn->total_credits)
- return 0;
-
- if (credit_charge > 0)
- rsp_credits = credit_charge;
-
- conn->total_credits -= rsp_credits;
- return rsp_credits;
-}
-
/**
* smb2_set_rsp_credits() - set number of credits in response buffer
* @work: smb work containing smb response buffer
@@ -317,49 +301,43 @@ int smb2_set_rsp_credits(struct ksmbd_work *work)
struct smb2_hdr *req_hdr = ksmbd_req_buf_next(work);
struct smb2_hdr *hdr = ksmbd_resp_buf_next(work);
struct ksmbd_conn *conn = work->conn;
- unsigned short credits_requested = le16_to_cpu(req_hdr->CreditRequest);
- unsigned short credit_charge = 1, credits_granted = 0;
- unsigned short aux_max, aux_credits, min_credits;
- int rsp_credit_charge;
+ unsigned short credits_requested;
+ unsigned short credit_charge, credits_granted = 0;
+ unsigned short aux_max, aux_credits;
- if (hdr->Command == SMB2_CANCEL)
- goto out;
+ if (work->send_no_response)
+ return 0;
- /* get default minimum credits by shifting maximum credits by 4 */
- min_credits = conn->max_credits >> 4;
+ hdr->CreditCharge = req_hdr->CreditCharge;
- if (conn->total_credits >= conn->max_credits) {
+ if (conn->total_credits > conn->max_credits) {
+ hdr->CreditRequest = 0;
pr_err("Total credits overflow: %d\n", conn->total_credits);
- conn->total_credits = min_credits;
- }
-
- rsp_credit_charge =
- smb2_consume_credit_charge(work, le16_to_cpu(req_hdr->CreditCharge));
- if (rsp_credit_charge < 0)
return -EINVAL;
+ }
- hdr->CreditCharge = cpu_to_le16(rsp_credit_charge);
+ credit_charge = max_t(unsigned short,
+ le16_to_cpu(req_hdr->CreditCharge), 1);
+ credits_requested = max_t(unsigned short,
+ le16_to_cpu(req_hdr->CreditRequest), 1);
- if (credits_requested > 0) {
- aux_credits = credits_requested - 1;
- aux_max = 32;
- if (hdr->Command == SMB2_NEGOTIATE)
- aux_max = 0;
- aux_credits = (aux_credits < aux_max) ? aux_credits : aux_max;
- credits_granted = aux_credits + credit_charge;
+ /* According to the smb2.credits smbtorture test, Windows Server
+ * 2016 or later grants up to 8192 credits at once.
+ *
+ * TODO: Need to adjust CreditRequest value according to
+ * current cpu load
+ */
+ aux_credits = credits_requested - 1;
+ if (hdr->Command == SMB2_NEGOTIATE)
+ aux_max = 0;
+ else
+ aux_max = conn->max_credits - credit_charge;
+ aux_credits = min_t(unsigned short, aux_credits, aux_max);
+ credits_granted = credit_charge + aux_credits;
- /* if credits granted per client is getting bigger than default
- * minimum credits then we should wrap it up within the limits.
- */
- if ((conn->total_credits + credits_granted) > min_credits)
- credits_granted = min_credits - conn->total_credits;
- /*
- * TODO: Need to adjuct CreditRequest value according to
- * current cpu load
- */
- } else if (conn->total_credits == 0) {
- credits_granted = 1;
- }
+ if (conn->max_credits - conn->total_credits < credits_granted)
+ credits_granted = conn->max_credits -
+ conn->total_credits;
conn->total_credits += credits_granted;
work->credits_granted += credits_granted;
@@ -368,7 +346,6 @@ int smb2_set_rsp_credits(struct ksmbd_work *work)
/* Update CreditRequest in last request */
hdr->CreditRequest = cpu_to_le16(work->credits_granted);
}
-out:
ksmbd_debug(SMB,
"credits: requested[%d] granted[%d] total_granted[%d]\n",
credits_requested, credits_granted,
@@ -472,6 +449,12 @@ bool is_chained_smb2_message(struct ksmbd_work *work)
return false;
}
+ if ((u64)get_rfc1002_len(work->response_buf) + MAX_CIFS_SMALL_BUFFER_SIZE >
+ work->response_sz) {
+ pr_err("next response offset exceeds response buffer size\n");
+ return false;
+ }
+
ksmbd_debug(SMB, "got SMB2 chained command\n");
init_chained_smb2_rsp(work);
return true;
@@ -541,7 +524,7 @@ int smb2_allocate_rsp_buf(struct ksmbd_work *work)
{
struct smb2_hdr *hdr = work->request_buf;
size_t small_sz = MAX_CIFS_SMALL_BUFFER_SIZE;
- size_t large_sz = work->conn->vals->max_trans_size + MAX_SMB2_HDR_SIZE;
+ size_t large_sz = small_sz + work->conn->vals->max_trans_size;
size_t sz = small_sz;
int cmd = le16_to_cpu(hdr->Command);
@@ -1274,19 +1257,13 @@ static int generate_preauth_hash(struct ksmbd_work *work)
return 0;
}
-static int decode_negotiation_token(struct ksmbd_work *work,
- struct negotiate_message *negblob)
+static int decode_negotiation_token(struct ksmbd_conn *conn,
+ struct negotiate_message *negblob,
+ size_t sz)
{
- struct ksmbd_conn *conn = work->conn;
- struct smb2_sess_setup_req *req;
- int sz;
-
if (!conn->use_spnego)
return -EINVAL;
- req = work->request_buf;
- sz = le16_to_cpu(req->SecurityBufferLength);
-
if (ksmbd_decode_negTokenInit((char *)negblob, sz, conn)) {
if (ksmbd_decode_negTokenTarg((char *)negblob, sz, conn)) {
conn->auth_mechs |= KSMBD_AUTH_NTLMSSP;
@@ -1298,9 +1275,9 @@ static int decode_negotiation_token(struct ksmbd_work *work,
}
static int ntlm_negotiate(struct ksmbd_work *work,
- struct negotiate_message *negblob)
+ struct negotiate_message *negblob,
+ size_t negblob_len)
{
- struct smb2_sess_setup_req *req = work->request_buf;
struct smb2_sess_setup_rsp *rsp = work->response_buf;
struct challenge_message *chgblob;
unsigned char *spnego_blob = NULL;
@@ -1309,8 +1286,7 @@ static int ntlm_negotiate(struct ksmbd_work *work,
int sz, rc;
ksmbd_debug(SMB, "negotiate phase\n");
- sz = le16_to_cpu(req->SecurityBufferLength);
- rc = ksmbd_decode_ntlmssp_neg_blob(negblob, sz, work->sess);
+ rc = ksmbd_decode_ntlmssp_neg_blob(negblob, negblob_len, work->sess);
if (rc)
return rc;
@@ -1378,12 +1354,23 @@ static struct ksmbd_user *session_user(struct ksmbd_conn *conn,
struct authenticate_message *authblob;
struct ksmbd_user *user;
char *name;
- int sz;
+ unsigned int auth_msg_len, name_off, name_len, secbuf_len;
+ secbuf_len = le16_to_cpu(req->SecurityBufferLength);
+ if (secbuf_len < sizeof(struct authenticate_message)) {
+ ksmbd_debug(SMB, "blob len %d too small\n", secbuf_len);
+ return NULL;
+ }
authblob = user_authblob(conn, req);
- sz = le32_to_cpu(authblob->UserName.BufferOffset);
- name = smb_strndup_from_utf16((const char *)authblob + sz,
- le16_to_cpu(authblob->UserName.Length),
+ name_off = le32_to_cpu(authblob->UserName.BufferOffset);
+ name_len = le16_to_cpu(authblob->UserName.Length);
+ auth_msg_len = le16_to_cpu(req->SecurityBufferOffset) + secbuf_len;
+
+ if (auth_msg_len < (u64)name_off + name_len)
+ return NULL;
+
+ name = smb_strndup_from_utf16((const char *)authblob + name_off,
+ name_len,
true,
conn->local_nls);
if (IS_ERR(name)) {
@@ -1629,6 +1616,7 @@ int smb2_sess_setup(struct ksmbd_work *work)
struct smb2_sess_setup_rsp *rsp = work->response_buf;
struct ksmbd_session *sess;
struct negotiate_message *negblob;
+ unsigned int negblob_len, negblob_off;
int rc = 0;
ksmbd_debug(SMB, "Received request for session setup\n");
@@ -1709,10 +1697,16 @@ int smb2_sess_setup(struct ksmbd_work *work)
if (sess->state == SMB2_SESSION_EXPIRED)
sess->state = SMB2_SESSION_IN_PROGRESS;
+ negblob_off = le16_to_cpu(req->SecurityBufferOffset);
+ negblob_len = le16_to_cpu(req->SecurityBufferLength);
+ if (negblob_off < (offsetof(struct smb2_sess_setup_req, Buffer) - 4) ||
+ negblob_len < offsetof(struct negotiate_message, NegotiateFlags))
+ return -EINVAL;
+
negblob = (struct negotiate_message *)((char *)&req->hdr.ProtocolId +
- le16_to_cpu(req->SecurityBufferOffset));
+ negblob_off);
- if (decode_negotiation_token(work, negblob) == 0) {
+ if (decode_negotiation_token(conn, negblob, negblob_len) == 0) {
if (conn->mechToken)
negblob = (struct negotiate_message *)conn->mechToken;
}
@@ -1736,7 +1730,7 @@ int smb2_sess_setup(struct ksmbd_work *work)
sess->Preauth_HashValue = NULL;
} else if (conn->preferred_auth_mech == KSMBD_AUTH_NTLMSSP) {
if (negblob->MessageType == NtLmNegotiate) {
- rc = ntlm_negotiate(work, negblob);
+ rc = ntlm_negotiate(work, negblob, negblob_len);
if (rc)
goto out_err;
rsp->hdr.Status =
@@ -1796,9 +1790,30 @@ out_err:
conn->mechToken = NULL;
}
- if (rc < 0 && sess) {
- ksmbd_session_destroy(sess);
- work->sess = NULL;
+ if (rc < 0) {
+ /*
+ * SecurityBufferOffset should be set to zero
+ * in session setup error response.
+ */
+ rsp->SecurityBufferOffset = 0;
+
+ if (sess) {
+ bool try_delay = false;
+
+ /*
+ * To avoid dictionary attacks (repeated session setups rapidly sent) to
+ * connect to server, ksmbd makes a delay of 5 seconds on session setup
+ * failure to make it harder to send enough random connection requests
+ * to break into a server.
+ */
+ if (sess->user && sess->user->flags & KSMBD_USER_FLAG_DELAY_SESSION)
+ try_delay = true;
+
+ ksmbd_session_destroy(sess);
+ work->sess = NULL;
+ if (try_delay)
+ ssleep(5);
+ }
}
return rc;
@@ -3779,6 +3794,24 @@ static int verify_info_level(int info_level)
return 0;
}
+static int smb2_calc_max_out_buf_len(struct ksmbd_work *work,
+ unsigned short hdr2_len,
+ unsigned int out_buf_len)
+{
+ int free_len;
+
+ if (out_buf_len > work->conn->vals->max_trans_size)
+ return -EINVAL;
+
+ free_len = (int)(work->response_sz -
+ (get_rfc1002_len(work->response_buf) + 4)) -
+ hdr2_len;
+ if (free_len < 0)
+ return -EINVAL;
+
+ return min_t(int, out_buf_len, free_len);
+}
+
int smb2_query_dir(struct ksmbd_work *work)
{
struct ksmbd_conn *conn = work->conn;
@@ -3855,9 +3888,13 @@ int smb2_query_dir(struct ksmbd_work *work)
memset(&d_info, 0, sizeof(struct ksmbd_dir_info));
d_info.wptr = (char *)rsp->Buffer;
d_info.rptr = (char *)rsp->Buffer;
- d_info.out_buf_len = (work->response_sz - (get_rfc1002_len(rsp_org) + 4));
- d_info.out_buf_len = min_t(int, d_info.out_buf_len, le32_to_cpu(req->OutputBufferLength)) -
- sizeof(struct smb2_query_directory_rsp);
+ d_info.out_buf_len =
+ smb2_calc_max_out_buf_len(work, 8,
+ le32_to_cpu(req->OutputBufferLength));
+ if (d_info.out_buf_len < 0) {
+ rc = -EINVAL;
+ goto err_out;
+ }
d_info.flags = srch_flag;
/*
@@ -4091,12 +4128,11 @@ static int smb2_get_ea(struct ksmbd_work *work, struct ksmbd_file *fp,
le32_to_cpu(req->Flags));
}
- buf_free_len = work->response_sz -
- (get_rfc1002_len(rsp_org) + 4) -
- sizeof(struct smb2_query_info_rsp);
-
- if (le32_to_cpu(req->OutputBufferLength) < buf_free_len)
- buf_free_len = le32_to_cpu(req->OutputBufferLength);
+ buf_free_len =
+ smb2_calc_max_out_buf_len(work, 8,
+ le32_to_cpu(req->OutputBufferLength));
+ if (buf_free_len < 0)
+ return -EINVAL;
rc = ksmbd_vfs_listxattr(path->dentry, &xattr_list);
if (rc < 0) {
@@ -4407,6 +4443,8 @@ static void get_file_stream_info(struct ksmbd_work *work,
struct path *path = &fp->filp->f_path;
ssize_t xattr_list_len;
int nbytes = 0, streamlen, stream_name_len, next, idx = 0;
+ int buf_free_len;
+ struct smb2_query_info_req *req = ksmbd_req_buf_next(work);
generic_fillattr(file_mnt_user_ns(fp->filp), file_inode(fp->filp),
&stat);
@@ -4420,6 +4458,12 @@ static void get_file_stream_info(struct ksmbd_work *work,
goto out;
}
+ buf_free_len =
+ smb2_calc_max_out_buf_len(work, 8,
+ le32_to_cpu(req->OutputBufferLength));
+ if (buf_free_len < 0)
+ goto out;
+
while (idx < xattr_list_len) {
stream_name = xattr_list + idx;
streamlen = strlen(stream_name);
@@ -4444,6 +4488,10 @@ static void get_file_stream_info(struct ksmbd_work *work,
streamlen = snprintf(stream_buf, streamlen + 1,
":%s", &stream_name[XATTR_NAME_STREAM_LEN]);
+ next = sizeof(struct smb2_file_stream_info) + streamlen * 2;
+ if (next > buf_free_len)
+ break;
+
file_info = (struct smb2_file_stream_info *)&rsp->Buffer[nbytes];
streamlen = smbConvertToUTF16((__le16 *)file_info->StreamName,
stream_buf, streamlen,
@@ -4454,12 +4502,13 @@ static void get_file_stream_info(struct ksmbd_work *work,
file_info->StreamSize = cpu_to_le64(stream_name_len);
file_info->StreamAllocationSize = cpu_to_le64(stream_name_len);
- next = sizeof(struct smb2_file_stream_info) + streamlen;
nbytes += next;
+ buf_free_len -= next;
file_info->NextEntryOffset = cpu_to_le32(next);
}
- if (!S_ISDIR(stat.mode)) {
+ if (!S_ISDIR(stat.mode) &&
+ buf_free_len >= sizeof(struct smb2_file_stream_info) + 7 * 2) {
file_info = (struct smb2_file_stream_info *)
&rsp->Buffer[nbytes];
streamlen = smbConvertToUTF16((__le16 *)file_info->StreamName,
@@ -6220,8 +6269,7 @@ static noinline int smb2_write_pipe(struct ksmbd_work *work)
(offsetof(struct smb2_write_req, Buffer) - 4)) {
data_buf = (char *)&req->Buffer[0];
} else {
- if ((le16_to_cpu(req->DataOffset) > get_rfc1002_len(req)) ||
- (le16_to_cpu(req->DataOffset) + length > get_rfc1002_len(req))) {
+ if ((u64)le16_to_cpu(req->DataOffset) + length > get_rfc1002_len(req)) {
pr_err("invalid write data offset %u, smb_len %u\n",
le16_to_cpu(req->DataOffset),
get_rfc1002_len(req));
@@ -6379,8 +6427,7 @@ int smb2_write(struct ksmbd_work *work)
(offsetof(struct smb2_write_req, Buffer) - 4)) {
data_buf = (char *)&req->Buffer[0];
} else {
- if ((le16_to_cpu(req->DataOffset) > get_rfc1002_len(req)) ||
- (le16_to_cpu(req->DataOffset) + length > get_rfc1002_len(req))) {
+ if ((u64)le16_to_cpu(req->DataOffset) + length > get_rfc1002_len(req)) {
pr_err("invalid write data offset %u, smb_len %u\n",
le16_to_cpu(req->DataOffset),
get_rfc1002_len(req));
@@ -7023,24 +7070,26 @@ out2:
return err;
}
-static int fsctl_copychunk(struct ksmbd_work *work, struct smb2_ioctl_req *req,
+static int fsctl_copychunk(struct ksmbd_work *work,
+ struct copychunk_ioctl_req *ci_req,
+ unsigned int cnt_code,
+ unsigned int input_count,
+ unsigned long long volatile_id,
+ unsigned long long persistent_id,
struct smb2_ioctl_rsp *rsp)
{
- struct copychunk_ioctl_req *ci_req;
struct copychunk_ioctl_rsp *ci_rsp;
struct ksmbd_file *src_fp = NULL, *dst_fp = NULL;
struct srv_copychunk *chunks;
unsigned int i, chunk_count, chunk_count_written = 0;
unsigned int chunk_size_written = 0;
loff_t total_size_written = 0;
- int ret, cnt_code;
+ int ret = 0;
- cnt_code = le32_to_cpu(req->CntCode);
- ci_req = (struct copychunk_ioctl_req *)&req->Buffer[0];
ci_rsp = (struct copychunk_ioctl_rsp *)&rsp->Buffer[0];
- rsp->VolatileFileId = req->VolatileFileId;
- rsp->PersistentFileId = req->PersistentFileId;
+ rsp->VolatileFileId = cpu_to_le64(volatile_id);
+ rsp->PersistentFileId = cpu_to_le64(persistent_id);
ci_rsp->ChunksWritten =
cpu_to_le32(ksmbd_server_side_copy_max_chunk_count());
ci_rsp->ChunkBytesWritten =
@@ -7050,12 +7099,13 @@ static int fsctl_copychunk(struct ksmbd_work *work, struct smb2_ioctl_req *req,
chunks = (struct srv_copychunk *)&ci_req->Chunks[0];
chunk_count = le32_to_cpu(ci_req->ChunkCount);
+ if (chunk_count == 0)
+ goto out;
total_size_written = 0;
/* verify the SRV_COPYCHUNK_COPY packet */
if (chunk_count > ksmbd_server_side_copy_max_chunk_count() ||
- le32_to_cpu(req->InputCount) <
- offsetof(struct copychunk_ioctl_req, Chunks) +
+ input_count < offsetof(struct copychunk_ioctl_req, Chunks) +
chunk_count * sizeof(struct srv_copychunk)) {
rsp->hdr.Status = STATUS_INVALID_PARAMETER;
return -EINVAL;
@@ -7076,9 +7126,7 @@ static int fsctl_copychunk(struct ksmbd_work *work, struct smb2_ioctl_req *req,
src_fp = ksmbd_lookup_foreign_fd(work,
le64_to_cpu(ci_req->ResumeKey[0]));
- dst_fp = ksmbd_lookup_fd_slow(work,
- le64_to_cpu(req->VolatileFileId),
- le64_to_cpu(req->PersistentFileId));
+ dst_fp = ksmbd_lookup_fd_slow(work, volatile_id, persistent_id);
ret = -EINVAL;
if (!src_fp ||
src_fp->persistent_id != le64_to_cpu(ci_req->ResumeKey[1])) {
@@ -7153,8 +7201,8 @@ static __be32 idev_ipv4_address(struct in_device *idev)
}
static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn,
- struct smb2_ioctl_req *req,
- struct smb2_ioctl_rsp *rsp)
+ struct smb2_ioctl_rsp *rsp,
+ unsigned int out_buf_len)
{
struct network_interface_info_ioctl_rsp *nii_rsp = NULL;
int nbytes = 0;
@@ -7166,6 +7214,12 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn,
rtnl_lock();
for_each_netdev(&init_net, netdev) {
+ if (out_buf_len <
+ nbytes + sizeof(struct network_interface_info_ioctl_rsp)) {
+ rtnl_unlock();
+ return -ENOSPC;
+ }
+
if (netdev->type == ARPHRD_LOOPBACK)
continue;
@@ -7245,11 +7299,6 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn,
if (nii_rsp)
nii_rsp->Next = 0;
- if (!nbytes) {
- rsp->hdr.Status = STATUS_BUFFER_TOO_SMALL;
- return -EINVAL;
- }
-
rsp->PersistentFileId = cpu_to_le64(SMB2_NO_FID);
rsp->VolatileFileId = cpu_to_le64(SMB2_NO_FID);
return nbytes;
@@ -7257,11 +7306,16 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn,
static int fsctl_validate_negotiate_info(struct ksmbd_conn *conn,
struct validate_negotiate_info_req *neg_req,
- struct validate_negotiate_info_rsp *neg_rsp)
+ struct validate_negotiate_info_rsp *neg_rsp,
+ unsigned int in_buf_len)
{
int ret = 0;
int dialect;
+ if (in_buf_len < sizeof(struct validate_negotiate_info_req) +
+ le16_to_cpu(neg_req->DialectCount) * sizeof(__le16))
+ return -EINVAL;
+
dialect = ksmbd_lookup_dialect_by_id(neg_req->Dialects,
neg_req->DialectCount);
if (dialect == BAD_PROT_ID || dialect != conn->dialect) {
@@ -7295,7 +7349,7 @@ err_out:
static int fsctl_query_allocated_ranges(struct ksmbd_work *work, u64 id,
struct file_allocated_range_buffer *qar_req,
struct file_allocated_range_buffer *qar_rsp,
- int in_count, int *out_count)
+ unsigned int in_count, unsigned int *out_count)
{
struct ksmbd_file *fp;
loff_t start, length;
@@ -7322,7 +7376,8 @@ static int fsctl_query_allocated_ranges(struct ksmbd_work *work, u64 id,
}
static int fsctl_pipe_transceive(struct ksmbd_work *work, u64 id,
- int out_buf_len, struct smb2_ioctl_req *req,
+ unsigned int out_buf_len,
+ struct smb2_ioctl_req *req,
struct smb2_ioctl_rsp *rsp)
{
struct ksmbd_rpc_command *rpc_resp;
@@ -7436,8 +7491,7 @@ int smb2_ioctl(struct ksmbd_work *work)
{
struct smb2_ioctl_req *req;
struct smb2_ioctl_rsp *rsp, *rsp_org;
- int cnt_code, nbytes = 0;
- int out_buf_len;
+ unsigned int cnt_code, nbytes = 0, out_buf_len, in_buf_len;
u64 id = KSMBD_NO_FID;
struct ksmbd_conn *conn = work->conn;
int ret = 0;
@@ -7465,8 +7519,14 @@ int smb2_ioctl(struct ksmbd_work *work)
}
cnt_code = le32_to_cpu(req->CntCode);
- out_buf_len = le32_to_cpu(req->MaxOutputResponse);
- out_buf_len = min(KSMBD_IPC_MAX_PAYLOAD, out_buf_len);
+ ret = smb2_calc_max_out_buf_len(work, 48,
+ le32_to_cpu(req->MaxOutputResponse));
+ if (ret < 0) {
+ rsp->hdr.Status = STATUS_INVALID_PARAMETER;
+ goto out;
+ }
+ out_buf_len = (unsigned int)ret;
+ in_buf_len = le32_to_cpu(req->InputCount);
switch (cnt_code) {
case FSCTL_DFS_GET_REFERRALS:
@@ -7494,6 +7554,7 @@ int smb2_ioctl(struct ksmbd_work *work)
break;
}
case FSCTL_PIPE_TRANSCEIVE:
+ out_buf_len = min_t(u32, KSMBD_IPC_MAX_PAYLOAD, out_buf_len);
nbytes = fsctl_pipe_transceive(work, id, out_buf_len, req, rsp);
break;
case FSCTL_VALIDATE_NEGOTIATE_INFO:
@@ -7502,9 +7563,16 @@ int smb2_ioctl(struct ksmbd_work *work)
goto out;
}
+ if (in_buf_len < sizeof(struct validate_negotiate_info_req))
+ return -EINVAL;
+
+ if (out_buf_len < sizeof(struct validate_negotiate_info_rsp))
+ return -EINVAL;
+
ret = fsctl_validate_negotiate_info(conn,
(struct validate_negotiate_info_req *)&req->Buffer[0],
- (struct validate_negotiate_info_rsp *)&rsp->Buffer[0]);
+ (struct validate_negotiate_info_rsp *)&rsp->Buffer[0],
+ in_buf_len);
if (ret < 0)
goto out;
@@ -7513,9 +7581,10 @@ int smb2_ioctl(struct ksmbd_work *work)
rsp->VolatileFileId = cpu_to_le64(SMB2_NO_FID);
break;
case FSCTL_QUERY_NETWORK_INTERFACE_INFO:
- nbytes = fsctl_query_iface_info_ioctl(conn, req, rsp);
- if (nbytes < 0)
+ ret = fsctl_query_iface_info_ioctl(conn, rsp, out_buf_len);
+ if (ret < 0)
goto out;
+ nbytes = ret;
break;
case FSCTL_REQUEST_RESUME_KEY:
if (out_buf_len < sizeof(struct resume_key_ioctl_rsp)) {
@@ -7540,15 +7609,33 @@ int smb2_ioctl(struct ksmbd_work *work)
goto out;
}
+ if (in_buf_len < sizeof(struct copychunk_ioctl_req)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
if (out_buf_len < sizeof(struct copychunk_ioctl_rsp)) {
ret = -EINVAL;
goto out;
}
nbytes = sizeof(struct copychunk_ioctl_rsp);
- fsctl_copychunk(work, req, rsp);
+ rsp->VolatileFileId = req->VolatileFileId;
+ rsp->PersistentFileId = req->PersistentFileId;
+ fsctl_copychunk(work,
+ (struct copychunk_ioctl_req *)&req->Buffer[0],
+ le32_to_cpu(req->CntCode),
+ le32_to_cpu(req->InputCount),
+ le64_to_cpu(req->VolatileFileId),
+ le64_to_cpu(req->PersistentFileId),
+ rsp);
break;
case FSCTL_SET_SPARSE:
+ if (in_buf_len < sizeof(struct file_sparse)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
ret = fsctl_set_sparse(work, id,
(struct file_sparse *)&req->Buffer[0]);
if (ret < 0)
@@ -7567,6 +7654,11 @@ int smb2_ioctl(struct ksmbd_work *work)
goto out;
}
+ if (in_buf_len < sizeof(struct file_zero_data_information)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
zero_data =
(struct file_zero_data_information *)&req->Buffer[0];
@@ -7586,6 +7678,11 @@ int smb2_ioctl(struct ksmbd_work *work)
break;
}
case FSCTL_QUERY_ALLOCATED_RANGES:
+ if (in_buf_len < sizeof(struct file_allocated_range_buffer)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
ret = fsctl_query_allocated_ranges(work, id,
(struct file_allocated_range_buffer *)&req->Buffer[0],
(struct file_allocated_range_buffer *)&rsp->Buffer[0],
@@ -7626,6 +7723,11 @@ int smb2_ioctl(struct ksmbd_work *work)
struct duplicate_extents_to_file *dup_ext;
loff_t src_off, dst_off, length, cloned;
+ if (in_buf_len < sizeof(struct duplicate_extents_to_file)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
dup_ext = (struct duplicate_extents_to_file *)&req->Buffer[0];
fp_in = ksmbd_lookup_fd_slow(work, dup_ext->VolatileFileHandle,
@@ -7696,6 +7798,8 @@ out:
rsp->hdr.Status = STATUS_OBJECT_NAME_NOT_FOUND;
else if (ret == -EOPNOTSUPP)
rsp->hdr.Status = STATUS_NOT_SUPPORTED;
+ else if (ret == -ENOSPC)
+ rsp->hdr.Status = STATUS_BUFFER_TOO_SMALL;
else if (ret < 0 || rsp->hdr.Status == 0)
rsp->hdr.Status = STATUS_INVALID_PARAMETER;
smb2_set_err_rsp(work);
diff --git a/fs/ksmbd/smb2pdu.h b/fs/ksmbd/smb2pdu.h
index a6dec5ec6a54..ff5a2f01d34a 100644
--- a/fs/ksmbd/smb2pdu.h
+++ b/fs/ksmbd/smb2pdu.h
@@ -113,6 +113,8 @@
#define SMB21_DEFAULT_IOSIZE (1024 * 1024)
#define SMB3_DEFAULT_IOSIZE (4 * 1024 * 1024)
#define SMB3_DEFAULT_TRANS_SIZE (1024 * 1024)
+#define SMB3_MIN_IOSIZE (64 * 1024)
+#define SMB3_MAX_IOSIZE (8 * 1024 * 1024)
/*
* SMB2 Header Definition
diff --git a/fs/ksmbd/transport_ipc.c b/fs/ksmbd/transport_ipc.c
index 44aea33a67fa..1acf1892a466 100644
--- a/fs/ksmbd/transport_ipc.c
+++ b/fs/ksmbd/transport_ipc.c
@@ -601,7 +601,7 @@ int ksmbd_ipc_tree_disconnect_request(unsigned long long session_id,
return ret;
}
-int ksmbd_ipc_logout_request(const char *account)
+int ksmbd_ipc_logout_request(const char *account, int flags)
{
struct ksmbd_ipc_msg *msg;
struct ksmbd_logout_request *req;
@@ -616,6 +616,7 @@ int ksmbd_ipc_logout_request(const char *account)
msg->type = KSMBD_EVENT_LOGOUT_REQUEST;
req = (struct ksmbd_logout_request *)msg->payload;
+ req->account_flags = flags;
strscpy(req->account, account, KSMBD_REQ_MAX_ACCOUNT_NAME_SZ);
ret = ipc_msg_send(msg);
diff --git a/fs/ksmbd/transport_ipc.h b/fs/ksmbd/transport_ipc.h
index 9eacc895ffdb..5e5b90a0c187 100644
--- a/fs/ksmbd/transport_ipc.h
+++ b/fs/ksmbd/transport_ipc.h
@@ -25,7 +25,7 @@ ksmbd_ipc_tree_connect_request(struct ksmbd_session *sess,
struct sockaddr *peer_addr);
int ksmbd_ipc_tree_disconnect_request(unsigned long long session_id,
unsigned long long connect_id);
-int ksmbd_ipc_logout_request(const char *account);
+int ksmbd_ipc_logout_request(const char *account, int flags);
struct ksmbd_share_config_response *
ksmbd_ipc_share_config_request(const char *name);
struct ksmbd_spnego_authen_response *
diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c
index 3a7fa23ba850..a2fd5a4d4cd5 100644
--- a/fs/ksmbd/transport_rdma.c
+++ b/fs/ksmbd/transport_rdma.c
@@ -549,6 +549,10 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
switch (recvmsg->type) {
case SMB_DIRECT_MSG_NEGOTIATE_REQ:
+ if (wc->byte_len < sizeof(struct smb_direct_negotiate_req)) {
+ put_empty_recvmsg(t, recvmsg);
+ return;
+ }
t->negotiation_requested = true;
t->full_packet_received = true;
wake_up_interruptible(&t->wait_status);
@@ -556,10 +560,23 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
case SMB_DIRECT_MSG_DATA_TRANSFER: {
struct smb_direct_data_transfer *data_transfer =
(struct smb_direct_data_transfer *)recvmsg->packet;
- int data_length = le32_to_cpu(data_transfer->data_length);
+ unsigned int data_length;
int avail_recvmsg_count, receive_credits;
+ if (wc->byte_len <
+ offsetof(struct smb_direct_data_transfer, padding)) {
+ put_empty_recvmsg(t, recvmsg);
+ return;
+ }
+
+ data_length = le32_to_cpu(data_transfer->data_length);
if (data_length) {
+ if (wc->byte_len < sizeof(struct smb_direct_data_transfer) +
+ (u64)data_length) {
+ put_empty_recvmsg(t, recvmsg);
+ return;
+ }
+
if (t->full_packet_received)
recvmsg->first_segment = true;
@@ -568,7 +585,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
else
t->full_packet_received = true;
- enqueue_reassembly(t, recvmsg, data_length);
+ enqueue_reassembly(t, recvmsg, (int)data_length);
wake_up_interruptible(&t->wait_reassembly_queue);
spin_lock(&t->receive_credit_lock);
diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c
index b41954294d38..835b384b0895 100644
--- a/fs/ksmbd/vfs.c
+++ b/fs/ksmbd/vfs.c
@@ -1023,7 +1023,7 @@ int ksmbd_vfs_zero_data(struct ksmbd_work *work, struct ksmbd_file *fp,
int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length,
struct file_allocated_range_buffer *ranges,
- int in_count, int *out_count)
+ unsigned int in_count, unsigned int *out_count)
{
struct file *f = fp->filp;
struct inode *inode = file_inode(fp->filp);
diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h
index 7b1dcaa3fbdc..b0d5b8feb4a3 100644
--- a/fs/ksmbd/vfs.h
+++ b/fs/ksmbd/vfs.h
@@ -166,7 +166,7 @@ int ksmbd_vfs_zero_data(struct ksmbd_work *work, struct ksmbd_file *fp,
struct file_allocated_range_buffer;
int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length,
struct file_allocated_range_buffer *ranges,
- int in_count, int *out_count);
+ unsigned int in_count, unsigned int *out_count);
int ksmbd_vfs_unlink(struct user_namespace *user_ns,
struct dentry *dir, struct dentry *dentry);
void *ksmbd_vfs_init_kstat(char **p, struct ksmbd_kstat *ksmbd_kstat);
diff --git a/fs/locks.c b/fs/locks.c
index 3d6fb4ae847b..0fca9d680978 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2,117 +2,11 @@
/*
* linux/fs/locks.c
*
- * Provide support for fcntl()'s F_GETLK, F_SETLK, and F_SETLKW calls.
- * Doug Evans (dje@spiff.uucp), August 07, 1992
+ * We implement four types of file locks: BSD locks, posix locks, open
+ * file description locks, and leases. For details about BSD locks,
+ * see the flock(2) man page; for details about the other three, see
+ * fcntl(2).
*
- * Deadlock detection added.
- * FIXME: one thing isn't handled yet:
- * - mandatory locks (requires lots of changes elsewhere)
- * Kelly Carmichael (kelly@[142.24.8.65]), September 17, 1994.
- *
- * Miscellaneous edits, and a total rewrite of posix_lock_file() code.
- * Kai Petzke (wpp@marie.physik.tu-berlin.de), 1994
- *
- * Converted file_lock_table to a linked list from an array, which eliminates
- * the limits on how many active file locks are open.
- * Chad Page (pageone@netcom.com), November 27, 1994
- *
- * Removed dependency on file descriptors. dup()'ed file descriptors now
- * get the same locks as the original file descriptors, and a close() on
- * any file descriptor removes ALL the locks on the file for the current
- * process. Since locks still depend on the process id, locks are inherited
- * after an exec() but not after a fork(). This agrees with POSIX, and both
- * BSD and SVR4 practice.
- * Andy Walker (andy@lysaker.kvaerner.no), February 14, 1995
- *
- * Scrapped free list which is redundant now that we allocate locks
- * dynamically with kmalloc()/kfree().
- * Andy Walker (andy@lysaker.kvaerner.no), February 21, 1995
- *
- * Implemented two lock personalities - FL_FLOCK and FL_POSIX.
- *
- * FL_POSIX locks are created with calls to fcntl() and lockf() through the
- * fcntl() system call. They have the semantics described above.
- *
- * FL_FLOCK locks are created with calls to flock(), through the flock()
- * system call, which is new. Old C libraries implement flock() via fcntl()
- * and will continue to use the old, broken implementation.
- *
- * FL_FLOCK locks follow the 4.4 BSD flock() semantics. They are associated
- * with a file pointer (filp). As a result they can be shared by a parent
- * process and its children after a fork(). They are removed when the last
- * file descriptor referring to the file pointer is closed (unless explicitly
- * unlocked).
- *
- * FL_FLOCK locks never deadlock, an existing lock is always removed before
- * upgrading from shared to exclusive (or vice versa). When this happens
- * any processes blocked by the current lock are woken up and allowed to
- * run before the new lock is applied.
- * Andy Walker (andy@lysaker.kvaerner.no), June 09, 1995
- *
- * Removed some race conditions in flock_lock_file(), marked other possible
- * races. Just grep for FIXME to see them.
- * Dmitry Gorodchanin (pgmdsg@ibi.com), February 09, 1996.
- *
- * Addressed Dmitry's concerns. Deadlock checking no longer recursive.
- * Lock allocation changed to GFP_ATOMIC as we can't afford to sleep
- * once we've checked for blocking and deadlocking.
- * Andy Walker (andy@lysaker.kvaerner.no), April 03, 1996.
- *
- * Initial implementation of mandatory locks. SunOS turned out to be
- * a rotten model, so I implemented the "obvious" semantics.
- * See 'Documentation/filesystems/mandatory-locking.rst' for details.
- * Andy Walker (andy@lysaker.kvaerner.no), April 06, 1996.
- *
- * Don't allow mandatory locks on mmap()'ed files. Added simple functions to
- * check if a file has mandatory locks, used by mmap(), open() and creat() to
- * see if system call should be rejected. Ref. HP-UX/SunOS/Solaris Reference
- * Manual, Section 2.
- * Andy Walker (andy@lysaker.kvaerner.no), April 09, 1996.
- *
- * Tidied up block list handling. Added '/proc/locks' interface.
- * Andy Walker (andy@lysaker.kvaerner.no), April 24, 1996.
- *
- * Fixed deadlock condition for pathological code that mixes calls to
- * flock() and fcntl().
- * Andy Walker (andy@lysaker.kvaerner.no), April 29, 1996.
- *
- * Allow only one type of locking scheme (FL_POSIX or FL_FLOCK) to be in use
- * for a given file at a time. Changed the CONFIG_LOCK_MANDATORY scheme to
- * guarantee sensible behaviour in the case where file system modules might
- * be compiled with different options than the kernel itself.
- * Andy Walker (andy@lysaker.kvaerner.no), May 15, 1996.
- *
- * Added a couple of missing wake_up() calls. Thanks to Thomas Meckel
- * (Thomas.Meckel@mni.fh-giessen.de) for spotting this.
- * Andy Walker (andy@lysaker.kvaerner.no), May 15, 1996.
- *
- * Changed FL_POSIX locks to use the block list in the same way as FL_FLOCK
- * locks. Changed process synchronisation to avoid dereferencing locks that
- * have already been freed.
- * Andy Walker (andy@lysaker.kvaerner.no), Sep 21, 1996.
- *
- * Made the block list a circular list to minimise searching in the list.
- * Andy Walker (andy@lysaker.kvaerner.no), Sep 25, 1996.
- *
- * Made mandatory locking a mount option. Default is not to allow mandatory
- * locking.
- * Andy Walker (andy@lysaker.kvaerner.no), Oct 04, 1996.
- *
- * Some adaptations for NFS support.
- * Olaf Kirch (okir@monad.swb.de), Dec 1996,
- *
- * Fixed /proc/locks interface so that we can't overrun the buffer we are handed.
- * Andy Walker (andy@lysaker.kvaerner.no), May 12, 1997.
- *
- * Use slab allocator instead of kmalloc/kfree.
- * Use generic list implementation from <linux/list.h>.
- * Sped up posix_locks_deadlock by only considering blocked locks.
- * Matthew Wilcox <willy@debian.org>, March, 2000.
- *
- * Leases and LOCK_MAND
- * Matthew Wilcox <willy@debian.org>, June, 2000.
- * Stephen Rothwell <sfr@canb.auug.org.au>, June, 2000.
*
* Locking conflicts and dependencies:
* If multiple threads attempt to lock the same byte (or flock the same file)
@@ -461,8 +355,6 @@ static void locks_move_blocks(struct file_lock *new, struct file_lock *fl)
}
static inline int flock_translate_cmd(int cmd) {
- if (cmd & LOCK_MAND)
- return cmd & (LOCK_MAND | LOCK_RW);
switch (cmd) {
case LOCK_SH:
return F_RDLCK;
@@ -942,8 +834,6 @@ static bool flock_locks_conflict(struct file_lock *caller_fl,
*/
if (caller_fl->fl_file == sys_fl->fl_file)
return false;
- if ((caller_fl->fl_type & LOCK_MAND) || (sys_fl->fl_type & LOCK_MAND))
- return false;
return locks_conflict(caller_fl, sys_fl);
}
@@ -2116,11 +2006,9 @@ EXPORT_SYMBOL(locks_lock_inode_wait);
* - %LOCK_SH -- a shared lock.
* - %LOCK_EX -- an exclusive lock.
* - %LOCK_UN -- remove an existing lock.
- * - %LOCK_MAND -- a 'mandatory' flock.
- * This exists to emulate Windows Share Modes.
+ * - %LOCK_MAND -- a 'mandatory' flock. (DEPRECATED)
*
- * %LOCK_MAND can be combined with %LOCK_READ or %LOCK_WRITE to allow other
- * processes read and write access respectively.
+ * %LOCK_MAND support has been removed from the kernel.
*/
SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
{
@@ -2137,9 +2025,22 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
cmd &= ~LOCK_NB;
unlock = (cmd == LOCK_UN);
- if (!unlock && !(cmd & LOCK_MAND) &&
- !(f.file->f_mode & (FMODE_READ|FMODE_WRITE)))
+ if (!unlock && !(f.file->f_mode & (FMODE_READ|FMODE_WRITE)))
+ goto out_putf;
+
+ /*
+ * LOCK_MAND locks were broken for a long time in that they never
+ * conflicted with one another and didn't prevent any sort of open,
+ * read or write activity.
+ *
+ * Just ignore these requests now, to preserve legacy behavior, but
+ * throw a warning to let people know that they don't actually work.
+ */
+ if (cmd & LOCK_MAND) {
+ pr_warn_once("Attempt to set a LOCK_MAND lock via flock(2). This support has been removed and the request ignored.\n");
+ error = 0;
goto out_putf;
+ }
lock = flock_make_lock(f.file, cmd, NULL);
if (IS_ERR(lock)) {
@@ -2718,6 +2619,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
struct inode *inode = NULL;
unsigned int fl_pid;
struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb);
+ int type;
fl_pid = locks_translate_pid(fl, proc_pidns);
/*
@@ -2745,11 +2647,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
seq_printf(f, " %s ",
(inode == NULL) ? "*NOINODE*" : "ADVISORY ");
} else if (IS_FLOCK(fl)) {
- if (fl->fl_type & LOCK_MAND) {
- seq_puts(f, "FLOCK MSNFS ");
- } else {
- seq_puts(f, "FLOCK ADVISORY ");
- }
+ seq_puts(f, "FLOCK ADVISORY ");
} else if (IS_LEASE(fl)) {
if (fl->fl_flags & FL_DELEG)
seq_puts(f, "DELEG ");
@@ -2765,17 +2663,10 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
} else {
seq_puts(f, "UNKNOWN UNKNOWN ");
}
- if (fl->fl_type & LOCK_MAND) {
- seq_printf(f, "%s ",
- (fl->fl_type & LOCK_READ)
- ? (fl->fl_type & LOCK_WRITE) ? "RW " : "READ "
- : (fl->fl_type & LOCK_WRITE) ? "WRITE" : "NONE ");
- } else {
- int type = IS_LEASE(fl) ? target_leasetype(fl) : fl->fl_type;
+ type = IS_LEASE(fl) ? target_leasetype(fl) : fl->fl_type;
- seq_printf(f, "%s ", (type == F_WRLCK) ? "WRITE" :
- (type == F_RDLCK) ? "READ" : "UNLCK");
- }
+ seq_printf(f, "%s ", (type == F_WRLCK) ? "WRITE" :
+ (type == F_RDLCK) ? "READ" : "UNLCK");
if (inode) {
/* userspace relies on this representation of dev_t */
seq_printf(f, "%d %02x:%02x:%lu ", fl_pid,
diff --git a/fs/namei.c b/fs/namei.c
index 1946d9667790..1f9d2187c765 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3076,9 +3076,7 @@ static int handle_truncate(struct user_namespace *mnt_userns, struct file *filp)
int error = get_write_access(inode);
if (error)
return error;
- /*
- * Refuse to truncate files with mandatory locks held on them.
- */
+
error = security_path_truncate(path);
if (!error) {
error = do_truncate(mnt_userns, path->dentry, 0,
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index aa353fd58240..24e7dccce355 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -843,15 +843,6 @@ int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
if (!(fl->fl_flags & FL_FLOCK))
return -ENOLCK;
- /*
- * The NFSv4 protocol doesn't support LOCK_MAND, which is not part of
- * any standard. In principle we might be able to support LOCK_MAND
- * on NFSv2/3 since NLMv3/4 support DOS share modes, but for now the
- * NFS code is not set up for it.
- */
- if (fl->fl_type & LOCK_MAND)
- return -EINVAL;
-
if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK)
is_local = 1;
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index f1cc8258d34a..5d9ae17bd443 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -7045,7 +7045,7 @@ void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
int ocfs2_convert_inline_data_to_extents(struct inode *inode,
struct buffer_head *di_bh)
{
- int ret, i, has_data, num_pages = 0;
+ int ret, has_data, num_pages = 0;
int need_free = 0;
u32 bit_off, num;
handle_t *handle;
@@ -7054,26 +7054,17 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
struct ocfs2_alloc_context *data_ac = NULL;
- struct page **pages = NULL;
- loff_t end = osb->s_clustersize;
+ struct page *page = NULL;
struct ocfs2_extent_tree et;
int did_quota = 0;
has_data = i_size_read(inode) ? 1 : 0;
if (has_data) {
- pages = kcalloc(ocfs2_pages_per_cluster(osb->sb),
- sizeof(struct page *), GFP_NOFS);
- if (pages == NULL) {
- ret = -ENOMEM;
- mlog_errno(ret);
- return ret;
- }
-
ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
if (ret) {
mlog_errno(ret);
- goto free_pages;
+ goto out;
}
}
@@ -7093,7 +7084,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
}
if (has_data) {
- unsigned int page_end;
+ unsigned int page_end = min_t(unsigned, PAGE_SIZE,
+ osb->s_clustersize);
u64 phys;
ret = dquot_alloc_space_nodirty(inode,
@@ -7117,15 +7109,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
*/
block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
- /*
- * Non sparse file systems zero on extend, so no need
- * to do that now.
- */
- if (!ocfs2_sparse_alloc(osb) &&
- PAGE_SIZE < osb->s_clustersize)
- end = PAGE_SIZE;
-
- ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
+ ret = ocfs2_grab_eof_pages(inode, 0, page_end, &page,
+ &num_pages);
if (ret) {
mlog_errno(ret);
need_free = 1;
@@ -7136,20 +7121,15 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
* This should populate the 1st page for us and mark
* it up to date.
*/
- ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
+ ret = ocfs2_read_inline_data(inode, page, di_bh);
if (ret) {
mlog_errno(ret);
need_free = 1;
goto out_unlock;
}
- page_end = PAGE_SIZE;
- if (PAGE_SIZE > osb->s_clustersize)
- page_end = osb->s_clustersize;
-
- for (i = 0; i < num_pages; i++)
- ocfs2_map_and_dirty_page(inode, handle, 0, page_end,
- pages[i], i > 0, &phys);
+ ocfs2_map_and_dirty_page(inode, handle, 0, page_end, page, 0,
+ &phys);
}
spin_lock(&oi->ip_lock);
@@ -7180,8 +7160,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
}
out_unlock:
- if (pages)
- ocfs2_unlock_and_free_pages(pages, num_pages);
+ if (page)
+ ocfs2_unlock_and_free_pages(&page, num_pages);
out_commit:
if (ret < 0 && did_quota)
@@ -7205,8 +7185,6 @@ out_commit:
out:
if (data_ac)
ocfs2_free_alloc_context(data_ac);
-free_pages:
- kfree(pages);
return ret;
}
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 8521942f5af2..481017e1dac5 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1251,7 +1251,7 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
{
struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
struct journal_head *jh;
- int ret;
+ int ret = 1;
if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
return 0;
@@ -1259,14 +1259,18 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
if (!buffer_jbd(bg_bh))
return 1;
- jh = bh2jh(bg_bh);
- spin_lock(&jh->b_state_lock);
- bg = (struct ocfs2_group_desc *) jh->b_committed_data;
- if (bg)
- ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
- else
- ret = 1;
- spin_unlock(&jh->b_state_lock);
+ jbd_lock_bh_journal_head(bg_bh);
+ if (buffer_jbd(bg_bh)) {
+ jh = bh2jh(bg_bh);
+ spin_lock(&jh->b_state_lock);
+ bg = (struct ocfs2_group_desc *) jh->b_committed_data;
+ if (bg)
+ ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
+ else
+ ret = 1;
+ spin_unlock(&jh->b_state_lock);
+ }
+ jbd_unlock_bh_journal_head(bg_bh);
return ret;
}
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index c86bd4e60e20..5c914ce9b3ac 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -2167,11 +2167,17 @@ static int ocfs2_initialize_super(struct super_block *sb,
}
if (ocfs2_clusterinfo_valid(osb)) {
+ /*
+ * ci_stack and ci_cluster in ocfs2_cluster_info may not be null
+ * terminated, so make sure no overflow happens here by using
+ * memcpy. Destination strings will always be null terminated
+ * because osb is allocated using kzalloc.
+ */
osb->osb_stackflags =
OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags;
- strlcpy(osb->osb_cluster_stack,
+ memcpy(osb->osb_cluster_stack,
OCFS2_RAW_SB(di)->s_cluster_info.ci_stack,
- OCFS2_STACK_LABEL_LEN + 1);
+ OCFS2_STACK_LABEL_LEN);
if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) {
mlog(ML_ERROR,
"couldn't mount because of an invalid "
@@ -2180,9 +2186,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
status = -EINVAL;
goto bail;
}
- strlcpy(osb->osb_cluster_name,
+ memcpy(osb->osb_cluster_name,
OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster,
- OCFS2_CLUSTER_NAME_LEN + 1);
+ OCFS2_CLUSTER_NAME_LEN);
} else {
/* The empty string is identical with classic tools that
* don't know about s_cluster_info. */
diff --git a/fs/read_write.c b/fs/read_write.c
index af057c57bdc6..0074afa7ecb3 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -368,10 +368,6 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t
if (unlikely((ssize_t) count < 0))
return -EINVAL;
- /*
- * ranged mandatory locking does not apply to streams - it makes sense
- * only for files where position has a meaning.
- */
if (ppos) {
loff_t pos = *ppos;
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 003f0d31743e..22bf14ab2d16 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1827,9 +1827,15 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
if (mode_wp && mode_dontwake)
return -EINVAL;
- ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
- uffdio_wp.range.len, mode_wp,
- &ctx->mmap_changing);
+ if (mmget_not_zero(ctx->mm)) {
+ ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
+ uffdio_wp.range.len, mode_wp,
+ &ctx->mmap_changing);
+ mmput(ctx->mm);
+ } else {
+ return -ESRCH;
+ }
+
if (ret)
return ret;