summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/afs/cell.c3
-rw-r--r--fs/afs/dir.c12
-rw-r--r--fs/afs/dir_edit.c6
-rw-r--r--fs/afs/file.c78
-rw-r--r--fs/afs/internal.h57
-rw-r--r--fs/afs/write.c105
-rw-r--r--fs/afs/xattr.c9
-rw-r--r--fs/afs/yfsclient.c1
-rw-r--r--fs/binfmt_elf.c2
-rw-r--r--fs/btrfs/backref.c13
-rw-r--r--fs/btrfs/block-group.c1
-rw-r--r--fs/btrfs/ctree.h2
-rw-r--r--fs/btrfs/dev-replace.c5
-rw-r--r--fs/btrfs/disk-io.c139
-rw-r--r--fs/btrfs/disk-io.h3
-rw-r--r--fs/btrfs/extent-tree.c2
-rw-r--r--fs/btrfs/file.c3
-rw-r--r--fs/btrfs/inode.c8
-rw-r--r--fs/btrfs/qgroup.c18
-rw-r--r--fs/btrfs/reada.c47
-rw-r--r--fs/btrfs/tree-checker.c18
-rw-r--r--fs/btrfs/volumes.c5
-rw-r--r--fs/btrfs/volumes.h12
-rw-r--r--fs/cachefiles/rdwr.c3
-rw-r--r--fs/ceph/caps.c2
-rw-r--r--fs/ceph/mds_client.c50
-rw-r--r--fs/ceph/mds_client.h1
-rw-r--r--fs/ceph/quota.c2
-rw-r--r--fs/ceph/snap.c2
-rw-r--r--fs/debugfs/file.c15
-rw-r--r--fs/ext4/dir.c64
-rw-r--r--fs/ext4/ext4.h20
-rw-r--r--fs/ext4/extents.c30
-rw-r--r--fs/ext4/fast_commit.c37
-rw-r--r--fs/ext4/hash.c2
-rw-r--r--fs/ext4/inode.c15
-rw-r--r--fs/ext4/namei.c20
-rw-r--r--fs/ext4/super.c16
-rw-r--r--fs/ext4/sysfs.c2
-rw-r--r--fs/gfs2/glock.c3
-rw-r--r--fs/gfs2/glops.c56
-rw-r--r--fs/gfs2/glops.h1
-rw-r--r--fs/gfs2/inode.c3
-rw-r--r--fs/gfs2/lops.c31
-rw-r--r--fs/gfs2/lops.h2
-rw-r--r--fs/gfs2/ops_fstype.c14
-rw-r--r--fs/gfs2/recovery.c2
-rw-r--r--fs/gfs2/rgrp.c5
-rw-r--r--fs/gfs2/super.c1
-rw-r--r--fs/hfs/btree.h2
-rw-r--r--fs/hfsplus/hfsplus_fs.h2
-rw-r--r--fs/io-wq.c4
-rw-r--r--fs/io_uring.c291
-rw-r--r--fs/iomap/buffered-io.c30
-rw-r--r--fs/isofs/rock.h8
-rw-r--r--fs/proc/base.c2
-rw-r--r--fs/proc/cpuinfo.c2
-rw-r--r--fs/proc/generic.c4
-rw-r--r--fs/proc/inode.c2
-rw-r--r--fs/proc/stat.c2
-rw-r--r--fs/select.c4
-rw-r--r--fs/seq_file.c45
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c1
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h2
-rw-r--r--fs/xfs/scrub/inode.c3
-rw-r--r--fs/xfs/xfs_aops.c20
-rw-r--r--fs/xfs/xfs_iops.c10
-rw-r--r--fs/xfs/xfs_reflink.c3
68 files changed, 866 insertions, 519 deletions
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 52233fa6195f..887b673f6223 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -589,7 +589,7 @@ struct afs_cell *afs_use_cell(struct afs_cell *cell, enum afs_cell_trace reason)
*/
void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell, enum afs_cell_trace reason)
{
- unsigned int debug_id = cell->debug_id;
+ unsigned int debug_id;
time64_t now, expire_delay;
int u, a;
@@ -604,6 +604,7 @@ void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell, enum afs_cell_tr
if (cell->vl_servers->nr_servers)
expire_delay = afs_cell_gc_delay;
+ debug_id = cell->debug_id;
u = atomic_read(&cell->ref);
a = atomic_dec_return(&cell->active);
trace_afs_cell(debug_id, u, a, reason);
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 1d2e61e0ab04..1bb5b9d7f0a2 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -281,8 +281,7 @@ retry:
if (ret < 0)
goto error;
- set_page_private(req->pages[i], 1);
- SetPagePrivate(req->pages[i]);
+ attach_page_private(req->pages[i], (void *)1);
unlock_page(req->pages[i]);
i++;
} else {
@@ -1975,8 +1974,7 @@ static int afs_dir_releasepage(struct page *page, gfp_t gfp_flags)
_enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, page->index);
- set_page_private(page, 0);
- ClearPagePrivate(page);
+ detach_page_private(page);
/* The directory will need reloading. */
if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
@@ -2003,8 +2001,6 @@ static void afs_dir_invalidatepage(struct page *page, unsigned int offset,
afs_stat_v(dvnode, n_inval);
/* we clean up only if the entire page is being invalidated */
- if (offset == 0 && length == PAGE_SIZE) {
- set_page_private(page, 0);
- ClearPagePrivate(page);
- }
+ if (offset == 0 && length == PAGE_SIZE)
+ detach_page_private(page);
}
diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c
index b108528bf010..2ffe09abae7f 100644
--- a/fs/afs/dir_edit.c
+++ b/fs/afs/dir_edit.c
@@ -243,10 +243,8 @@ void afs_edit_dir_add(struct afs_vnode *vnode,
index, gfp);
if (!page)
goto error;
- if (!PagePrivate(page)) {
- set_page_private(page, 1);
- SetPagePrivate(page);
- }
+ if (!PagePrivate(page))
+ attach_page_private(page, (void *)1);
dir_page = kmap(page);
}
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 371d1488cc54..85f5adf21aa0 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -33,6 +33,7 @@ const struct file_operations afs_file_operations = {
.write_iter = afs_file_write,
.mmap = afs_file_mmap,
.splice_read = generic_file_splice_read,
+ .splice_write = iter_file_splice_write,
.fsync = afs_fsync,
.lock = afs_lock,
.flock = afs_flock,
@@ -601,6 +602,63 @@ static int afs_readpages(struct file *file, struct address_space *mapping,
}
/*
+ * Adjust the dirty region of the page on truncation or full invalidation,
+ * getting rid of the markers altogether if the region is entirely invalidated.
+ */
+static void afs_invalidate_dirty(struct page *page, unsigned int offset,
+ unsigned int length)
+{
+ struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
+ unsigned long priv;
+ unsigned int f, t, end = offset + length;
+
+ priv = page_private(page);
+
+ /* we clean up only if the entire page is being invalidated */
+ if (offset == 0 && length == thp_size(page))
+ goto full_invalidate;
+
+ /* If the page was dirtied by page_mkwrite(), the PTE stays writable
+ * and we don't get another notification to tell us to expand it
+ * again.
+ */
+ if (afs_is_page_dirty_mmapped(priv))
+ return;
+
+ /* We may need to shorten the dirty region */
+ f = afs_page_dirty_from(priv);
+ t = afs_page_dirty_to(priv);
+
+ if (t <= offset || f >= end)
+ return; /* Doesn't overlap */
+
+ if (f < offset && t > end)
+ return; /* Splits the dirty region - just absorb it */
+
+ if (f >= offset && t <= end)
+ goto undirty;
+
+ if (f < offset)
+ t = offset;
+ else
+ f = end;
+ if (f == t)
+ goto undirty;
+
+ priv = afs_page_dirty(f, t);
+ set_page_private(page, priv);
+ trace_afs_page_dirty(vnode, tracepoint_string("trunc"), page->index, priv);
+ return;
+
+undirty:
+ trace_afs_page_dirty(vnode, tracepoint_string("undirty"), page->index, priv);
+ clear_page_dirty_for_io(page);
+full_invalidate:
+ priv = (unsigned long)detach_page_private(page);
+ trace_afs_page_dirty(vnode, tracepoint_string("inval"), page->index, priv);
+}
+
+/*
* invalidate part or all of a page
* - release a page and clean up its private data if offset is 0 (indicating
* the entire page)
@@ -608,31 +666,23 @@ static int afs_readpages(struct file *file, struct address_space *mapping,
static void afs_invalidatepage(struct page *page, unsigned int offset,
unsigned int length)
{
- struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
- unsigned long priv;
-
_enter("{%lu},%u,%u", page->index, offset, length);
BUG_ON(!PageLocked(page));
+#ifdef CONFIG_AFS_FSCACHE
/* we clean up only if the entire page is being invalidated */
if (offset == 0 && length == PAGE_SIZE) {
-#ifdef CONFIG_AFS_FSCACHE
if (PageFsCache(page)) {
struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
fscache_wait_on_page_write(vnode->cache, page);
fscache_uncache_page(vnode->cache, page);
}
+ }
#endif
- if (PagePrivate(page)) {
- priv = page_private(page);
- trace_afs_page_dirty(vnode, tracepoint_string("inval"),
- page->index, priv);
- set_page_private(page, 0);
- ClearPagePrivate(page);
- }
- }
+ if (PagePrivate(page))
+ afs_invalidate_dirty(page, offset, length);
_leave("");
}
@@ -660,11 +710,9 @@ static int afs_releasepage(struct page *page, gfp_t gfp_flags)
#endif
if (PagePrivate(page)) {
- priv = page_private(page);
+ priv = (unsigned long)detach_page_private(page);
trace_afs_page_dirty(vnode, tracepoint_string("rel"),
page->index, priv);
- set_page_private(page, 0);
- ClearPagePrivate(page);
}
/* indicate that the page can be released */
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 81b0485fd22a..14d5d75f4b6e 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -812,6 +812,7 @@ struct afs_operation {
pgoff_t last; /* last page in mapping to deal with */
unsigned first_offset; /* offset into mapping[first] */
unsigned last_to; /* amount of mapping[last] */
+ bool laundering; /* Laundering page, PG_writeback not set */
} store;
struct {
struct iattr *attr;
@@ -857,6 +858,62 @@ struct afs_vnode_cache_aux {
u64 data_version;
} __packed;
+/*
+ * We use page->private to hold the amount of the page that we've written to,
+ * splitting the field into two parts. However, we need to represent a range
+ * 0...PAGE_SIZE, so we reduce the resolution if the size of the page
+ * exceeds what we can encode.
+ */
+#ifdef CONFIG_64BIT
+#define __AFS_PAGE_PRIV_MASK 0x7fffffffUL
+#define __AFS_PAGE_PRIV_SHIFT 32
+#define __AFS_PAGE_PRIV_MMAPPED 0x80000000UL
+#else
+#define __AFS_PAGE_PRIV_MASK 0x7fffUL
+#define __AFS_PAGE_PRIV_SHIFT 16
+#define __AFS_PAGE_PRIV_MMAPPED 0x8000UL
+#endif
+
+static inline unsigned int afs_page_dirty_resolution(void)
+{
+ int shift = PAGE_SHIFT - (__AFS_PAGE_PRIV_SHIFT - 1);
+ return (shift > 0) ? shift : 0;
+}
+
+static inline size_t afs_page_dirty_from(unsigned long priv)
+{
+ unsigned long x = priv & __AFS_PAGE_PRIV_MASK;
+
+ /* The lower bound is inclusive */
+ return x << afs_page_dirty_resolution();
+}
+
+static inline size_t afs_page_dirty_to(unsigned long priv)
+{
+ unsigned long x = (priv >> __AFS_PAGE_PRIV_SHIFT) & __AFS_PAGE_PRIV_MASK;
+
+ /* The upper bound is immediately beyond the region */
+ return (x + 1) << afs_page_dirty_resolution();
+}
+
+static inline unsigned long afs_page_dirty(size_t from, size_t to)
+{
+ unsigned int res = afs_page_dirty_resolution();
+ from >>= res;
+ to = (to - 1) >> res;
+ return (to << __AFS_PAGE_PRIV_SHIFT) | from;
+}
+
+static inline unsigned long afs_page_dirty_mmapped(unsigned long priv)
+{
+ return priv | __AFS_PAGE_PRIV_MMAPPED;
+}
+
+static inline bool afs_is_page_dirty_mmapped(unsigned long priv)
+{
+ return priv & __AFS_PAGE_PRIV_MMAPPED;
+}
+
#include <trace/events/afs.h>
/*****************************************************************************/
diff --git a/fs/afs/write.c b/fs/afs/write.c
index da12abd6db21..50371207f327 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -76,7 +76,7 @@ static int afs_fill_page(struct afs_vnode *vnode, struct key *key,
*/
int afs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
+ struct page **_page, void **fsdata)
{
struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
struct page *page;
@@ -90,11 +90,6 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
_enter("{%llx:%llu},{%lx},%u,%u",
vnode->fid.vid, vnode->fid.vnode, index, from, to);
- /* We want to store information about how much of a page is altered in
- * page->private.
- */
- BUILD_BUG_ON(PAGE_SIZE > 32768 && sizeof(page->private) < 8);
-
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
@@ -110,9 +105,6 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
SetPageUptodate(page);
}
- /* page won't leak in error case: it eventually gets cleaned off LRU */
- *pagep = page;
-
try_again:
/* See if this page is already partially written in a way that we can
* merge the new write with.
@@ -120,8 +112,8 @@ try_again:
t = f = 0;
if (PagePrivate(page)) {
priv = page_private(page);
- f = priv & AFS_PRIV_MAX;
- t = priv >> AFS_PRIV_SHIFT;
+ f = afs_page_dirty_from(priv);
+ t = afs_page_dirty_to(priv);
ASSERTCMP(f, <=, t);
}
@@ -138,21 +130,9 @@ try_again:
if (!test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags) &&
(to < f || from > t))
goto flush_conflicting_write;
- if (from < f)
- f = from;
- if (to > t)
- t = to;
- } else {
- f = from;
- t = to;
}
- priv = (unsigned long)t << AFS_PRIV_SHIFT;
- priv |= f;
- trace_afs_page_dirty(vnode, tracepoint_string("begin"),
- page->index, priv);
- SetPagePrivate(page);
- set_page_private(page, priv);
+ *_page = page;
_leave(" = 0");
return 0;
@@ -162,17 +142,18 @@ try_again:
flush_conflicting_write:
_debug("flush conflict");
ret = write_one_page(page);
- if (ret < 0) {
- _leave(" = %d", ret);
- return ret;
- }
+ if (ret < 0)
+ goto error;
ret = lock_page_killable(page);
- if (ret < 0) {
- _leave(" = %d", ret);
- return ret;
- }
+ if (ret < 0)
+ goto error;
goto try_again;
+
+error:
+ put_page(page);
+ _leave(" = %d", ret);
+ return ret;
}
/*
@@ -184,6 +165,9 @@ int afs_write_end(struct file *file, struct address_space *mapping,
{
struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
struct key *key = afs_file_key(file);
+ unsigned long priv;
+ unsigned int f, from = pos & (PAGE_SIZE - 1);
+ unsigned int t, to = from + copied;
loff_t i_size, maybe_i_size;
int ret;
@@ -215,6 +199,25 @@ int afs_write_end(struct file *file, struct address_space *mapping,
SetPageUptodate(page);
}
+ if (PagePrivate(page)) {
+ priv = page_private(page);
+ f = afs_page_dirty_from(priv);
+ t = afs_page_dirty_to(priv);
+ if (from < f)
+ f = from;
+ if (to > t)
+ t = to;
+ priv = afs_page_dirty(f, t);
+ set_page_private(page, priv);
+ trace_afs_page_dirty(vnode, tracepoint_string("dirty+"),
+ page->index, priv);
+ } else {
+ priv = afs_page_dirty(from, to);
+ attach_page_private(page, (void *)priv);
+ trace_afs_page_dirty(vnode, tracepoint_string("dirty"),
+ page->index, priv);
+ }
+
set_page_dirty(page);
if (PageDirty(page))
_debug("dirtied");
@@ -334,10 +337,9 @@ static void afs_pages_written_back(struct afs_vnode *vnode,
ASSERTCMP(pv.nr, ==, count);
for (loop = 0; loop < count; loop++) {
- priv = page_private(pv.pages[loop]);
+ priv = (unsigned long)detach_page_private(pv.pages[loop]);
trace_afs_page_dirty(vnode, tracepoint_string("clear"),
pv.pages[loop]->index, priv);
- set_page_private(pv.pages[loop], 0);
end_page_writeback(pv.pages[loop]);
}
first += count;
@@ -396,7 +398,8 @@ static void afs_store_data_success(struct afs_operation *op)
op->ctime = op->file[0].scb.status.mtime_client;
afs_vnode_commit_status(op, &op->file[0]);
if (op->error == 0) {
- afs_pages_written_back(vnode, op->store.first, op->store.last);
+ if (!op->store.laundering)
+ afs_pages_written_back(vnode, op->store.first, op->store.last);
afs_stat_v(vnode, n_stores);
atomic_long_add((op->store.last * PAGE_SIZE + op->store.last_to) -
(op->store.first * PAGE_SIZE + op->store.first_offset),
@@ -415,7 +418,7 @@ static const struct afs_operation_ops afs_store_data_operation = {
*/
static int afs_store_data(struct address_space *mapping,
pgoff_t first, pgoff_t last,
- unsigned offset, unsigned to)
+ unsigned offset, unsigned to, bool laundering)
{
struct afs_vnode *vnode = AFS_FS_I(mapping->host);
struct afs_operation *op;
@@ -448,6 +451,7 @@ static int afs_store_data(struct address_space *mapping,
op->store.last = last;
op->store.first_offset = offset;
op->store.last_to = to;
+ op->store.laundering = laundering;
op->mtime = vnode->vfs_inode.i_mtime;
op->flags |= AFS_OPERATION_UNINTR;
op->ops = &afs_store_data_operation;
@@ -509,8 +513,8 @@ static int afs_write_back_from_locked_page(struct address_space *mapping,
*/
start = primary_page->index;
priv = page_private(primary_page);
- offset = priv & AFS_PRIV_MAX;
- to = priv >> AFS_PRIV_SHIFT;
+ offset = afs_page_dirty_from(priv);
+ to = afs_page_dirty_to(priv);
trace_afs_page_dirty(vnode, tracepoint_string("store"),
primary_page->index, priv);
@@ -555,8 +559,8 @@ static int afs_write_back_from_locked_page(struct address_space *mapping,
}
priv = page_private(page);
- f = priv & AFS_PRIV_MAX;
- t = priv >> AFS_PRIV_SHIFT;
+ f = afs_page_dirty_from(priv);
+ t = afs_page_dirty_to(priv);
if (f != 0 &&
!test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags)) {
unlock_page(page);
@@ -601,7 +605,7 @@ no_more:
if (end > i_size)
to = i_size & ~PAGE_MASK;
- ret = afs_store_data(mapping, first, last, offset, to);
+ ret = afs_store_data(mapping, first, last, offset, to, false);
switch (ret) {
case 0:
ret = count;
@@ -857,12 +861,14 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
*/
wait_on_page_writeback(vmf->page);
- priv = (unsigned long)PAGE_SIZE << AFS_PRIV_SHIFT; /* To */
- priv |= 0; /* From */
+ priv = afs_page_dirty(0, PAGE_SIZE);
+ priv = afs_page_dirty_mmapped(priv);
trace_afs_page_dirty(vnode, tracepoint_string("mkwrite"),
vmf->page->index, priv);
- SetPagePrivate(vmf->page);
- set_page_private(vmf->page, priv);
+ if (PagePrivate(vmf->page))
+ set_page_private(vmf->page, priv);
+ else
+ attach_page_private(vmf->page, (void *)priv);
file_update_time(file);
sb_end_pagefault(inode->i_sb);
@@ -915,19 +921,18 @@ int afs_launder_page(struct page *page)
f = 0;
t = PAGE_SIZE;
if (PagePrivate(page)) {
- f = priv & AFS_PRIV_MAX;
- t = priv >> AFS_PRIV_SHIFT;
+ f = afs_page_dirty_from(priv);
+ t = afs_page_dirty_to(priv);
}
trace_afs_page_dirty(vnode, tracepoint_string("launder"),
page->index, priv);
- ret = afs_store_data(mapping, page->index, page->index, t, f);
+ ret = afs_store_data(mapping, page->index, page->index, t, f, true);
}
+ priv = (unsigned long)detach_page_private(page);
trace_afs_page_dirty(vnode, tracepoint_string("laundered"),
page->index, priv);
- set_page_private(page, 0);
- ClearPagePrivate(page);
#ifdef CONFIG_AFS_FSCACHE
if (PageFsCache(page)) {
diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c
index 84f3c4f57531..95c573dcda11 100644
--- a/fs/afs/xattr.c
+++ b/fs/afs/xattr.c
@@ -85,7 +85,7 @@ static int afs_xattr_get_acl(const struct xattr_handler *handler,
if (acl->size <= size)
memcpy(buffer, acl->data, acl->size);
else
- op->error = -ERANGE;
+ ret = -ERANGE;
}
}
@@ -148,11 +148,6 @@ static const struct xattr_handler afs_xattr_afs_acl_handler = {
.set = afs_xattr_set_acl,
};
-static void yfs_acl_put(struct afs_operation *op)
-{
- yfs_free_opaque_acl(op->yacl);
-}
-
static const struct afs_operation_ops yfs_fetch_opaque_acl_operation = {
.issue_yfs_rpc = yfs_fs_fetch_opaque_acl,
.success = afs_acl_success,
@@ -246,7 +241,7 @@ error:
static const struct afs_operation_ops yfs_store_opaque_acl2_operation = {
.issue_yfs_rpc = yfs_fs_store_opaque_acl2,
.success = afs_acl_success,
- .put = yfs_acl_put,
+ .put = afs_acl_put,
};
/*
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index 3b1239b7e90d..bd787e71a657 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -1990,6 +1990,7 @@ void yfs_fs_store_opaque_acl2(struct afs_operation *op)
memcpy(bp, acl->data, acl->size);
if (acl->size != size)
memset((void *)bp + acl->size, 0, size - acl->size);
+ bp += size / sizeof(__be32);
yfs_check_req(call, bp);
trace_afs_make_fs_call(call, &vp->fid);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index b6b3d052ca86..fa50e8936f5f 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1690,7 +1690,7 @@ struct elf_thread_core_info {
struct elf_thread_core_info *next;
struct task_struct *task;
struct elf_prstatus prstatus;
- struct memelfnote notes[0];
+ struct memelfnote notes[];
};
struct elf_note_info {
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index b3268f4ea5f3..771a036867dc 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -544,7 +544,18 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
int level = ref->level;
struct btrfs_key search_key = ref->key_for_search;
- root = btrfs_get_fs_root(fs_info, ref->root_id, false);
+ /*
+ * If we're search_commit_root we could possibly be holding locks on
+ * other tree nodes. This happens when qgroups does backref walks when
+ * adding new delayed refs. To deal with this we need to look in cache
+ * for the root, and if we don't find it then we need to search the
+ * tree_root's commit root, thus the btrfs_get_fs_root_commit_root usage
+ * here.
+ */
+ if (path->search_commit_root)
+ root = btrfs_get_fs_root_commit_root(fs_info, path, ref->root_id);
+ else
+ root = btrfs_get_fs_root(fs_info, ref->root_id, false);
if (IS_ERR(root)) {
ret = PTR_ERR(root);
goto out_free;
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index c0f1d6818df7..3ba6f3839d39 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -2024,6 +2024,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
key.offset = 0;
btrfs_release_path(path);
}
+ btrfs_release_path(path);
list_for_each_entry(space_info, &info->space_info, list) {
int i;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index aac3d6f4e35b..0378933d163c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3564,6 +3564,8 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
int btrfs_reada_wait(void *handle);
void btrfs_reada_detach(void *handle);
int btree_readahead_hook(struct extent_buffer *eb, int err);
+void btrfs_reada_remove_dev(struct btrfs_device *dev);
+void btrfs_reada_undo_remove_dev(struct btrfs_device *dev);
static inline int is_fstree(u64 rootid)
{
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 4a0243cb9d97..5b9e3f3ace22 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -688,6 +688,9 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
}
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
+ if (!scrub_ret)
+ btrfs_reada_remove_dev(src_device);
+
/*
* We have to use this loop approach because at this point src_device
* has to be available for transaction commit to complete, yet new
@@ -696,6 +699,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
while (1) {
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
+ btrfs_reada_undo_remove_dev(src_device);
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return PTR_ERR(trans);
}
@@ -746,6 +750,7 @@ error:
up_write(&dev_replace->rwsem);
mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ btrfs_reada_undo_remove_dev(src_device);
btrfs_rm_dev_replace_blocked(fs_info);
if (tgt_device)
btrfs_destroy_dev_replace_tgtdev(tgt_device);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8e3438672a82..af97ddcc6b3e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1281,32 +1281,26 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
return 0;
}
-struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
- struct btrfs_key *key)
+static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
+ struct btrfs_path *path,
+ struct btrfs_key *key)
{
struct btrfs_root *root;
struct btrfs_fs_info *fs_info = tree_root->fs_info;
- struct btrfs_path *path;
u64 generation;
int ret;
int level;
- path = btrfs_alloc_path();
- if (!path)
- return ERR_PTR(-ENOMEM);
-
root = btrfs_alloc_root(fs_info, key->objectid, GFP_NOFS);
- if (!root) {
- ret = -ENOMEM;
- goto alloc_fail;
- }
+ if (!root)
+ return ERR_PTR(-ENOMEM);
ret = btrfs_find_root(tree_root, key, path,
&root->root_item, &root->root_key);
if (ret) {
if (ret > 0)
ret = -ENOENT;
- goto find_fail;
+ goto fail;
}
generation = btrfs_root_generation(&root->root_item);
@@ -1317,21 +1311,31 @@ struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
if (IS_ERR(root->node)) {
ret = PTR_ERR(root->node);
root->node = NULL;
- goto find_fail;
+ goto fail;
} else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
ret = -EIO;
- goto find_fail;
+ goto fail;
}
root->commit_root = btrfs_root_node(root);
-out:
- btrfs_free_path(path);
return root;
-
-find_fail:
+fail:
btrfs_put_root(root);
-alloc_fail:
- root = ERR_PTR(ret);
- goto out;
+ return ERR_PTR(ret);
+}
+
+struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
+ struct btrfs_key *key)
+{
+ struct btrfs_root *root;
+ struct btrfs_path *path;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return ERR_PTR(-ENOMEM);
+ root = read_tree_root_path(tree_root, path, key);
+ btrfs_free_path(path);
+
+ return root;
}
/*
@@ -1419,6 +1423,31 @@ static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
return root;
}
+static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
+ u64 objectid)
+{
+ if (objectid == BTRFS_ROOT_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->tree_root);
+ if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->extent_root);
+ if (objectid == BTRFS_CHUNK_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->chunk_root);
+ if (objectid == BTRFS_DEV_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->dev_root);
+ if (objectid == BTRFS_CSUM_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->csum_root);
+ if (objectid == BTRFS_QUOTA_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->quota_root) ?
+ fs_info->quota_root : ERR_PTR(-ENOENT);
+ if (objectid == BTRFS_UUID_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->uuid_root) ?
+ fs_info->uuid_root : ERR_PTR(-ENOENT);
+ if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->free_space_root) ?
+ fs_info->free_space_root : ERR_PTR(-ENOENT);
+ return NULL;
+}
+
int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root)
{
@@ -1518,25 +1547,9 @@ static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
struct btrfs_key key;
int ret;
- if (objectid == BTRFS_ROOT_TREE_OBJECTID)
- return btrfs_grab_root(fs_info->tree_root);
- if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
- return btrfs_grab_root(fs_info->extent_root);
- if (objectid == BTRFS_CHUNK_TREE_OBJECTID)
- return btrfs_grab_root(fs_info->chunk_root);
- if (objectid == BTRFS_DEV_TREE_OBJECTID)
- return btrfs_grab_root(fs_info->dev_root);
- if (objectid == BTRFS_CSUM_TREE_OBJECTID)
- return btrfs_grab_root(fs_info->csum_root);
- if (objectid == BTRFS_QUOTA_TREE_OBJECTID)
- return btrfs_grab_root(fs_info->quota_root) ?
- fs_info->quota_root : ERR_PTR(-ENOENT);
- if (objectid == BTRFS_UUID_TREE_OBJECTID)
- return btrfs_grab_root(fs_info->uuid_root) ?
- fs_info->uuid_root : ERR_PTR(-ENOENT);
- if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
- return btrfs_grab_root(fs_info->free_space_root) ?
- fs_info->free_space_root : ERR_PTR(-ENOENT);
+ root = btrfs_get_global_root(fs_info, objectid);
+ if (root)
+ return root;
again:
root = btrfs_lookup_fs_root(fs_info, objectid);
if (root) {
@@ -1622,6 +1635,52 @@ struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
}
/*
+ * btrfs_get_fs_root_commit_root - return a root for the given objectid
+ * @fs_info: the fs_info
+ * @objectid: the objectid we need to lookup
+ *
+ * This is exclusively used for backref walking, and exists specifically because
+ * of how qgroups does lookups. Qgroups will do a backref lookup at delayed ref
+ * creation time, which means we may have to read the tree_root in order to look
+ * up a fs root that is not in memory. If the root is not in memory we will
+ * read the tree root commit root and look up the fs root from there. This is a
+ * temporary root, it will not be inserted into the radix tree as it doesn't
+ * have the most uptodate information, it'll simply be discarded once the
+ * backref code is finished using the root.
+ */
+struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ u64 objectid)
+{
+ struct btrfs_root *root;
+ struct btrfs_key key;
+
+ ASSERT(path->search_commit_root && path->skip_locking);
+
+ /*
+ * This can return -ENOENT if we ask for a root that doesn't exist, but
+ * since this is called via the backref walking code we won't be looking
+ * up a root that doesn't exist, unless there's corruption. So if root
+ * != NULL just return it.
+ */
+ root = btrfs_get_global_root(fs_info, objectid);
+ if (root)
+ return root;
+
+ root = btrfs_lookup_fs_root(fs_info, objectid);
+ if (root)
+ return root;
+
+ key.objectid = objectid;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+ root = read_tree_root_path(fs_info->tree_root, path, &key);
+ btrfs_release_path(path);
+
+ return root;
+}
+
+/*
* called by the kthread helper functions to finally call the bio end_io
* functions. This is where read checksum verification actually happens
*/
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index fee69ced58b4..182540bdcea0 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -69,6 +69,9 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
u64 objectid, bool check_ref);
struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
u64 objectid, dev_t anon_dev);
+struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ u64 objectid);
void btrfs_free_fs_info(struct btrfs_fs_info *fs_info);
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3b21fee13e77..5fd60b13f4f8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3185,7 +3185,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_tree_block_info *bi;
if (item_size < sizeof(*ei) + sizeof(*bi)) {
btrfs_crit(info,
-"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %lu",
+"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %zu",
key.objectid, key.type, key.offset,
owner_objectid, item_size,
sizeof(*ei) + sizeof(*bi));
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 0ff659455b1e..87355a38a654 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -3628,7 +3628,8 @@ static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
inode_lock_shared(inode);
ret = btrfs_direct_IO(iocb, to);
inode_unlock_shared(inode);
- if (ret < 0)
+ if (ret < 0 || !iov_iter_count(to) ||
+ iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp)))
return ret;
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 936c3137c646..da58c58ef9aa 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9672,10 +9672,16 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
* clear_offset by our extent size.
*/
clear_offset += ins.offset;
- btrfs_dec_block_group_reservations(fs_info, ins.objectid);
last_alloc = ins.offset;
trans = insert_prealloc_file_extent(trans, inode, &ins, cur_offset);
+ /*
+ * Now that we inserted the prealloc extent we can finally
+ * decrement the number of reservations in the block group.
+ * If we did it before, we could race with relocation and have
+ * relocation miss the reserved extent, making it fail later.
+ */
+ btrfs_dec_block_group_reservations(fs_info, ins.objectid);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
btrfs_free_reserved_extent(fs_info, ins.objectid,
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 580899bdb991..c54ea6586632 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1026,6 +1026,10 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.type == BTRFS_ROOT_REF_KEY) {
+
+ /* Release locks on tree_root before we access quota_root */
+ btrfs_release_path(path);
+
ret = add_qgroup_item(trans, quota_root,
found_key.offset);
if (ret) {
@@ -1044,6 +1048,20 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
+ ret = btrfs_search_slot_for_read(tree_root, &found_key,
+ path, 1, 0);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_free_path;
+ }
+ if (ret > 0) {
+ /*
+ * Shouldn't happen, but in case it does we
+ * don't need to do the btrfs_next_item, just
+ * continue.
+ */
+ continue;
+ }
}
ret = btrfs_next_item(tree_root, path);
if (ret < 0) {
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 9d4f5316a7e8..d9a166eb344e 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -421,6 +421,9 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
if (!dev->bdev)
continue;
+ if (test_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state))
+ continue;
+
if (dev_replace_is_ongoing &&
dev == fs_info->dev_replace.tgtdev) {
/*
@@ -445,6 +448,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
}
have_zone = 1;
}
+ if (!have_zone)
+ radix_tree_delete(&fs_info->reada_tree, index);
spin_unlock(&fs_info->reada_lock);
up_read(&fs_info->dev_replace.rwsem);
@@ -1020,3 +1025,45 @@ void btrfs_reada_detach(void *handle)
kref_put(&rc->refcnt, reada_control_release);
}
+
+/*
+ * Before removing a device (device replace or device remove ioctls), call this
+ * function to wait for all existing readahead requests on the device and to
+ * make sure no one queues more readahead requests for the device.
+ *
+ * Must be called without holding neither the device list mutex nor the device
+ * replace semaphore, otherwise it will deadlock.
+ */
+void btrfs_reada_remove_dev(struct btrfs_device *dev)
+{
+ struct btrfs_fs_info *fs_info = dev->fs_info;
+
+ /* Serialize with readahead extent creation at reada_find_extent(). */
+ spin_lock(&fs_info->reada_lock);
+ set_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state);
+ spin_unlock(&fs_info->reada_lock);
+
+ /*
+ * There might be readahead requests added to the radix trees which
+ * were not yet added to the readahead work queue. We need to start
+ * them and wait for their completion, otherwise we can end up with
+ * use-after-free problems when dropping the last reference on the
+ * readahead extents and their zones, as they need to access the
+ * device structure.
+ */
+ reada_start_machine(fs_info);
+ btrfs_flush_workqueue(fs_info->readahead_workers);
+}
+
+/*
+ * If when removing a device (device replace or device remove ioctls) an error
+ * happens after calling btrfs_reada_remove_dev(), call this to undo what that
+ * function did. This is safe to call even if btrfs_reada_remove_dev() was not
+ * called before.
+ */
+void btrfs_reada_undo_remove_dev(struct btrfs_device *dev)
+{
+ spin_lock(&dev->fs_info->reada_lock);
+ clear_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state);
+ spin_unlock(&dev->fs_info->reada_lock);
+}
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index f0ffd5ee77bd..8784b74f5232 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -760,18 +760,36 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf,
u64 type;
u64 features;
bool mixed = false;
+ int raid_index;
+ int nparity;
+ int ncopies;
length = btrfs_chunk_length(leaf, chunk);
stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
type = btrfs_chunk_type(leaf, chunk);
+ raid_index = btrfs_bg_flags_to_raid_index(type);
+ ncopies = btrfs_raid_array[raid_index].ncopies;
+ nparity = btrfs_raid_array[raid_index].nparity;
if (!num_stripes) {
chunk_err(leaf, chunk, logical,
"invalid chunk num_stripes, have %u", num_stripes);
return -EUCLEAN;
}
+ if (num_stripes < ncopies) {
+ chunk_err(leaf, chunk, logical,
+ "invalid chunk num_stripes < ncopies, have %u < %d",
+ num_stripes, ncopies);
+ return -EUCLEAN;
+ }
+ if (nparity && num_stripes == nparity) {
+ chunk_err(leaf, chunk, logical,
+ "invalid chunk num_stripes == nparity, have %u == %d",
+ num_stripes, nparity);
+ return -EUCLEAN;
+ }
if (!IS_ALIGNED(logical, fs_info->sectorsize)) {
chunk_err(leaf, chunk, logical,
"invalid chunk logical, have %llu should aligned to %u",
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 58b9c419a2b6..b1e48078c318 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -431,7 +431,7 @@ static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info)
atomic_set(&dev->reada_in_flight, 0);
atomic_set(&dev->dev_stats_ccnt, 0);
- btrfs_device_data_ordered_init(dev);
+ btrfs_device_data_ordered_init(dev, fs_info);
INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
extent_io_tree_init(fs_info, &dev->alloc_state,
@@ -2099,6 +2099,8 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
mutex_unlock(&uuid_mutex);
ret = btrfs_shrink_device(device, 0);
+ if (!ret)
+ btrfs_reada_remove_dev(device);
mutex_lock(&uuid_mutex);
if (ret)
goto error_undo;
@@ -2179,6 +2181,7 @@ out:
return ret;
error_undo:
+ btrfs_reada_undo_remove_dev(device);
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
mutex_lock(&fs_info->chunk_mutex);
list_add(&device->dev_alloc_list,
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index bf27ac07d315..232f02bd214f 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -39,10 +39,10 @@ struct btrfs_io_geometry {
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
#include <linux/seqlock.h>
#define __BTRFS_NEED_DEVICE_DATA_ORDERED
-#define btrfs_device_data_ordered_init(device) \
- seqcount_init(&device->data_seqcount)
+#define btrfs_device_data_ordered_init(device, info) \
+ seqcount_mutex_init(&device->data_seqcount, &info->chunk_mutex)
#else
-#define btrfs_device_data_ordered_init(device) do { } while (0)
+#define btrfs_device_data_ordered_init(device, info) do { } while (0)
#endif
#define BTRFS_DEV_STATE_WRITEABLE (0)
@@ -50,6 +50,7 @@ struct btrfs_io_geometry {
#define BTRFS_DEV_STATE_MISSING (2)
#define BTRFS_DEV_STATE_REPLACE_TGT (3)
#define BTRFS_DEV_STATE_FLUSH_SENT (4)
+#define BTRFS_DEV_STATE_NO_READA (5)
struct btrfs_device {
struct list_head dev_list; /* device_list_mutex */
@@ -71,7 +72,8 @@ struct btrfs_device {
blk_status_t last_flush_error;
#ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED
- seqcount_t data_seqcount;
+ /* A seqcount_t with associated chunk_mutex (for lockdep) */
+ seqcount_mutex_t data_seqcount;
#endif
/* the internal btrfs device id */
@@ -162,11 +164,9 @@ btrfs_device_get_##name(const struct btrfs_device *dev) \
static inline void \
btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \
{ \
- preempt_disable(); \
write_seqcount_begin(&dev->data_seqcount); \
dev->name = size; \
write_seqcount_end(&dev->data_seqcount); \
- preempt_enable(); \
}
#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION)
#define BTRFS_DEVICE_GETSET_FUNCS(name) \
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 3080cda9e824..8bda092e60c5 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -121,7 +121,7 @@ static int cachefiles_read_reissue(struct cachefiles_object *object,
_debug("reissue read");
ret = bmapping->a_ops->readpage(NULL, backpage);
if (ret < 0)
- goto unlock_discard;
+ goto discard;
}
/* but the page may have been read before the monitor was installed, so
@@ -138,6 +138,7 @@ static int cachefiles_read_reissue(struct cachefiles_object *object,
unlock_discard:
unlock_page(backpage);
+discard:
spin_lock_irq(&object->work_lock);
list_del(&monitor->op_link);
spin_unlock_irq(&object->work_lock);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 5027bbdca419..ded4229c314a 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -4074,7 +4074,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
vino.snap, inode);
mutex_lock(&session->s_mutex);
- session->s_seq++;
+ inc_session_sequence(session);
dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
(unsigned)seq);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 08f1c0c31dc2..8f1d7500a7ec 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -4231,7 +4231,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
dname.len, dname.name);
mutex_lock(&session->s_mutex);
- session->s_seq++;
+ inc_session_sequence(session);
if (!inode) {
dout("handle_lease no inode %llx\n", vino.ino);
@@ -4385,29 +4385,49 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc)
bool check_session_state(struct ceph_mds_session *s)
{
- if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
- dout("resending session close request for mds%d\n",
- s->s_mds);
- request_close_session(s);
- return false;
- }
- if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
- if (s->s_state == CEPH_MDS_SESSION_OPEN) {
+ switch (s->s_state) {
+ case CEPH_MDS_SESSION_OPEN:
+ if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
s->s_state = CEPH_MDS_SESSION_HUNG;
pr_info("mds%d hung\n", s->s_mds);
}
- }
- if (s->s_state == CEPH_MDS_SESSION_NEW ||
- s->s_state == CEPH_MDS_SESSION_RESTARTING ||
- s->s_state == CEPH_MDS_SESSION_CLOSED ||
- s->s_state == CEPH_MDS_SESSION_REJECTED)
- /* this mds is failed or recovering, just wait */
+ break;
+ case CEPH_MDS_SESSION_CLOSING:
+ /* Should never reach this when we're unmounting */
+ WARN_ON_ONCE(true);
+ fallthrough;
+ case CEPH_MDS_SESSION_NEW:
+ case CEPH_MDS_SESSION_RESTARTING:
+ case CEPH_MDS_SESSION_CLOSED:
+ case CEPH_MDS_SESSION_REJECTED:
return false;
+ }
return true;
}
/*
+ * If the sequence is incremented while we're waiting on a REQUEST_CLOSE reply,
+ * then we need to retransmit that request.
+ */
+void inc_session_sequence(struct ceph_mds_session *s)
+{
+ lockdep_assert_held(&s->s_mutex);
+
+ s->s_seq++;
+
+ if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
+ int ret;
+
+ dout("resending session close request for mds%d\n", s->s_mds);
+ ret = request_close_session(s);
+ if (ret < 0)
+ pr_err("unable to close session to mds%d: %d\n",
+ s->s_mds, ret);
+ }
+}
+
+/*
* delayed work -- periodically trim expired leases, renew caps with mds
*/
static void schedule_delayed(struct ceph_mds_client *mdsc)
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index cbf8af437140..f5adbebcb38e 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -480,6 +480,7 @@ struct ceph_mds_client {
extern const char *ceph_mds_op_name(int op);
extern bool check_session_state(struct ceph_mds_session *s);
+void inc_session_sequence(struct ceph_mds_session *s);
extern struct ceph_mds_session *
__ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
index 83cb4f26b689..9b785f11e95a 100644
--- a/fs/ceph/quota.c
+++ b/fs/ceph/quota.c
@@ -53,7 +53,7 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc,
/* increment msg sequence number */
mutex_lock(&session->s_mutex);
- session->s_seq++;
+ inc_session_sequence(session);
mutex_unlock(&session->s_mutex);
/* lookup inode */
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 0da39c16dab4..b611f829cb61 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -873,7 +873,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
ceph_snap_op_name(op), split, trace_len);
mutex_lock(&session->s_mutex);
- session->s_seq++;
+ inc_session_sequence(session);
mutex_unlock(&session->s_mutex);
down_write(&mdsc->snap_rwsem);
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index a768a09430c3..686e0ad28788 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -1127,24 +1127,23 @@ static const struct file_operations debugfs_devm_entry_ops = {
* file will be created in the root of the debugfs filesystem.
* @read_fn: function pointer called to print the seq_file content.
*/
-struct dentry *debugfs_create_devm_seqfile(struct device *dev, const char *name,
- struct dentry *parent,
- int (*read_fn)(struct seq_file *s,
- void *data))
+void debugfs_create_devm_seqfile(struct device *dev, const char *name,
+ struct dentry *parent,
+ int (*read_fn)(struct seq_file *s, void *data))
{
struct debugfs_devm_entry *entry;
if (IS_ERR(parent))
- return ERR_PTR(-ENOENT);
+ return;
entry = devm_kzalloc(dev, sizeof(*entry), GFP_KERNEL);
if (!entry)
- return ERR_PTR(-ENOMEM);
+ return;
entry->read = read_fn;
entry->dev = dev;
- return debugfs_create_file(name, S_IRUGO, parent, entry,
- &debugfs_devm_entry_ops);
+ debugfs_create_file(name, S_IRUGO, parent, entry,
+ &debugfs_devm_entry_ops);
}
EXPORT_SYMBOL_GPL(debugfs_create_devm_seqfile);
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 5b81f3b080ee..ca50c90adc4c 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -669,68 +669,8 @@ const struct file_operations ext4_dir_operations = {
};
#ifdef CONFIG_UNICODE
-static int ext4_d_compare(const struct dentry *dentry, unsigned int len,
- const char *str, const struct qstr *name)
-{
- struct qstr qstr = {.name = str, .len = len };
- const struct dentry *parent = READ_ONCE(dentry->d_parent);
- const struct inode *inode = d_inode_rcu(parent);
- char strbuf[DNAME_INLINE_LEN];
-
- if (!inode || !IS_CASEFOLDED(inode) ||
- !EXT4_SB(inode->i_sb)->s_encoding) {
- if (len != name->len)
- return -1;
- return memcmp(str, name->name, len);
- }
-
- /*
- * If the dentry name is stored in-line, then it may be concurrently
- * modified by a rename. If this happens, the VFS will eventually retry
- * the lookup, so it doesn't matter what ->d_compare() returns.
- * However, it's unsafe to call utf8_strncasecmp() with an unstable
- * string. Therefore, we have to copy the name into a temporary buffer.
- */
- if (len <= DNAME_INLINE_LEN - 1) {
- memcpy(strbuf, str, len);
- strbuf[len] = 0;
- qstr.name = strbuf;
- /* prevent compiler from optimizing out the temporary buffer */
- barrier();
- }
-
- return ext4_ci_compare(inode, name, &qstr, false);
-}
-
-static int ext4_d_hash(const struct dentry *dentry, struct qstr *str)
-{
- const struct ext4_sb_info *sbi = EXT4_SB(dentry->d_sb);
- const struct unicode_map *um = sbi->s_encoding;
- const struct inode *inode = d_inode_rcu(dentry);
- unsigned char *norm;
- int len, ret = 0;
-
- if (!inode || !IS_CASEFOLDED(inode) || !um)
- return 0;
-
- norm = kmalloc(PATH_MAX, GFP_ATOMIC);
- if (!norm)
- return -ENOMEM;
-
- len = utf8_casefold(um, str, norm, PATH_MAX);
- if (len < 0) {
- if (ext4_has_strict_mode(sbi))
- ret = -EINVAL;
- goto out;
- }
- str->hash = full_name_hash(dentry, norm, len);
-out:
- kfree(norm);
- return ret;
-}
-
const struct dentry_operations ext4_dentry_ops = {
- .d_hash = ext4_d_hash,
- .d_compare = ext4_d_compare,
+ .d_hash = generic_ci_d_hash,
+ .d_compare = generic_ci_d_compare,
};
#endif
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 254d1c26bea8..45fcdbf538d1 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1166,10 +1166,6 @@ struct ext4_inode_info {
#define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */
#define EXT4_ERROR_FS 0x0002 /* Errors detected */
#define EXT4_ORPHAN_FS 0x0004 /* Orphans being recovered */
-#define EXT4_FC_INELIGIBLE 0x0008 /* Fast commit ineligible */
-#define EXT4_FC_COMMITTING 0x0010 /* File system underoing a fast
- * commit.
- */
#define EXT4_FC_REPLAY 0x0020 /* Fast commit replay ongoing */
/*
@@ -1431,6 +1427,10 @@ struct ext4_super_block {
*/
#define EXT4_MF_MNTDIR_SAMPLED 0x0001
#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */
+#define EXT4_MF_FC_INELIGIBLE 0x0004 /* Fast commit ineligible */
+#define EXT4_MF_FC_COMMITTING 0x0008 /* File system underoing a fast
+ * commit.
+ */
#ifdef CONFIG_FS_ENCRYPTION
#define DUMMY_ENCRYPTION_ENABLED(sbi) ((sbi)->s_dummy_enc_policy.policy != NULL)
@@ -1444,14 +1444,6 @@ struct ext4_super_block {
#define EXT4_ENC_UTF8_12_1 1
/*
- * Flags for ext4_sb_info.s_encoding_flags.
- */
-#define EXT4_ENC_STRICT_MODE_FL (1 << 0)
-
-#define ext4_has_strict_mode(sbi) \
- (sbi->s_encoding_flags & EXT4_ENC_STRICT_MODE_FL)
-
-/*
* fourth extended-fs super-block data in memory
*/
struct ext4_sb_info {
@@ -1500,10 +1492,6 @@ struct ext4_sb_info {
struct kobject s_kobj;
struct completion s_kobj_unregister;
struct super_block *s_sb;
-#ifdef CONFIG_UNICODE
- struct unicode_map *s_encoding;
- __u16 s_encoding_flags;
-#endif
/* Journaling */
struct journal_s *s_journal;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 559100f3e23c..57cfa28919a9 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1471,16 +1471,16 @@ static int ext4_ext_search_left(struct inode *inode,
}
/*
- * search the closest allocated block to the right for *logical
- * and returns it at @logical + it's physical address at @phys
- * if *logical is the largest allocated block, the function
- * returns 0 at @phys
- * return value contains 0 (success) or error code
+ * Search the closest allocated block to the right for *logical
+ * and returns it at @logical + it's physical address at @phys.
+ * If not exists, return 0 and @phys is set to 0. We will return
+ * 1 which means we found an allocated block and ret_ex is valid.
+ * Or return a (< 0) error code.
*/
static int ext4_ext_search_right(struct inode *inode,
struct ext4_ext_path *path,
ext4_lblk_t *logical, ext4_fsblk_t *phys,
- struct ext4_extent **ret_ex)
+ struct ext4_extent *ret_ex)
{
struct buffer_head *bh = NULL;
struct ext4_extent_header *eh;
@@ -1574,10 +1574,11 @@ got_index:
found_extent:
*logical = le32_to_cpu(ex->ee_block);
*phys = ext4_ext_pblock(ex);
- *ret_ex = ex;
+ if (ret_ex)
+ *ret_ex = *ex;
if (bh)
put_bh(bh);
- return 0;
+ return 1;
}
/*
@@ -2868,8 +2869,8 @@ again:
*/
lblk = ex_end + 1;
err = ext4_ext_search_right(inode, path, &lblk, &pblk,
- &ex);
- if (err)
+ NULL);
+ if (err < 0)
goto out;
if (pblk) {
partial.pclu = EXT4_B2C(sbi, pblk);
@@ -4039,7 +4040,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags)
{
struct ext4_ext_path *path = NULL;
- struct ext4_extent newex, *ex, *ex2;
+ struct ext4_extent newex, *ex, ex2;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
ext4_fsblk_t newblock = 0, pblk;
int err = 0, depth, ret;
@@ -4175,15 +4176,14 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
if (err)
goto out;
ar.lright = map->m_lblk;
- ex2 = NULL;
err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
- if (err)
+ if (err < 0)
goto out;
/* Check if the extent after searching to the right implies a
* cluster we can use. */
- if ((sbi->s_cluster_ratio > 1) && ex2 &&
- get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) {
+ if ((sbi->s_cluster_ratio > 1) && err &&
+ get_implied_cluster_alloc(inode->i_sb, map, &ex2, path)) {
ar.len = allocated = map->m_len;
newblock = map->m_pblk;
goto got_allocated_blocks;
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 447c8d93f480..8d43058386c3 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -269,7 +269,7 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
return;
- sbi->s_mount_state |= EXT4_FC_INELIGIBLE;
+ sbi->s_mount_flags |= EXT4_MF_FC_INELIGIBLE;
WARN_ON(reason >= EXT4_FC_REASON_MAX);
sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}
@@ -292,7 +292,7 @@ void ext4_fc_start_ineligible(struct super_block *sb, int reason)
}
/*
- * Stop a fast commit ineligible update. We set EXT4_FC_INELIGIBLE flag here
+ * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
* to ensure that after stopping the ineligible update, at least one full
* commit takes place.
*/
@@ -302,13 +302,13 @@ void ext4_fc_stop_ineligible(struct super_block *sb)
(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
return;
- EXT4_SB(sb)->s_mount_state |= EXT4_FC_INELIGIBLE;
+ EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FC_INELIGIBLE;
atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
}
static inline int ext4_fc_is_ineligible(struct super_block *sb)
{
- return (EXT4_SB(sb)->s_mount_state & EXT4_FC_INELIGIBLE) ||
+ return (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FC_INELIGIBLE) ||
atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates);
}
@@ -358,7 +358,7 @@ static int ext4_fc_track_template(
spin_lock(&sbi->s_fc_lock);
if (list_empty(&EXT4_I(inode)->i_fc_list))
list_add_tail(&EXT4_I(inode)->i_fc_list,
- (sbi->s_mount_state & EXT4_FC_COMMITTING) ?
+ (sbi->s_mount_flags & EXT4_MF_FC_COMMITTING) ?
&sbi->s_fc_q[FC_Q_STAGING] :
&sbi->s_fc_q[FC_Q_MAIN]);
spin_unlock(&sbi->s_fc_lock);
@@ -411,7 +411,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update)
node->fcd_name.len = dentry->d_name.len;
spin_lock(&sbi->s_fc_lock);
- if (sbi->s_mount_state & EXT4_FC_COMMITTING)
+ if (sbi->s_mount_flags & EXT4_MF_FC_COMMITTING)
list_add_tail(&node->fcd_list,
&sbi->s_fc_dentry_q[FC_Q_STAGING]);
else
@@ -846,7 +846,7 @@ static int ext4_fc_submit_inode_data_all(journal_t *journal)
int ret = 0;
spin_lock(&sbi->s_fc_lock);
- sbi->s_mount_state |= EXT4_FC_COMMITTING;
+ sbi->s_mount_flags |= EXT4_MF_FC_COMMITTING;
list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
@@ -964,7 +964,6 @@ static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
fc_dentry->fcd_parent, fc_dentry->fcd_ino,
fc_dentry->fcd_name.len,
fc_dentry->fcd_name.name, crc)) {
- spin_lock(&sbi->s_fc_lock);
ret = -ENOSPC;
goto lock_and_exit;
}
@@ -1191,8 +1190,8 @@ static void ext4_fc_cleanup(journal_t *journal, int full)
list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
&sbi->s_fc_q[FC_Q_STAGING]);
- sbi->s_mount_state &= ~EXT4_FC_COMMITTING;
- sbi->s_mount_state &= ~EXT4_FC_INELIGIBLE;
+ sbi->s_mount_flags &= ~EXT4_MF_FC_COMMITTING;
+ sbi->s_mount_flags &= ~EXT4_MF_FC_INELIGIBLE;
if (full)
sbi->s_fc_bytes = 0;
@@ -1617,8 +1616,10 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
if (ret == 0) {
/* Range is not mapped */
path = ext4_find_extent(inode, cur, NULL, 0);
- if (!path)
- continue;
+ if (IS_ERR(path)) {
+ iput(inode);
+ return 0;
+ }
memset(&newex, 0, sizeof(newex));
newex.ee_block = cpu_to_le32(cur);
ext4_ext_store_pblock(
@@ -2078,6 +2079,8 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
void ext4_fc_init(struct super_block *sb, journal_t *journal)
{
+ int num_fc_blocks;
+
/*
* We set replay callback even if fast commit disabled because we may
* could still have fast commit blocks that need to be replayed even if
@@ -2087,7 +2090,15 @@ void ext4_fc_init(struct super_block *sb, journal_t *journal)
if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
return;
journal->j_fc_cleanup_callback = ext4_fc_cleanup;
- if (jbd2_fc_init(journal, EXT4_NUM_FC_BLKS)) {
+ if (!buffer_uptodate(journal->j_sb_buffer)
+ && ext4_read_bh_lock(journal->j_sb_buffer, REQ_META | REQ_PRIO,
+ true)) {
+ ext4_msg(sb, KERN_ERR, "I/O error on journal");
+ return;
+ }
+ num_fc_blocks = be32_to_cpu(journal->j_superblock->s_num_fc_blks);
+ if (jbd2_fc_init(journal, num_fc_blocks ? num_fc_blocks :
+ EXT4_NUM_FC_BLKS)) {
pr_warn("Error while enabling fast commits, turning off.");
ext4_clear_feature_fast_commit(sb);
}
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index 2924261226e0..a92eb79de0cc 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -275,7 +275,7 @@ int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
struct dx_hash_info *hinfo)
{
#ifdef CONFIG_UNICODE
- const struct unicode_map *um = EXT4_SB(dir->i_sb)->s_encoding;
+ const struct unicode_map *um = dir->i_sb->s_encoding;
int r, dlen;
unsigned char *buff;
struct qstr qstr = {.name = name, .len = len };
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 03c2253005f0..b96a18679a27 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1918,7 +1918,7 @@ static int __ext4_journalled_writepage(struct page *page,
}
if (ret == 0)
ret = err;
- err = ext4_jbd2_inode_add_write(handle, inode, 0, len);
+ err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len);
if (ret == 0)
ret = err;
EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
@@ -3307,10 +3307,12 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)
if (journal) {
if (jbd2_transaction_committed(journal,
- EXT4_I(inode)->i_datasync_tid))
- return true;
- return atomic_read(&EXT4_SB(inode->i_sb)->s_fc_subtid) >=
- EXT4_I(inode)->i_fc_committed_subtid;
+ EXT4_I(inode)->i_datasync_tid))
+ return false;
+ if (test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT))
+ return atomic_read(&EXT4_SB(inode->i_sb)->s_fc_subtid) <
+ EXT4_I(inode)->i_fc_committed_subtid;
+ return true;
}
/* Any metadata buffers to write? */
@@ -6157,7 +6159,8 @@ retry_alloc:
if (ext4_walk_page_buffers(handle, page_buffers(page),
0, len, NULL, write_end_fn))
goto out_error;
- if (ext4_jbd2_inode_add_write(handle, inode, 0, len))
+ if (ext4_jbd2_inode_add_write(handle, inode,
+ page_offset(page), len))
goto out_error;
ext4_set_inode_state(inode, EXT4_STATE_JDATA);
} else {
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 5159830dacb8..f458d1d81d96 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1285,8 +1285,8 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
int ext4_ci_compare(const struct inode *parent, const struct qstr *name,
const struct qstr *entry, bool quick)
{
- const struct ext4_sb_info *sbi = EXT4_SB(parent->i_sb);
- const struct unicode_map *um = sbi->s_encoding;
+ const struct super_block *sb = parent->i_sb;
+ const struct unicode_map *um = sb->s_encoding;
int ret;
if (quick)
@@ -1298,7 +1298,7 @@ int ext4_ci_compare(const struct inode *parent, const struct qstr *name,
/* Handle invalid character sequence as either an error
* or as an opaque byte sequence.
*/
- if (ext4_has_strict_mode(sbi))
+ if (sb_has_strict_encoding(sb))
return -EINVAL;
if (name->len != entry->len)
@@ -1315,7 +1315,7 @@ void ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname,
{
int len;
- if (!IS_CASEFOLDED(dir) || !EXT4_SB(dir->i_sb)->s_encoding) {
+ if (!IS_CASEFOLDED(dir) || !dir->i_sb->s_encoding) {
cf_name->name = NULL;
return;
}
@@ -1324,7 +1324,7 @@ void ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname,
if (!cf_name->name)
return;
- len = utf8_casefold(EXT4_SB(dir->i_sb)->s_encoding,
+ len = utf8_casefold(dir->i_sb->s_encoding,
iname, cf_name->name,
EXT4_NAME_LEN);
if (len <= 0) {
@@ -1361,7 +1361,7 @@ static inline bool ext4_match(const struct inode *parent,
#endif
#ifdef CONFIG_UNICODE
- if (EXT4_SB(parent->i_sb)->s_encoding && IS_CASEFOLDED(parent)) {
+ if (parent->i_sb->s_encoding && IS_CASEFOLDED(parent)) {
if (fname->cf_name.name) {
struct qstr cf = {.name = fname->cf_name.name,
.len = fname->cf_name.len};
@@ -2180,9 +2180,6 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
struct buffer_head *bh = NULL;
struct ext4_dir_entry_2 *de;
struct super_block *sb;
-#ifdef CONFIG_UNICODE
- struct ext4_sb_info *sbi;
-#endif
struct ext4_filename fname;
int retval;
int dx_fallback=0;
@@ -2199,9 +2196,8 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
return -EINVAL;
#ifdef CONFIG_UNICODE
- sbi = EXT4_SB(sb);
- if (ext4_has_strict_mode(sbi) && IS_CASEFOLDED(dir) &&
- sbi->s_encoding && utf8_validate(sbi->s_encoding, &dentry->d_name))
+ if (sb_has_strict_encoding(sb) && IS_CASEFOLDED(dir) &&
+ sb->s_encoding && utf8_validate(sb->s_encoding, &dentry->d_name))
return -EINVAL;
#endif
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 2fe141ff3c7e..ef4734b40e2a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1288,7 +1288,7 @@ static void ext4_put_super(struct super_block *sb)
fs_put_dax(sbi->s_daxdev);
fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
#ifdef CONFIG_UNICODE
- utf8_unload(sbi->s_encoding);
+ utf8_unload(sb->s_encoding);
#endif
kfree(sbi);
}
@@ -4303,7 +4303,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
#ifdef CONFIG_UNICODE
- if (ext4_has_feature_casefold(sb) && !sbi->s_encoding) {
+ if (ext4_has_feature_casefold(sb) && !sb->s_encoding) {
const struct ext4_sb_encodings *encoding_info;
struct unicode_map *encoding;
__u16 encoding_flags;
@@ -4334,8 +4334,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
"%s-%s with flags 0x%hx", encoding_info->name,
encoding_info->version?:"\b", encoding_flags);
- sbi->s_encoding = encoding;
- sbi->s_encoding_flags = encoding_flags;
+ sb->s_encoding = encoding;
+ sb->s_encoding_flags = encoding_flags;
}
#endif
@@ -4777,8 +4777,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]);
INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]);
sbi->s_fc_bytes = 0;
- sbi->s_mount_state &= ~EXT4_FC_INELIGIBLE;
- sbi->s_mount_state &= ~EXT4_FC_COMMITTING;
+ sbi->s_mount_flags &= ~EXT4_MF_FC_INELIGIBLE;
+ sbi->s_mount_flags &= ~EXT4_MF_FC_COMMITTING;
spin_lock_init(&sbi->s_fc_lock);
memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats));
sbi->s_fc_replay_state.fc_regions = NULL;
@@ -4975,7 +4975,7 @@ no_journal:
}
#ifdef CONFIG_UNICODE
- if (sbi->s_encoding)
+ if (sb->s_encoding)
sb->s_d_op = &ext4_dentry_ops;
#endif
@@ -5184,7 +5184,7 @@ failed_mount:
crypto_free_shash(sbi->s_chksum_driver);
#ifdef CONFIG_UNICODE
- utf8_unload(sbi->s_encoding);
+ utf8_unload(sb->s_encoding);
#endif
#ifdef CONFIG_QUOTA
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 5ff33d18996a..4e27fe6ed3ae 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -315,6 +315,7 @@ EXT4_ATTR_FEATURE(casefold);
EXT4_ATTR_FEATURE(verity);
#endif
EXT4_ATTR_FEATURE(metadata_csum_seed);
+EXT4_ATTR_FEATURE(fast_commit);
static struct attribute *ext4_feat_attrs[] = {
ATTR_LIST(lazy_itable_init),
@@ -331,6 +332,7 @@ static struct attribute *ext4_feat_attrs[] = {
ATTR_LIST(verity),
#endif
ATTR_LIST(metadata_csum_seed),
+ ATTR_LIST(fast_commit),
NULL,
};
ATTRIBUTE_GROUPS(ext4_feat);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 5441c17562c5..d98a2e5dab9f 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1078,7 +1078,8 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
out_free:
kfree(gl->gl_lksb.sb_lvbptr);
kmem_cache_free(cachep, gl);
- atomic_dec(&sdp->sd_glock_disposal);
+ if (atomic_dec_and_test(&sdp->sd_glock_disposal))
+ wake_up(&sdp->sd_glock_wait);
out:
return ret;
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index aa3f5236befb..6c1432d78dce 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -165,6 +165,31 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
}
/**
+ * gfs2_rgrp_metasync - sync out the metadata of a resource group
+ * @gl: the glock protecting the resource group
+ *
+ */
+
+static int gfs2_rgrp_metasync(struct gfs2_glock *gl)
+{
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
+ struct address_space *metamapping = &sdp->sd_aspace;
+ struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl);
+ const unsigned bsize = sdp->sd_sb.sb_bsize;
+ loff_t start = (rgd->rd_addr * bsize) & PAGE_MASK;
+ loff_t end = PAGE_ALIGN((rgd->rd_addr + rgd->rd_length) * bsize) - 1;
+ int error;
+
+ filemap_fdatawrite_range(metamapping, start, end);
+ error = filemap_fdatawait_range(metamapping, start, end);
+ WARN_ON_ONCE(error && !gfs2_withdrawn(sdp));
+ mapping_set_error(metamapping, error);
+ if (error)
+ gfs2_io_error(sdp);
+ return error;
+}
+
+/**
* rgrp_go_sync - sync out the metadata for this glock
* @gl: the glock
*
@@ -176,11 +201,7 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
static int rgrp_go_sync(struct gfs2_glock *gl)
{
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
- struct address_space *mapping = &sdp->sd_aspace;
struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl);
- const unsigned bsize = sdp->sd_sb.sb_bsize;
- loff_t start = (rgd->rd_addr * bsize) & PAGE_MASK;
- loff_t end = PAGE_ALIGN((rgd->rd_addr + rgd->rd_length) * bsize) - 1;
int error;
if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
@@ -189,10 +210,7 @@ static int rgrp_go_sync(struct gfs2_glock *gl)
gfs2_log_flush(sdp, gl, GFS2_LOG_HEAD_FLUSH_NORMAL |
GFS2_LFC_RGRP_GO_SYNC);
- filemap_fdatawrite_range(mapping, start, end);
- error = filemap_fdatawait_range(mapping, start, end);
- WARN_ON_ONCE(error && !gfs2_withdrawn(sdp));
- mapping_set_error(mapping, error);
+ error = gfs2_rgrp_metasync(gl);
if (!error)
error = gfs2_ail_empty_gl(gl);
gfs2_free_clones(rgd);
@@ -266,7 +284,24 @@ static void gfs2_clear_glop_pending(struct gfs2_inode *ip)
}
/**
- * inode_go_sync - Sync the dirty data and/or metadata for an inode glock
+ * gfs2_inode_metasync - sync out the metadata of an inode
+ * @gl: the glock protecting the inode
+ *
+ */
+int gfs2_inode_metasync(struct gfs2_glock *gl)
+{
+ struct address_space *metamapping = gfs2_glock2aspace(gl);
+ int error;
+
+ filemap_fdatawrite(metamapping);
+ error = filemap_fdatawait(metamapping);
+ if (error)
+ gfs2_io_error(gl->gl_name.ln_sbd);
+ return error;
+}
+
+/**
+ * inode_go_sync - Sync the dirty metadata of an inode
* @gl: the glock protecting the inode
*
*/
@@ -297,8 +332,7 @@ static int inode_go_sync(struct gfs2_glock *gl)
error = filemap_fdatawait(mapping);
mapping_set_error(mapping, error);
}
- ret = filemap_fdatawait(metamapping);
- mapping_set_error(metamapping, ret);
+ ret = gfs2_inode_metasync(gl);
if (!error)
error = ret;
gfs2_ail_empty_gl(gl);
diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h
index 2dd192e85618..695898afcaf1 100644
--- a/fs/gfs2/glops.h
+++ b/fs/gfs2/glops.h
@@ -22,6 +22,7 @@ extern const struct gfs2_glock_operations gfs2_quota_glops;
extern const struct gfs2_glock_operations gfs2_journal_glops;
extern const struct gfs2_glock_operations *gfs2_glops_list[];
+extern int gfs2_inode_metasync(struct gfs2_glock *gl);
extern void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync);
#endif /* __GLOPS_DOT_H__ */
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 6774865f5b5b..077ccb1b3ccc 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -180,7 +180,8 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
if (unlikely(error))
goto fail;
- gfs2_cancel_delete_work(ip->i_iopen_gh.gh_gl);
+ if (blktype != GFS2_BLKST_UNLINKED)
+ gfs2_cancel_delete_work(ip->i_iopen_gh.gh_gl);
glock_set_object(ip->i_iopen_gh.gh_gl, ip);
gfs2_glock_put(io_gl);
io_gl = NULL;
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index ed69298dd824..3922b26264f5 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -22,6 +22,7 @@
#include "incore.h"
#include "inode.h"
#include "glock.h"
+#include "glops.h"
#include "log.h"
#include "lops.h"
#include "meta_io.h"
@@ -817,41 +818,19 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, u32 start,
return error;
}
-/**
- * gfs2_meta_sync - Sync all buffers associated with a glock
- * @gl: The glock
- *
- */
-
-void gfs2_meta_sync(struct gfs2_glock *gl)
-{
- struct address_space *mapping = gfs2_glock2aspace(gl);
- struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
- int error;
-
- if (mapping == NULL)
- mapping = &sdp->sd_aspace;
-
- filemap_fdatawrite(mapping);
- error = filemap_fdatawait(mapping);
-
- if (error)
- gfs2_io_error(gl->gl_name.ln_sbd);
-}
-
static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
{
struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
if (error) {
- gfs2_meta_sync(ip->i_gl);
+ gfs2_inode_metasync(ip->i_gl);
return;
}
if (pass != 1)
return;
- gfs2_meta_sync(ip->i_gl);
+ gfs2_inode_metasync(ip->i_gl);
fs_info(sdp, "jid=%u: Replayed %u of %u blocks\n",
jd->jd_jid, jd->jd_replayed_blocks, jd->jd_found_blocks);
@@ -1060,14 +1039,14 @@ static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
if (error) {
- gfs2_meta_sync(ip->i_gl);
+ gfs2_inode_metasync(ip->i_gl);
return;
}
if (pass != 1)
return;
/* data sync? */
- gfs2_meta_sync(ip->i_gl);
+ gfs2_inode_metasync(ip->i_gl);
fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n",
jd->jd_jid, jd->jd_replayed_blocks, jd->jd_found_blocks);
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
index 4a3d8aecdf82..fbdbb08dcec6 100644
--- a/fs/gfs2/lops.h
+++ b/fs/gfs2/lops.h
@@ -27,8 +27,6 @@ extern void gfs2_log_submit_bio(struct bio **biop, int opf);
extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh);
extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
struct gfs2_log_header_host *head, bool keep_cache);
-extern void gfs2_meta_sync(struct gfs2_glock *gl);
-
static inline unsigned int buf_limit(struct gfs2_sbd *sdp)
{
unsigned int limit;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 7a7e3c10a9a9..61fce59cb4d3 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -633,8 +633,10 @@ static int init_statfs(struct gfs2_sbd *sdp)
if (IS_ERR(sdp->sd_statfs_inode)) {
error = PTR_ERR(sdp->sd_statfs_inode);
fs_err(sdp, "can't read in statfs inode: %d\n", error);
- goto fail;
+ goto out;
}
+ if (sdp->sd_args.ar_spectator)
+ goto out;
pn = gfs2_lookup_simple(master, "per_node");
if (IS_ERR(pn)) {
@@ -682,15 +684,17 @@ free_local:
iput(pn);
put_statfs:
iput(sdp->sd_statfs_inode);
-fail:
+out:
return error;
}
/* Uninitialize and free up memory used by the list of statfs inodes */
static void uninit_statfs(struct gfs2_sbd *sdp)
{
- gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
- free_local_statfs_inodes(sdp);
+ if (!sdp->sd_args.ar_spectator) {
+ gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
+ free_local_statfs_inodes(sdp);
+ }
iput(sdp->sd_statfs_inode);
}
@@ -704,7 +708,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
if (undo) {
jindex = 0;
- goto fail_jinode_gh;
+ goto fail_statfs;
}
sdp->sd_jindex = gfs2_lookup_simple(master, "jindex");
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index b5cbe21efdfb..c26c68ebd29d 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -349,7 +349,7 @@ static int update_statfs_inode(struct gfs2_jdesc *jd,
mark_buffer_dirty(bh);
brelse(bh);
- gfs2_meta_sync(ip->i_gl);
+ gfs2_inode_metasync(ip->i_gl);
out:
return error;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index ee491bb9c1cc..92d799a193b8 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -719,9 +719,9 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
}
gfs2_free_clones(rgd);
+ return_all_reservations(rgd);
kfree(rgd->rd_bits);
rgd->rd_bits = NULL;
- return_all_reservations(rgd);
kmem_cache_free(gfs2_rgrpd_cachep, rgd);
}
}
@@ -1370,6 +1370,9 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
+ return -EROFS;
+
if (!blk_queue_discard(q))
return -EOPNOTSUPP;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index b285192bd6b3..b3d951ab8068 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -738,6 +738,7 @@ restart:
gfs2_jindex_free(sdp);
/* Take apart glock structures and buffer lists */
gfs2_gl_hash_clear(sdp);
+ truncate_inode_pages_final(&sdp->sd_aspace);
gfs2_delete_debugfs_file(sdp);
/* Unmount the locking protocol */
gfs2_lm_unmount(sdp);
diff --git a/fs/hfs/btree.h b/fs/hfs/btree.h
index dcc2aab1b2c4..4ba45caf5939 100644
--- a/fs/hfs/btree.h
+++ b/fs/hfs/btree.h
@@ -60,7 +60,7 @@ struct hfs_bnode {
wait_queue_head_t lock_wq;
atomic_t refcnt;
unsigned int page_offset;
- struct page *page[0];
+ struct page *page[];
};
#define HFS_BNODE_ERROR 0
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 3b03fff68543..a92de5199ec3 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -117,7 +117,7 @@ struct hfs_bnode {
wait_queue_head_t lock_wq;
atomic_t refcnt;
unsigned int page_offset;
- struct page *page[0];
+ struct page *page[];
};
#define HFS_BNODE_LOCK 0
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 02894df7656d..b53c055bea6a 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -482,6 +482,10 @@ static void io_impersonate_work(struct io_worker *worker,
current->files = work->identity->files;
current->nsproxy = work->identity->nsproxy;
task_unlock(current);
+ if (!work->identity->files) {
+ /* failed grabbing files, ensure work gets cancelled */
+ work->flags |= IO_WQ_WORK_CANCEL;
+ }
}
if ((work->flags & IO_WQ_WORK_FS) && current->fs != work->identity->fs)
current->fs = work->identity->fs;
diff --git a/fs/io_uring.c b/fs/io_uring.c
index b42dfa0243bf..8018c7076b25 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -995,20 +995,33 @@ static void io_sq_thread_drop_mm(void)
if (mm) {
kthread_unuse_mm(mm);
mmput(mm);
+ current->mm = NULL;
}
}
static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
{
- if (!current->mm) {
- if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL) ||
- !ctx->sqo_task->mm ||
- !mmget_not_zero(ctx->sqo_task->mm)))
- return -EFAULT;
- kthread_use_mm(ctx->sqo_task->mm);
+ struct mm_struct *mm;
+
+ if (current->mm)
+ return 0;
+
+ /* Should never happen */
+ if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL)))
+ return -EFAULT;
+
+ task_lock(ctx->sqo_task);
+ mm = ctx->sqo_task->mm;
+ if (unlikely(!mm || !mmget_not_zero(mm)))
+ mm = NULL;
+ task_unlock(ctx->sqo_task);
+
+ if (mm) {
+ kthread_use_mm(mm);
+ return 0;
}
- return 0;
+ return -EFAULT;
}
static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
@@ -1274,9 +1287,12 @@ static bool io_identity_cow(struct io_kiocb *req)
/* add one for this request */
refcount_inc(&id->count);
- /* drop old identity, assign new one. one ref for req, one for tctx */
- if (req->work.identity != tctx->identity &&
- refcount_sub_and_test(2, &req->work.identity->count))
+ /* drop tctx and req identity references, if needed */
+ if (tctx->identity != &tctx->__identity &&
+ refcount_dec_and_test(&tctx->identity->count))
+ kfree(tctx->identity);
+ if (req->work.identity != &tctx->__identity &&
+ refcount_dec_and_test(&req->work.identity->count))
kfree(req->work.identity);
req->work.identity = id;
@@ -1365,6 +1381,9 @@ static void io_prep_async_work(struct io_kiocb *req)
io_req_init_async(req);
id = req->work.identity;
+ if (req->flags & REQ_F_FORCE_ASYNC)
+ req->work.flags |= IO_WQ_WORK_CONCURRENT;
+
if (req->flags & REQ_F_ISREG) {
if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
io_wq_hash_work(&req->work, file_inode(req->file));
@@ -1574,14 +1593,29 @@ static void io_cqring_mark_overflow(struct io_ring_ctx *ctx)
}
}
-static inline bool io_match_files(struct io_kiocb *req,
- struct files_struct *files)
+static inline bool __io_match_files(struct io_kiocb *req,
+ struct files_struct *files)
{
+ return ((req->flags & REQ_F_WORK_INITIALIZED) &&
+ (req->work.flags & IO_WQ_WORK_FILES)) &&
+ req->work.identity->files == files;
+}
+
+static bool io_match_files(struct io_kiocb *req,
+ struct files_struct *files)
+{
+ struct io_kiocb *link;
+
if (!files)
return true;
- if ((req->flags & REQ_F_WORK_INITIALIZED) &&
- (req->work.flags & IO_WQ_WORK_FILES))
- return req->work.identity->files == files;
+ if (__io_match_files(req, files))
+ return true;
+ if (req->flags & REQ_F_LINK_HEAD) {
+ list_for_each_entry(link, &req->link_list, link_list) {
+ if (__io_match_files(link, files))
+ return true;
+ }
+ }
return false;
}
@@ -1665,7 +1699,8 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
WRITE_ONCE(cqe->user_data, req->user_data);
WRITE_ONCE(cqe->res, res);
WRITE_ONCE(cqe->flags, cflags);
- } else if (ctx->cq_overflow_flushed || req->task->io_uring->in_idle) {
+ } else if (ctx->cq_overflow_flushed ||
+ atomic_read(&req->task->io_uring->in_idle)) {
/*
* If we're in ring overflow flush mode, or in task cancel mode,
* then we cannot store the request for later flushing, we need
@@ -1835,7 +1870,7 @@ static void __io_free_req(struct io_kiocb *req)
io_dismantle_req(req);
percpu_counter_dec(&tctx->inflight);
- if (tctx->in_idle)
+ if (atomic_read(&tctx->in_idle))
wake_up(&tctx->wait);
put_task_struct(req->task);
@@ -1846,59 +1881,39 @@ static void __io_free_req(struct io_kiocb *req)
percpu_ref_put(&ctx->refs);
}
-static bool io_link_cancel_timeout(struct io_kiocb *req)
+static void io_kill_linked_timeout(struct io_kiocb *req)
{
- struct io_timeout_data *io = req->async_data;
struct io_ring_ctx *ctx = req->ctx;
- int ret;
-
- ret = hrtimer_try_to_cancel(&io->timer);
- if (ret != -1) {
- io_cqring_fill_event(req, -ECANCELED);
- io_commit_cqring(ctx);
- req->flags &= ~REQ_F_LINK_HEAD;
- io_put_req_deferred(req, 1);
- return true;
- }
-
- return false;
-}
-
-static bool __io_kill_linked_timeout(struct io_kiocb *req)
-{
struct io_kiocb *link;
- bool wake_ev;
+ bool cancelled = false;
+ unsigned long flags;
- if (list_empty(&req->link_list))
- return false;
- link = list_first_entry(&req->link_list, struct io_kiocb, link_list);
- if (link->opcode != IORING_OP_LINK_TIMEOUT)
- return false;
+ spin_lock_irqsave(&ctx->completion_lock, flags);
+ link = list_first_entry_or_null(&req->link_list, struct io_kiocb,
+ link_list);
/*
* Can happen if a linked timeout fired and link had been like
* req -> link t-out -> link t-out [-> ...]
*/
- if (!(link->flags & REQ_F_LTIMEOUT_ACTIVE))
- return false;
+ if (link && (link->flags & REQ_F_LTIMEOUT_ACTIVE)) {
+ struct io_timeout_data *io = link->async_data;
+ int ret;
- list_del_init(&link->link_list);
- wake_ev = io_link_cancel_timeout(link);
+ list_del_init(&link->link_list);
+ ret = hrtimer_try_to_cancel(&io->timer);
+ if (ret != -1) {
+ io_cqring_fill_event(link, -ECANCELED);
+ io_commit_cqring(ctx);
+ cancelled = true;
+ }
+ }
req->flags &= ~REQ_F_LINK_TIMEOUT;
- return wake_ev;
-}
-
-static void io_kill_linked_timeout(struct io_kiocb *req)
-{
- struct io_ring_ctx *ctx = req->ctx;
- unsigned long flags;
- bool wake_ev;
-
- spin_lock_irqsave(&ctx->completion_lock, flags);
- wake_ev = __io_kill_linked_timeout(req);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
- if (wake_ev)
+ if (cancelled) {
io_cqring_ev_posted(ctx);
+ io_put_req(link);
+ }
}
static struct io_kiocb *io_req_link_next(struct io_kiocb *req)
@@ -4977,8 +4992,10 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
/* make sure double remove sees this as being gone */
wait->private = NULL;
spin_unlock(&poll->head->lock);
- if (!done)
- __io_async_wake(req, poll, mask, io_poll_task_func);
+ if (!done) {
+ /* use wait func handler, so it matches the rq type */
+ poll->wait.func(&poll->wait, mode, sync, key);
+ }
}
refcount_dec(&req->refs);
return 1;
@@ -6180,7 +6197,6 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs)
{
struct io_kiocb *linked_timeout;
- struct io_kiocb *nxt;
const struct cred *old_creds = NULL;
int ret;
@@ -6206,7 +6222,6 @@ again:
*/
if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
if (!io_arm_poll_handler(req)) {
-punt:
/*
* Queued up for async execution, worker will release
* submit reference when the iocb is actually submitted.
@@ -6216,33 +6231,25 @@ punt:
if (linked_timeout)
io_queue_linked_timeout(linked_timeout);
- goto exit;
- }
+ } else if (likely(!ret)) {
+ /* drop submission reference */
+ req = io_put_req_find_next(req);
+ if (linked_timeout)
+ io_queue_linked_timeout(linked_timeout);
- if (unlikely(ret)) {
+ if (req) {
+ if (!(req->flags & REQ_F_FORCE_ASYNC))
+ goto again;
+ io_queue_async_work(req);
+ }
+ } else {
/* un-prep timeout, so it'll be killed as any other linked */
req->flags &= ~REQ_F_LINK_TIMEOUT;
req_set_fail_links(req);
io_put_req(req);
io_req_complete(req, ret);
- goto exit;
}
- /* drop submission reference */
- nxt = io_put_req_find_next(req);
- if (linked_timeout)
- io_queue_linked_timeout(linked_timeout);
-
- if (nxt) {
- req = nxt;
-
- if (req->flags & REQ_F_FORCE_ASYNC) {
- linked_timeout = NULL;
- goto punt;
- }
- goto again;
- }
-exit:
if (old_creds)
revert_creds(old_creds);
}
@@ -6266,13 +6273,6 @@ fail_req:
if (unlikely(ret))
goto fail_req;
}
-
- /*
- * Never try inline submit of IOSQE_ASYNC is set, go straight
- * to async execution.
- */
- io_req_init_async(req);
- req->work.flags |= IO_WQ_WORK_CONCURRENT;
io_queue_async_work(req);
} else {
if (sqe) {
@@ -7727,7 +7727,8 @@ static int io_uring_alloc_task_context(struct task_struct *task)
xa_init(&tctx->xa);
init_waitqueue_head(&tctx->wait);
tctx->last = NULL;
- tctx->in_idle = 0;
+ atomic_set(&tctx->in_idle, 0);
+ tctx->sqpoll = false;
io_init_identity(&tctx->__identity);
tctx->identity = &tctx->__identity;
task->io_uring = tctx;
@@ -8420,22 +8421,6 @@ static bool io_match_link(struct io_kiocb *preq, struct io_kiocb *req)
return false;
}
-static bool io_match_link_files(struct io_kiocb *req,
- struct files_struct *files)
-{
- struct io_kiocb *link;
-
- if (io_match_files(req, files))
- return true;
- if (req->flags & REQ_F_LINK_HEAD) {
- list_for_each_entry(link, &req->link_list, link_list) {
- if (io_match_files(link, files))
- return true;
- }
- }
- return false;
-}
-
/*
* We're looking to cancel 'req' because it's holding on to our files, but
* 'req' could be a link to another request. See if it is, and cancel that
@@ -8485,7 +8470,21 @@ static bool io_timeout_remove_link(struct io_ring_ctx *ctx,
static bool io_cancel_link_cb(struct io_wq_work *work, void *data)
{
- return io_match_link(container_of(work, struct io_kiocb, work), data);
+ struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+ bool ret;
+
+ if (req->flags & REQ_F_LINK_TIMEOUT) {
+ unsigned long flags;
+ struct io_ring_ctx *ctx = req->ctx;
+
+ /* protect against races with linked timeouts */
+ spin_lock_irqsave(&ctx->completion_lock, flags);
+ ret = io_match_link(req, data);
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ } else {
+ ret = io_match_link(req, data);
+ }
+ return ret;
}
static void io_attempt_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req)
@@ -8511,6 +8510,7 @@ static void io_attempt_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req)
}
static void io_cancel_defer_files(struct io_ring_ctx *ctx,
+ struct task_struct *task,
struct files_struct *files)
{
struct io_defer_entry *de = NULL;
@@ -8518,7 +8518,8 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx,
spin_lock_irq(&ctx->completion_lock);
list_for_each_entry_reverse(de, &ctx->defer_list, list) {
- if (io_match_link_files(de->req, files)) {
+ if (io_task_match(de->req, task) &&
+ io_match_files(de->req, files)) {
list_cut_position(&list, &ctx->defer_list, &de->list);
break;
}
@@ -8544,7 +8545,6 @@ static bool io_uring_cancel_files(struct io_ring_ctx *ctx,
if (list_empty_careful(&ctx->inflight_list))
return false;
- io_cancel_defer_files(ctx, files);
/* cancel all at once, should be faster than doing it one by one*/
io_wq_cancel_cb(ctx->io_wq, io_wq_files_match, files, true);
@@ -8630,8 +8630,16 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
{
struct task_struct *task = current;
- if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data)
+ if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
task = ctx->sq_data->thread;
+ atomic_inc(&task->io_uring->in_idle);
+ io_sq_thread_park(ctx->sq_data);
+ }
+
+ if (files)
+ io_cancel_defer_files(ctx, NULL, files);
+ else
+ io_cancel_defer_files(ctx, task, NULL);
io_cqring_overflow_flush(ctx, true, task, files);
@@ -8639,12 +8647,23 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
io_run_task_work();
cond_resched();
}
+
+ if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
+ atomic_dec(&task->io_uring->in_idle);
+ /*
+ * If the files that are going away are the ones in the thread
+ * identity, clear them out.
+ */
+ if (task->io_uring->identity->files == files)
+ task->io_uring->identity->files = NULL;
+ io_sq_thread_unpark(ctx->sq_data);
+ }
}
/*
* Note that this task has used io_uring. We use it for cancelation purposes.
*/
-static int io_uring_add_task_file(struct file *file)
+static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file)
{
struct io_uring_task *tctx = current->io_uring;
@@ -8666,6 +8685,14 @@ static int io_uring_add_task_file(struct file *file)
tctx->last = file;
}
+ /*
+ * This is race safe in that the task itself is doing this, hence it
+ * cannot be going through the exit/cancel paths at the same time.
+ * This cannot be modified while exit/cancel is running.
+ */
+ if (!tctx->sqpoll && (ctx->flags & IORING_SETUP_SQPOLL))
+ tctx->sqpoll = true;
+
return 0;
}
@@ -8707,7 +8734,7 @@ void __io_uring_files_cancel(struct files_struct *files)
unsigned long index;
/* make sure overflow events are dropped */
- tctx->in_idle = true;
+ atomic_inc(&tctx->in_idle);
xa_for_each(&tctx->xa, index, file) {
struct io_ring_ctx *ctx = file->private_data;
@@ -8716,6 +8743,35 @@ void __io_uring_files_cancel(struct files_struct *files)
if (files)
io_uring_del_task_file(file);
}
+
+ atomic_dec(&tctx->in_idle);
+}
+
+static s64 tctx_inflight(struct io_uring_task *tctx)
+{
+ unsigned long index;
+ struct file *file;
+ s64 inflight;
+
+ inflight = percpu_counter_sum(&tctx->inflight);
+ if (!tctx->sqpoll)
+ return inflight;
+
+ /*
+ * If we have SQPOLL rings, then we need to iterate and find them, and
+ * add the pending count for those.
+ */
+ xa_for_each(&tctx->xa, index, file) {
+ struct io_ring_ctx *ctx = file->private_data;
+
+ if (ctx->flags & IORING_SETUP_SQPOLL) {
+ struct io_uring_task *__tctx = ctx->sqo_task->io_uring;
+
+ inflight += percpu_counter_sum(&__tctx->inflight);
+ }
+ }
+
+ return inflight;
}
/*
@@ -8729,11 +8785,11 @@ void __io_uring_task_cancel(void)
s64 inflight;
/* make sure overflow events are dropped */
- tctx->in_idle = true;
+ atomic_inc(&tctx->in_idle);
do {
/* read completions before cancelations */
- inflight = percpu_counter_sum(&tctx->inflight);
+ inflight = tctx_inflight(tctx);
if (!inflight)
break;
__io_uring_files_cancel(NULL);
@@ -8744,13 +8800,13 @@ void __io_uring_task_cancel(void)
* If we've seen completions, retry. This avoids a race where
* a completion comes in before we did prepare_to_wait().
*/
- if (inflight != percpu_counter_sum(&tctx->inflight))
+ if (inflight != tctx_inflight(tctx))
continue;
schedule();
} while (1);
finish_wait(&tctx->wait, &wait);
- tctx->in_idle = false;
+ atomic_dec(&tctx->in_idle);
}
static int io_uring_flush(struct file *file, void *data)
@@ -8895,7 +8951,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
io_sqpoll_wait_sq(ctx);
submitted = to_submit;
} else if (to_submit) {
- ret = io_uring_add_task_file(f.file);
+ ret = io_uring_add_task_file(ctx, f.file);
if (unlikely(ret))
goto out;
mutex_lock(&ctx->uring_lock);
@@ -8932,7 +8988,8 @@ out_fput:
#ifdef CONFIG_PROC_FS
static int io_uring_show_cred(int id, void *p, void *data)
{
- const struct cred *cred = p;
+ struct io_identity *iod = p;
+ const struct cred *cred = iod->creds;
struct seq_file *m = data;
struct user_namespace *uns = seq_user_ns(m);
struct group_info *gi;
@@ -9124,7 +9181,7 @@ err_fd:
#if defined(CONFIG_UNIX)
ctx->ring_sock->file = file;
#endif
- if (unlikely(io_uring_add_task_file(file))) {
+ if (unlikely(io_uring_add_task_file(ctx, file))) {
file = ERR_PTR(-ENOMEM);
goto err_fd;
}
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 8180061b9e16..10cc7979ce38 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1374,6 +1374,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list));
WARN_ON_ONCE(!PageLocked(page));
WARN_ON_ONCE(PageWriteback(page));
+ WARN_ON_ONCE(PageDirty(page));
/*
* We cannot cancel the ioend directly here on error. We may have
@@ -1382,33 +1383,22 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
* appropriately.
*/
if (unlikely(error)) {
+ /*
+ * Let the filesystem know what portion of the current page
+ * failed to map. If the page wasn't been added to ioend, it
+ * won't be affected by I/O completion and we must unlock it
+ * now.
+ */
+ if (wpc->ops->discard_page)
+ wpc->ops->discard_page(page, file_offset);
if (!count) {
- /*
- * If the current page hasn't been added to ioend, it
- * won't be affected by I/O completions and we must
- * discard and unlock it right here.
- */
- if (wpc->ops->discard_page)
- wpc->ops->discard_page(page);
ClearPageUptodate(page);
unlock_page(page);
goto done;
}
-
- /*
- * If the page was not fully cleaned, we need to ensure that the
- * higher layers come back to it correctly. That means we need
- * to keep the page dirty, and for WB_SYNC_ALL writeback we need
- * to ensure the PAGECACHE_TAG_TOWRITE index mark is not removed
- * so another attempt to write this page in this writeback sweep
- * will be made.
- */
- set_page_writeback_keepwrite(page);
- } else {
- clear_page_dirty_for_io(page);
- set_page_writeback(page);
}
+ set_page_writeback(page);
unlock_page(page);
/*
diff --git a/fs/isofs/rock.h b/fs/isofs/rock.h
index 1558cf22ef8a..ee9660e9671c 100644
--- a/fs/isofs/rock.h
+++ b/fs/isofs/rock.h
@@ -22,7 +22,7 @@ struct SU_ER_s {
__u8 len_des;
__u8 len_src;
__u8 ext_ver;
- __u8 data[0];
+ __u8 data[];
} __attribute__ ((packed));
struct RR_RR_s {
@@ -44,7 +44,7 @@ struct RR_PN_s {
struct SL_component {
__u8 flags;
__u8 len;
- __u8 text[0];
+ __u8 text[];
} __attribute__ ((packed));
struct RR_SL_s {
@@ -54,7 +54,7 @@ struct RR_SL_s {
struct RR_NM_s {
__u8 flags;
- char name[0];
+ char name[];
} __attribute__ ((packed));
struct RR_CL_s {
@@ -71,7 +71,7 @@ struct stamp {
struct RR_TF_s {
__u8 flags;
- struct stamp times[0]; /* Variable number of these beasts */
+ struct stamp times[]; /* Variable number of these beasts */
} __attribute__ ((packed));
/* Linux-specific extension for transparent decompression */
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 0f707003dda5..b362523a9829 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1049,6 +1049,8 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) /
OOM_SCORE_ADJ_MAX;
put_task_struct(task);
+ if (oom_adj > OOM_ADJUST_MAX)
+ oom_adj = OOM_ADJUST_MAX;
len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj);
return simple_read_from_buffer(buf, count, ppos, buffer, len);
}
diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c
index d0989a443c77..419760fd77bd 100644
--- a/fs/proc/cpuinfo.c
+++ b/fs/proc/cpuinfo.c
@@ -19,7 +19,7 @@ static int cpuinfo_open(struct inode *inode, struct file *file)
static const struct proc_ops cpuinfo_proc_ops = {
.proc_flags = PROC_ENTRY_PERMANENT,
.proc_open = cpuinfo_open,
- .proc_read = seq_read,
+ .proc_read_iter = seq_read_iter,
.proc_lseek = seq_lseek,
.proc_release = seq_release,
};
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 2f9fa179194d..b84663252add 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -590,7 +590,7 @@ static int proc_seq_release(struct inode *inode, struct file *file)
static const struct proc_ops proc_seq_ops = {
/* not permanent -- can call into arbitrary seq_operations */
.proc_open = proc_seq_open,
- .proc_read = seq_read,
+ .proc_read_iter = seq_read_iter,
.proc_lseek = seq_lseek,
.proc_release = proc_seq_release,
};
@@ -621,7 +621,7 @@ static int proc_single_open(struct inode *inode, struct file *file)
static const struct proc_ops proc_single_ops = {
/* not permanent -- can call into arbitrary ->single_show */
.proc_open = proc_single_open,
- .proc_read = seq_read,
+ .proc_read_iter = seq_read_iter,
.proc_lseek = seq_lseek,
.proc_release = single_release,
};
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 58c075e2a452..bde6b6f69852 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -597,6 +597,7 @@ static const struct file_operations proc_iter_file_ops = {
.llseek = proc_reg_llseek,
.read_iter = proc_reg_read_iter,
.write = proc_reg_write,
+ .splice_read = generic_file_splice_read,
.poll = proc_reg_poll,
.unlocked_ioctl = proc_reg_unlocked_ioctl,
.mmap = proc_reg_mmap,
@@ -622,6 +623,7 @@ static const struct file_operations proc_reg_file_ops_compat = {
static const struct file_operations proc_iter_file_ops_compat = {
.llseek = proc_reg_llseek,
.read_iter = proc_reg_read_iter,
+ .splice_read = generic_file_splice_read,
.write = proc_reg_write,
.poll = proc_reg_poll,
.unlocked_ioctl = proc_reg_unlocked_ioctl,
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 46b3293015fe..4695b6de3151 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -226,7 +226,7 @@ static int stat_open(struct inode *inode, struct file *file)
static const struct proc_ops stat_proc_ops = {
.proc_flags = PROC_ENTRY_PERMANENT,
.proc_open = stat_open,
- .proc_read = seq_read,
+ .proc_read_iter = seq_read_iter,
.proc_lseek = seq_lseek,
.proc_release = single_release,
};
diff --git a/fs/select.c b/fs/select.c
index 7aef49552d4c..ebfebdfe5c69 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -97,7 +97,7 @@ u64 select_estimate_accuracy(struct timespec64 *tv)
struct poll_table_page {
struct poll_table_page * next;
struct poll_table_entry * entry;
- struct poll_table_entry entries[0];
+ struct poll_table_entry entries[];
};
#define POLL_TABLE_FULL(table) \
@@ -836,7 +836,7 @@ SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg)
struct poll_list {
struct poll_list *next;
int len;
- struct pollfd entries[0];
+ struct pollfd entries[];
};
#define POLLFD_PER_PAGE ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd))
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 31219c1db17d..3b20e21604e7 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -18,6 +18,7 @@
#include <linux/mm.h>
#include <linux/printk.h>
#include <linux/string_helpers.h>
+#include <linux/uio.h>
#include <linux/uaccess.h>
#include <asm/page.h>
@@ -146,7 +147,28 @@ Eoverflow:
*/
ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
{
- struct seq_file *m = file->private_data;
+ struct iovec iov = { .iov_base = buf, .iov_len = size};
+ struct kiocb kiocb;
+ struct iov_iter iter;
+ ssize_t ret;
+
+ init_sync_kiocb(&kiocb, file);
+ iov_iter_init(&iter, READ, &iov, 1, size);
+
+ kiocb.ki_pos = *ppos;
+ ret = seq_read_iter(&kiocb, &iter);
+ *ppos = kiocb.ki_pos;
+ return ret;
+}
+EXPORT_SYMBOL(seq_read);
+
+/*
+ * Ready-made ->f_op->read_iter()
+ */
+ssize_t seq_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct seq_file *m = iocb->ki_filp->private_data;
+ size_t size = iov_iter_count(iter);
size_t copied = 0;
size_t n;
void *p;
@@ -158,14 +180,14 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
* if request is to read from zero offset, reset iterator to first
* record as it might have been already advanced by previous requests
*/
- if (*ppos == 0) {
+ if (iocb->ki_pos == 0) {
m->index = 0;
m->count = 0;
}
- /* Don't assume *ppos is where we left it */
- if (unlikely(*ppos != m->read_pos)) {
- while ((err = traverse(m, *ppos)) == -EAGAIN)
+ /* Don't assume ki_pos is where we left it */
+ if (unlikely(iocb->ki_pos != m->read_pos)) {
+ while ((err = traverse(m, iocb->ki_pos)) == -EAGAIN)
;
if (err) {
/* With prejudice... */
@@ -174,7 +196,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
m->count = 0;
goto Done;
} else {
- m->read_pos = *ppos;
+ m->read_pos = iocb->ki_pos;
}
}
@@ -187,13 +209,11 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
/* if not empty - flush it first */
if (m->count) {
n = min(m->count, size);
- err = copy_to_user(buf, m->buf + m->from, n);
- if (err)
+ if (copy_to_iter(m->buf + m->from, n, iter) != n)
goto Efault;
m->count -= n;
m->from += n;
size -= n;
- buf += n;
copied += n;
if (!size)
goto Done;
@@ -254,8 +274,7 @@ Fill:
}
m->op->stop(m, p);
n = min(m->count, size);
- err = copy_to_user(buf, m->buf, n);
- if (err)
+ if (copy_to_iter(m->buf, n, iter) != n)
goto Efault;
copied += n;
m->count -= n;
@@ -264,7 +283,7 @@ Done:
if (!copied)
copied = err;
else {
- *ppos += copied;
+ iocb->ki_pos += copied;
m->read_pos += copied;
}
mutex_unlock(&m->lock);
@@ -276,7 +295,7 @@ Efault:
err = -EFAULT;
goto Done;
}
-EXPORT_SYMBOL(seq_read);
+EXPORT_SYMBOL(seq_read_iter);
/**
* seq_lseek - ->llseek() method for sequential files.
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 852b536551b5..15640015be9d 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2467,6 +2467,7 @@ xfs_defer_agfl_block(
new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);
new->xefi_blockcount = 1;
new->xefi_oinfo = *oinfo;
+ new->xefi_skip_discard = false;
trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index e1bd484e5548..6747e97a7949 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -52,9 +52,9 @@ struct xfs_extent_free_item
{
xfs_fsblock_t xefi_startblock;/* starting fs block number */
xfs_extlen_t xefi_blockcount;/* number of blocks in extent */
+ bool xefi_skip_discard;
struct list_head xefi_list;
struct xfs_owner_info xefi_oinfo; /* extent owner */
- bool xefi_skip_discard;
};
#define XFS_BMAP_MAX_NMAP 4
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 3aa85b64de36..bb25ff1b770d 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -121,8 +121,7 @@ xchk_inode_flags(
goto bad;
/* rt flags require rt device */
- if ((flags & (XFS_DIFLAG_REALTIME | XFS_DIFLAG_RTINHERIT)) &&
- !mp->m_rtdev_targp)
+ if ((flags & XFS_DIFLAG_REALTIME) && !mp->m_rtdev_targp)
goto bad;
/* new rt bitmap flag only valid for rbmino */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 55d126d4e096..4304c6416fbb 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -346,8 +346,8 @@ xfs_map_blocks(
ssize_t count = i_blocksize(inode);
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
- xfs_fileoff_t cow_fsb = NULLFILEOFF;
- int whichfork = XFS_DATA_FORK;
+ xfs_fileoff_t cow_fsb;
+ int whichfork;
struct xfs_bmbt_irec imap;
struct xfs_iext_cursor icur;
int retries = 0;
@@ -381,6 +381,8 @@ xfs_map_blocks(
* landed in a hole and we skip the block.
*/
retry:
+ cow_fsb = NULLFILEOFF;
+ whichfork = XFS_DATA_FORK;
xfs_ilock(ip, XFS_ILOCK_SHARED);
ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE ||
(ip->i_df.if_flags & XFS_IFEXTENTS));
@@ -527,13 +529,15 @@ xfs_prepare_ioend(
*/
static void
xfs_discard_page(
- struct page *page)
+ struct page *page,
+ loff_t fileoff)
{
struct inode *inode = page->mapping->host;
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
- loff_t offset = page_offset(page);
- xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, offset);
+ unsigned int pageoff = offset_in_page(fileoff);
+ xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, fileoff);
+ xfs_fileoff_t pageoff_fsb = XFS_B_TO_FSBT(mp, pageoff);
int error;
if (XFS_FORCED_SHUTDOWN(mp))
@@ -541,14 +545,14 @@ xfs_discard_page(
xfs_alert_ratelimited(mp,
"page discard on page "PTR_FMT", inode 0x%llx, offset %llu.",
- page, ip->i_ino, offset);
+ page, ip->i_ino, fileoff);
error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
- i_blocks_per_page(inode, page));
+ i_blocks_per_page(inode, page) - pageoff_fsb);
if (error && !XFS_FORCED_SHUTDOWN(mp))
xfs_alert(mp, "page discard unable to remove delalloc mapping.");
out_invalidate:
- iomap_invalidatepage(page, 0, PAGE_SIZE);
+ iomap_invalidatepage(page, pageoff, PAGE_SIZE - pageoff);
}
static const struct iomap_writeback_ops xfs_writeback_ops = {
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 5e165456da68..1414ab79eacf 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -911,6 +911,16 @@ xfs_setattr_size(
error = iomap_zero_range(inode, oldsize, newsize - oldsize,
&did_zeroing, &xfs_buffered_write_iomap_ops);
} else {
+ /*
+ * iomap won't detect a dirty page over an unwritten block (or a
+ * cow block over a hole) and subsequently skips zeroing the
+ * newly post-EOF portion of the page. Flush the new EOF to
+ * convert the block before the pagecache truncate.
+ */
+ error = filemap_write_and_wait_range(inode->i_mapping, newsize,
+ newsize);
+ if (error)
+ return error;
error = iomap_truncate_page(inode, newsize, &did_zeroing,
&xfs_buffered_write_iomap_ops);
}
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 16098dc42add..6fa05fb78189 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1502,7 +1502,8 @@ xfs_reflink_unshare(
&xfs_buffered_write_iomap_ops);
if (error)
goto out;
- error = filemap_write_and_wait(inode->i_mapping);
+
+ error = filemap_write_and_wait_range(inode->i_mapping, offset, len);
if (error)
goto out;