author    Linus Torvalds <torvalds@linux-foundation.org>  2023-04-27 19:42:02 -0700
committer Linus Torvalds <torvalds@linux-foundation.org>  2023-04-27 19:42:02 -0700
commit    7fa8a8ee9400fe8ec188426e40e481717bc5e924 (patch)
tree      cc8fd6b4f936ec01e73238643757451e20478c07 /fs
parent    91ec4b0d11fe115581ce2835300558802ce55e6c (diff)
parent    4d4b6d66db63ceed399f1fb1a4b24081d2590eb1 (diff)
Merge tag 'mm-stable-2023-04-27-15-30' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton:

 - Nick Piggin's "shoot lazy tlbs" series, to improve the performance of switching from a user process to a kernel thread

 - More folio conversions from Kefeng Wang, Zhang Peng and Pankaj Raghav

 - zsmalloc performance improvements from Sergey Senozhatsky

 - Yue Zhao has found and fixed some data race issues around the alteration of memcg userspace tunables

 - VFS rationalizations from Christoph Hellwig:
     - removal of most of the callers of write_one_page()
     - make __filemap_get_folio()'s return value more useful

 - Luis Chamberlain has changed tmpfs so it no longer requires swap backing. Use `mount -o noswap'.

 - Qi Zheng has made the slab shrinkers operate locklessly, providing some scalability benefits

 - Keith Busch has improved dmapool's performance, making part of its operations O(1) rather than O(n)

 - Peter Xu adds the UFFD_FEATURE_WP_UNPOPULATED feature to userfaultfd, permitting userspace to wr-protect unpopulated ptes in anonymous memory

 - Kirill Shutemov has changed MAX_ORDER's meaning to be inclusive rather than exclusive, and has fixed a bunch of errors which were caused by its unintuitive meaning

 - Axel Rasmussen gives userfaultfd the UFFDIO_CONTINUE_MODE_WP feature, which causes minor faults to install a write-protected pte

 - Vlastimil Babka has done some maintenance work on vma_merge(): cleanups to the kernel code and improvements to our userspace test harness

 - Cleanups to do_fault_around() by Lorenzo Stoakes

 - Mike Rapoport has moved a lot of initialization code out of various mm/ files and into mm/mm_init.c

 - Lorenzo Stoakes removed vmf_insert_mixed_prot(), which was added for DRM, but DRM doesn't use it any more

 - Lorenzo has also converted read_kcore() and vread() to use iterators and has thereby removed the use of bounce buffers in some cases

 - Lorenzo has also contributed further cleanups of vma_merge()

 - Chaitanya Prakash provides some fixes to the mmap selftesting code

 - Matthew Wilcox changes xfs and afs so they no longer take sleeping locks in ->map_pages(), a step towards RCUification of pagefaults

 - Suren Baghdasaryan has improved mmap_lock scalability by switching to per-VMA locking

 - Frederic Weisbecker has reworked the percpu cache draining so that it no longer causes latency glitches on cpu-isolated workloads

 - Mike Rapoport cleans up and corrects the ARCH_FORCE_MAX_ORDER Kconfig logic

 - Liu Shixin has changed zswap's initialization so we no longer waste a chunk of memory if zswap is not being used

 - Yosry Ahmed has improved the performance of memcg statistics flushing

 - David Stevens has fixed several issues involving khugepaged, userfaultfd and shmem

 - Christoph Hellwig has provided some cleanup work to zram's IO-related code paths

 - David Hildenbrand has fixed up some issues in the selftest code's testing of our pte state changing

 - Pankaj Raghav has made page_endio() unneeded and has removed it

 - Peter Xu contributed some rationalizations of the userfaultfd selftests

 - Yosry Ahmed has fixed an issue around memcg's page reclaim accounting

 - Chaitanya Prakash has fixed some arm-related issues in the selftests/mm code

 - Longlong Xia has improved the way in which KSM handles hwpoisoned pages

 - Peter Xu fixes a few issues with uffd-wp at fork() time

 - Stefan Roesch has changed KSM so that it may now be used on a per-process and per-cgroup basis (see the userspace sketch after the commit list below)
* tag 'mm-stable-2023-04-27-15-30' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (369 commits)
  mm,unmap: avoid flushing TLB in batch if PTE is inaccessible
  shmem: restrict noswap option to initial user namespace
  mm/khugepaged: fix conflicting mods to collapse_file()
  sparse: remove unnecessary 0 values from rc
  mm: move 'mmap_min_addr' logic from callers into vm_unmapped_area()
  hugetlb: pte_alloc_huge() to replace huge pte_alloc_map()
  maple_tree: fix allocation in mas_sparse_area()
  mm: do not increment pgfault stats when page fault handler retries
  zsmalloc: allow only one active pool compaction context
  selftests/mm: add new selftests for KSM
  mm: add new KSM process and sysfs knobs
  mm: add new api to enable ksm per process
  mm: shrinkers: fix debugfs file permissions
  mm: don't check VMA write permissions if the PTE/PMD indicates write permissions
  migrate_pages_batch: fix statistics for longterm pin retry
  userfaultfd: use helper function range_in_vma()
  lib/show_mem.c: use for_each_populated_zone() simplify code
  mm: correct arg in reclaim_pages()/reclaim_clean_pages_from_list()
  fs/buffer: convert create_page_buffers to folio_create_buffers
  fs/buffer: add folio_create_empty_buffers helper
  ...
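As a concrete example of Stefan Roesch's per-process KSM item, here is a minimal userspace sketch, assuming the PR_SET_MEMORY_MERGE prctl added by the "mm: add new api to enable ksm per process" commit above; the fallback constant is an assumption for systems with older headers:

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SET_MEMORY_MERGE
#define PR_SET_MEMORY_MERGE 67	/* assumed value; provided by newer <linux/prctl.h> */
#endif

int main(void)
{
	/* Opt every compatible anonymous VMA of this process in to KSM,
	 * instead of calling madvise(MADV_MERGEABLE) per region. */
	if (prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0))
		perror("PR_SET_MEMORY_MERGE");
	return 0;
}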
Diffstat (limited to 'fs')
-rw-r--r--  fs/Kconfig                  9
-rw-r--r--  fs/afs/dir.c               10
-rw-r--r--  fs/afs/dir_edit.c           2
-rw-r--r--  fs/afs/file.c              14
-rw-r--r--  fs/afs/inode.c             27
-rw-r--r--  fs/afs/internal.h           1
-rw-r--r--  fs/afs/write.c              4
-rw-r--r--  fs/buffer.c                89
-rw-r--r--  fs/exec.c                   2
-rw-r--r--  fs/ext4/inline.c           19
-rw-r--r--  fs/ext4/inode.c            14
-rw-r--r--  fs/ext4/move_extent.c       8
-rw-r--r--  fs/ext4/verity.c            6
-rw-r--r--  fs/hugetlbfs/inode.c        4
-rw-r--r--  fs/inode.c                  3
-rw-r--r--  fs/iomap/buffered-io.c     11
-rw-r--r--  fs/mpage.c                 66
-rw-r--r--  fs/netfs/buffered_read.c    4
-rw-r--r--  fs/nfs/file.c               4
-rw-r--r--  fs/nilfs2/page.c            6
-rw-r--r--  fs/orangefs/inode.c         9
-rw-r--r--  fs/proc/base.c              3
-rw-r--r--  fs/proc/kcore.c            85
-rw-r--r--  fs/proc/meminfo.c          13
-rw-r--r--  fs/proc/task_mmu.c          3
-rw-r--r--  fs/ramfs/file-nommu.c       2
-rw-r--r--  fs/super.c                  2
-rw-r--r--  fs/userfaultfd.c           45
-rw-r--r--  fs/xfs/xfs_buf.c            3
-rw-r--r--  fs/xfs/xfs_file.c          17
30 files changed, 275 insertions, 210 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index e99830c65033..cc07a0cd3172 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -250,16 +250,9 @@ config HUGETLBFS
config HUGETLB_PAGE
def_bool HUGETLBFS
-#
-# Select this config option from the architecture Kconfig, if it is preferred
-# to enable the feature of HugeTLB Vmemmap Optimization (HVO).
-#
-config ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
- bool
-
config HUGETLB_PAGE_OPTIMIZE_VMEMMAP
def_bool HUGETLB_PAGE
- depends on ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+ depends on ARCH_WANT_OPTIMIZE_VMEMMAP
depends on SPARSEMEM_VMEMMAP
config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 82690d1dd49a..f92b9e62d567 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -319,16 +319,16 @@ expand:
struct folio *folio;
folio = filemap_get_folio(mapping, i);
- if (!folio) {
+ if (IS_ERR(folio)) {
if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
afs_stat_v(dvnode, n_inval);
-
- ret = -ENOMEM;
folio = __filemap_get_folio(mapping,
i, FGP_LOCK | FGP_CREAT,
mapping->gfp_mask);
- if (!folio)
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
goto error;
+ }
folio_attach_private(folio, (void *)1);
folio_unlock(folio);
}
@@ -524,7 +524,7 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx,
*/
folio = __filemap_get_folio(dir->i_mapping, ctx->pos / PAGE_SIZE,
FGP_ACCESSED, 0);
- if (!folio) {
+ if (IS_ERR(folio)) {
ret = afs_bad(dvnode, afs_file_error_dir_missing_page);
break;
}
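All of the afs hunks here (and the ext4, nfs, netfs and nilfs2 ones below) are the same mechanical conversion: __filemap_get_folio() and its wrappers now return an ERR_PTR() rather than NULL on failure. A minimal sketch of the new calling convention, with illustrative names:

static int example_lock_folio(struct address_space *mapping, pgoff_t index)
{
	struct folio *folio;

	folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_CREAT,
				    mapping_gfp_mask(mapping));
	if (IS_ERR(folio))		/* NULL is no longer returned */
		return PTR_ERR(folio);	/* e.g. -ENOMEM */

	/* ... operate on the locked folio ... */
	folio_unlock(folio);
	folio_put(folio);
	return 0;
}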
diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c
index 0ab7752d1b75..f0eddccbdd95 100644
--- a/fs/afs/dir_edit.c
+++ b/fs/afs/dir_edit.c
@@ -115,7 +115,7 @@ static struct folio *afs_dir_get_folio(struct afs_vnode *vnode, pgoff_t index)
folio = __filemap_get_folio(mapping, index,
FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
mapping->gfp_mask);
- if (!folio)
+ if (IS_ERR(folio))
clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
else if (folio && !folio_test_private(folio))
folio_attach_private(folio, (void *)1);
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 68d6d5dc608d..719b31374879 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -569,20 +569,10 @@ static void afs_vm_close(struct vm_area_struct *vma)
static vm_fault_t afs_vm_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff)
{
struct afs_vnode *vnode = AFS_FS_I(file_inode(vmf->vma->vm_file));
- struct afs_file *af = vmf->vma->vm_file->private_data;
- switch (afs_validate(vnode, af->key)) {
- case 0:
+ if (afs_pagecache_valid(vnode))
return filemap_map_pages(vmf, start_pgoff, end_pgoff);
- case -ENOMEM:
- return VM_FAULT_OOM;
- case -EINTR:
- case -ERESTARTSYS:
- return VM_FAULT_RETRY;
- case -ESTALE:
- default:
- return VM_FAULT_SIGBUS;
- }
+ return 0;
}
static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 0167e96e5198..b1bdffd5e888 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -668,6 +668,24 @@ bool afs_check_validity(struct afs_vnode *vnode)
}
/*
+ * Returns true if the pagecache is still valid. Does not sleep.
+ */
+bool afs_pagecache_valid(struct afs_vnode *vnode)
+{
+ if (unlikely(test_bit(AFS_VNODE_DELETED, &vnode->flags))) {
+ if (vnode->netfs.inode.i_nlink)
+ clear_nlink(&vnode->netfs.inode);
+ return true;
+ }
+
+ if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags) &&
+ afs_check_validity(vnode))
+ return true;
+
+ return false;
+}
+
+/*
* validate a vnode/inode
* - there are several things we need to check
* - parent dir data changes (rm, rmdir, rename, mkdir, create, link,
@@ -684,14 +702,7 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
vnode->fid.vid, vnode->fid.vnode, vnode->flags,
key_serial(key));
- if (unlikely(test_bit(AFS_VNODE_DELETED, &vnode->flags))) {
- if (vnode->netfs.inode.i_nlink)
- clear_nlink(&vnode->netfs.inode);
- goto valid;
- }
-
- if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags) &&
- afs_check_validity(vnode))
+ if (afs_pagecache_valid(vnode))
goto valid;
down_write(&vnode->validate_lock);
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index ad8523d0d038..5c95df6621f9 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -1171,6 +1171,7 @@ extern struct inode *afs_iget(struct afs_operation *, struct afs_vnode_param *);
extern struct inode *afs_root_iget(struct super_block *, struct key *);
extern bool afs_check_validity(struct afs_vnode *);
extern int afs_validate(struct afs_vnode *, struct key *);
+bool afs_pagecache_valid(struct afs_vnode *);
extern int afs_getattr(struct mnt_idmap *idmap, const struct path *,
struct kstat *, u32, unsigned int);
extern int afs_setattr(struct mnt_idmap *idmap, struct dentry *, struct iattr *);
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 571f3b9a417e..c822d6006033 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -232,7 +232,7 @@ static void afs_kill_pages(struct address_space *mapping,
_debug("kill %lx (to %lx)", index, last);
folio = filemap_get_folio(mapping, index);
- if (!folio) {
+ if (IS_ERR(folio)) {
next = index + 1;
continue;
}
@@ -270,7 +270,7 @@ static void afs_redirty_pages(struct writeback_control *wbc,
_debug("redirty %llx @%llx", len, start);
folio = filemap_get_folio(mapping, index);
- if (!folio) {
+ if (IS_ERR(folio)) {
next = index + 1;
continue;
}
diff --git a/fs/buffer.c b/fs/buffer.c
index b3eb905f87d6..a7fc561758b1 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -842,7 +842,7 @@ int remove_inode_buffers(struct inode *inode)
}
/*
- * Create the appropriate buffers when given a page for data area and
+ * Create the appropriate buffers when given a folio for data area and
* the size of each buffer.. Use the bh->b_this_page linked list to
* follow the buffers created. Return NULL if unable to create more
* buffers.
@@ -850,8 +850,8 @@ int remove_inode_buffers(struct inode *inode)
* The retry flag is used to differentiate async IO (paging, swapping)
* which may not fail from ordinary buffer allocations.
*/
-struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
- bool retry)
+struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size,
+ bool retry)
{
struct buffer_head *bh, *head;
gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
@@ -861,12 +861,12 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
if (retry)
gfp |= __GFP_NOFAIL;
- /* The page lock pins the memcg */
- memcg = page_memcg(page);
+ /* The folio lock pins the memcg */
+ memcg = folio_memcg(folio);
old_memcg = set_active_memcg(memcg);
head = NULL;
- offset = PAGE_SIZE;
+ offset = folio_size(folio);
while ((offset -= size) >= 0) {
bh = alloc_buffer_head(gfp);
if (!bh)
@@ -878,8 +878,8 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
bh->b_size = size;
- /* Link the buffer to its page */
- set_bh_page(bh, page, offset);
+ /* Link the buffer to its folio */
+ folio_set_bh(bh, folio, offset);
}
out:
set_active_memcg(old_memcg);
@@ -898,6 +898,13 @@ no_grow:
goto out;
}
+EXPORT_SYMBOL_GPL(folio_alloc_buffers);
+
+struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
+ bool retry)
+{
+ return folio_alloc_buffers(page_folio(page), size, retry);
+}
EXPORT_SYMBOL_GPL(alloc_page_buffers);
static inline void
@@ -1484,6 +1491,21 @@ void set_bh_page(struct buffer_head *bh,
}
EXPORT_SYMBOL(set_bh_page);
+void folio_set_bh(struct buffer_head *bh, struct folio *folio,
+ unsigned long offset)
+{
+ bh->b_folio = folio;
+ BUG_ON(offset >= folio_size(folio));
+ if (folio_test_highmem(folio))
+ /*
+ * This catches illegal uses and preserves the offset:
+ */
+ bh->b_data = (char *)(0 + offset);
+ else
+ bh->b_data = folio_address(folio) + offset;
+}
+EXPORT_SYMBOL(folio_set_bh);
+
/*
* Called when truncating a buffer on a page completely.
*/
@@ -1571,18 +1593,17 @@ out:
}
EXPORT_SYMBOL(block_invalidate_folio);
-
/*
* We attach and possibly dirty the buffers atomically wrt
* block_dirty_folio() via private_lock. try_to_free_buffers
- * is already excluded via the page lock.
+ * is already excluded via the folio lock.
*/
-void create_empty_buffers(struct page *page,
- unsigned long blocksize, unsigned long b_state)
+void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize,
+ unsigned long b_state)
{
struct buffer_head *bh, *head, *tail;
- head = alloc_page_buffers(page, blocksize, true);
+ head = folio_alloc_buffers(folio, blocksize, true);
bh = head;
do {
bh->b_state |= b_state;
@@ -1591,19 +1612,26 @@ void create_empty_buffers(struct page *page,
} while (bh);
tail->b_this_page = head;
- spin_lock(&page->mapping->private_lock);
- if (PageUptodate(page) || PageDirty(page)) {
+ spin_lock(&folio->mapping->private_lock);
+ if (folio_test_uptodate(folio) || folio_test_dirty(folio)) {
bh = head;
do {
- if (PageDirty(page))
+ if (folio_test_dirty(folio))
set_buffer_dirty(bh);
- if (PageUptodate(page))
+ if (folio_test_uptodate(folio))
set_buffer_uptodate(bh);
bh = bh->b_this_page;
} while (bh != head);
}
- attach_page_private(page, head);
- spin_unlock(&page->mapping->private_lock);
+ folio_attach_private(folio, head);
+ spin_unlock(&folio->mapping->private_lock);
+}
+EXPORT_SYMBOL(folio_create_empty_buffers);
+
+void create_empty_buffers(struct page *page,
+ unsigned long blocksize, unsigned long b_state)
+{
+ folio_create_empty_buffers(page_folio(page), blocksize, b_state);
}
EXPORT_SYMBOL(create_empty_buffers);
@@ -1694,14 +1722,17 @@ static inline int block_size_bits(unsigned int blocksize)
return ilog2(blocksize);
}
-static struct buffer_head *create_page_buffers(struct page *page, struct inode *inode, unsigned int b_state)
+static struct buffer_head *folio_create_buffers(struct folio *folio,
+ struct inode *inode,
+ unsigned int b_state)
{
- BUG_ON(!PageLocked(page));
+ BUG_ON(!folio_test_locked(folio));
- if (!page_has_buffers(page))
- create_empty_buffers(page, 1 << READ_ONCE(inode->i_blkbits),
- b_state);
- return page_buffers(page);
+ if (!folio_buffers(folio))
+ folio_create_empty_buffers(folio,
+ 1 << READ_ONCE(inode->i_blkbits),
+ b_state);
+ return folio_buffers(folio);
}
/*
@@ -1745,8 +1776,8 @@ int __block_write_full_page(struct inode *inode, struct page *page,
int nr_underway = 0;
blk_opf_t write_flags = wbc_to_write_flags(wbc);
- head = create_page_buffers(page, inode,
- (1 << BH_Dirty)|(1 << BH_Uptodate));
+ head = folio_create_buffers(page_folio(page), inode,
+ (1 << BH_Dirty) | (1 << BH_Uptodate));
/*
* Be very careful. We have no exclusion from block_dirty_folio
@@ -2009,7 +2040,7 @@ int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len,
BUG_ON(to > PAGE_SIZE);
BUG_ON(from > to);
- head = create_page_buffers(&folio->page, inode, 0);
+ head = folio_create_buffers(folio, inode, 0);
blocksize = head->b_size;
bbits = block_size_bits(blocksize);
@@ -2295,7 +2326,7 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block)
VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
- head = create_page_buffers(&folio->page, inode, 0);
+ head = folio_create_buffers(folio, inode, 0);
blocksize = head->b_size;
bbits = block_size_bits(blocksize);
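For filesystems still being converted, the folio-based replacement for the old create_page_buffers() idiom looks roughly like this sketch (the folio is assumed locked, as the BUG_ON in folio_create_buffers() above enforces):

static struct buffer_head *example_folio_buffers(struct folio *folio,
						 struct inode *inode)
{
	struct buffer_head *head = folio_buffers(folio);

	if (!head) {
		folio_create_empty_buffers(folio,
					   1 << READ_ONCE(inode->i_blkbits), 0);
		head = folio_buffers(folio);
	}
	return head;
}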
diff --git a/fs/exec.c b/fs/exec.c
index 7c44d0c65b1b..87cf3a2f0e9a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1034,7 +1034,7 @@ static int exec_mmap(struct mm_struct *mm)
mmput(old_mm);
return 0;
}
- mmdrop(active_mm);
+ mmdrop_lazy_tlb(active_mm);
return 0;
}
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index b9fb1177fff6..859bc4e2c9b0 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -566,9 +566,9 @@ retry:
* started */
folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS,
mapping_gfp_mask(mapping));
- if (!folio) {
- ret = -ENOMEM;
- goto out;
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
+ goto out_nofolio;
}
ext4_write_lock_xattr(inode, &no_expand);
@@ -633,6 +633,7 @@ out:
folio_unlock(folio);
folio_put(folio);
}
+out_nofolio:
if (sem_held)
ext4_write_unlock_xattr(inode, &no_expand);
if (handle)
@@ -693,8 +694,8 @@ int ext4_try_to_write_inline_data(struct address_space *mapping,
folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS,
mapping_gfp_mask(mapping));
- if (!folio) {
- ret = -ENOMEM;
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
goto out;
}
@@ -854,8 +855,8 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN,
mapping_gfp_mask(mapping));
- if (!folio)
- return -ENOMEM;
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
down_read(&EXT4_I(inode)->xattr_sem);
if (!ext4_has_inline_data(inode)) {
@@ -947,8 +948,8 @@ retry_journal:
*/
folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS,
mapping_gfp_mask(mapping));
- if (!folio) {
- ret = -ENOMEM;
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
goto out_journal;
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8dbd352e3986..ffbbd9626bd8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1162,8 +1162,8 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
retry_grab:
folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
mapping_gfp_mask(mapping));
- if (!folio)
- return -ENOMEM;
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
/*
* The same as page allocation, we prealloc buffer heads before
* starting the handle.
@@ -2906,8 +2906,8 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
retry:
folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
mapping_gfp_mask(mapping));
- if (!folio)
- return -ENOMEM;
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
/* In case writeback began while the folio was unlocked */
folio_wait_stable(folio);
@@ -3618,8 +3618,8 @@ static int __ext4_block_zero_page_range(handle_t *handle,
folio = __filemap_get_folio(mapping, from >> PAGE_SHIFT,
FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
mapping_gfp_constraint(mapping, ~__GFP_FS));
- if (!folio)
- return -ENOMEM;
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
blocksize = inode->i_sb->s_blocksize;
@@ -5201,7 +5201,7 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
while (1) {
struct folio *folio = filemap_lock_folio(inode->i_mapping,
inode->i_size >> PAGE_SHIFT);
- if (!folio)
+ if (IS_ERR(folio))
return;
ret = __ext4_journalled_invalidate_folio(folio, offset,
folio_size(folio) - offset);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index e509c22a21ed..b5af2fc03b2f 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -140,18 +140,18 @@ mext_folio_double_lock(struct inode *inode1, struct inode *inode2,
flags = memalloc_nofs_save();
folio[0] = __filemap_get_folio(mapping[0], index1, FGP_WRITEBEGIN,
mapping_gfp_mask(mapping[0]));
- if (!folio[0]) {
+ if (IS_ERR(folio[0])) {
memalloc_nofs_restore(flags);
- return -ENOMEM;
+ return PTR_ERR(folio[0]);
}
folio[1] = __filemap_get_folio(mapping[1], index2, FGP_WRITEBEGIN,
mapping_gfp_mask(mapping[1]));
memalloc_nofs_restore(flags);
- if (!folio[1]) {
+ if (IS_ERR(folio[1])) {
folio_unlock(folio[0]);
folio_put(folio[0]);
- return -ENOMEM;
+ return PTR_ERR(folio[1]);
}
/*
* __filemap_get_folio() may not wait on folio's writeback if
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index 3b01247066dd..2f37e1ea3955 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -366,14 +366,16 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode,
index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT;
folio = __filemap_get_folio(inode->i_mapping, index, FGP_ACCESSED, 0);
- if (!folio || !folio_test_uptodate(folio)) {
+ if (IS_ERR(folio) || !folio_test_uptodate(folio)) {
DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index);
- if (folio)
+ if (!IS_ERR(folio))
folio_put(folio);
else if (num_ra_pages > 1)
page_cache_ra_unbounded(&ractl, num_ra_pages, 0);
folio = read_mapping_folio(inode->i_mapping, index, NULL);
+ if (IS_ERR(folio))
+ return ERR_CAST(folio);
}
return folio_file_page(folio, index);
}
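Note the added IS_ERR() check in the verity hunk: previously a read_mapping_folio() failure fell straight through to folio_file_page() on an error pointer. The ERR_CAST() idiom simply re-types an error pointer, as in this sketch:

static struct page *example_read_page(struct address_space *mapping,
				      pgoff_t index)
{
	struct folio *folio = read_mapping_folio(mapping, index, NULL);

	if (IS_ERR(folio))
		return ERR_CAST(folio);	/* same errno, struct page * type */
	return folio_file_page(folio, index);
}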
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 9062da6da567..ecfdfb2529a3 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -208,7 +208,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr,
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
- info.low_limit = max(PAGE_SIZE, mmap_min_addr);
+ info.low_limit = PAGE_SIZE;
info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
info.align_offset = 0;
@@ -697,7 +697,7 @@ static void hugetlbfs_zero_partial_page(struct hstate *h,
struct folio *folio;
folio = filemap_lock_folio(mapping, idx);
- if (!folio)
+ if (IS_ERR(folio))
return;
start = start & ~huge_page_mask(h);
diff --git a/fs/inode.c b/fs/inode.c
index 3ec5a8f7b644..577799b7855f 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -864,8 +864,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
__count_vm_events(KSWAPD_INODESTEAL, reap);
else
__count_vm_events(PGINODESTEAL, reap);
- if (current->reclaim_state)
- current->reclaim_state->reclaimed_slab += reap;
+ mm_account_reclaimed_pages(reap);
}
iput(inode);
spin_lock(lru_lock);
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 10a203515583..063133ec77f4 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -468,19 +468,12 @@ EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos)
{
unsigned fgp = FGP_WRITEBEGIN | FGP_NOFS;
- struct folio *folio;
if (iter->flags & IOMAP_NOWAIT)
fgp |= FGP_NOWAIT;
- folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
+ return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
fgp, mapping_gfp_mask(iter->inode->i_mapping));
- if (folio)
- return folio;
-
- if (iter->flags & IOMAP_NOWAIT)
- return ERR_PTR(-EAGAIN);
- return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL_GPL(iomap_get_folio);
@@ -911,7 +904,7 @@ static int iomap_write_delalloc_scan(struct inode *inode,
/* grab locked page */
folio = filemap_lock_folio(inode->i_mapping,
start_byte >> PAGE_SHIFT);
- if (!folio) {
+ if (IS_ERR(folio)) {
start_byte = ALIGN_DOWN(start_byte, PAGE_SIZE) +
PAGE_SIZE;
continue;
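iomap_get_folio() can shrink to a tail call because __filemap_get_folio() now reports the failure reason itself, including -EAGAIN when FGP_NOWAIT is set. A caller sketch (names illustrative):

static int example_iomap_caller(struct iomap_iter *iter, loff_t pos)
{
	struct folio *folio = iomap_get_folio(iter, pos);

	if (IS_ERR(folio))
		return PTR_ERR(folio);	/* -EAGAIN under IOMAP_NOWAIT, else -ENOMEM */

	/* ... write to the locked folio ... */
	folio_unlock(folio);
	folio_put(folio);
	return 0;
}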
diff --git a/fs/mpage.c b/fs/mpage.c
index 22b9de5ddd68..242e213ee064 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -43,23 +43,49 @@
* status of that page is hard. See end_buffer_async_read() for the details.
* There is no point in duplicating all that complexity.
*/
-static void mpage_end_io(struct bio *bio)
+static void mpage_read_end_io(struct bio *bio)
{
- struct bio_vec *bv;
- struct bvec_iter_all iter_all;
+ struct folio_iter fi;
+ int err = blk_status_to_errno(bio->bi_status);
+
+ bio_for_each_folio_all(fi, bio) {
+ if (err)
+ folio_set_error(fi.folio);
+ else
+ folio_mark_uptodate(fi.folio);
+ folio_unlock(fi.folio);
+ }
+
+ bio_put(bio);
+}
- bio_for_each_segment_all(bv, bio, iter_all) {
- struct page *page = bv->bv_page;
- page_endio(page, bio_op(bio),
- blk_status_to_errno(bio->bi_status));
+static void mpage_write_end_io(struct bio *bio)
+{
+ struct folio_iter fi;
+ int err = blk_status_to_errno(bio->bi_status);
+
+ bio_for_each_folio_all(fi, bio) {
+ if (err) {
+ folio_set_error(fi.folio);
+ mapping_set_error(fi.folio->mapping, err);
+ }
+ folio_end_writeback(fi.folio);
}
bio_put(bio);
}
-static struct bio *mpage_bio_submit(struct bio *bio)
+static struct bio *mpage_bio_submit_read(struct bio *bio)
+{
+ bio->bi_end_io = mpage_read_end_io;
+ guard_bio_eod(bio);
+ submit_bio(bio);
+ return NULL;
+}
+
+static struct bio *mpage_bio_submit_write(struct bio *bio)
{
- bio->bi_end_io = mpage_end_io;
+ bio->bi_end_io = mpage_write_end_io;
guard_bio_eod(bio);
submit_bio(bio);
return NULL;
@@ -265,7 +291,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
* This folio will go to BIO. Do we need to send this BIO off first?
*/
if (args->bio && (args->last_block_in_bio != blocks[0] - 1))
- args->bio = mpage_bio_submit(args->bio);
+ args->bio = mpage_bio_submit_read(args->bio);
alloc_new:
if (args->bio == NULL) {
@@ -278,7 +304,7 @@ alloc_new:
length = first_hole << blkbits;
if (!bio_add_folio(args->bio, folio, length, 0)) {
- args->bio = mpage_bio_submit(args->bio);
+ args->bio = mpage_bio_submit_read(args->bio);
goto alloc_new;
}
@@ -286,7 +312,7 @@ alloc_new:
nblocks = map_bh->b_size >> blkbits;
if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
(first_hole != blocks_per_page))
- args->bio = mpage_bio_submit(args->bio);
+ args->bio = mpage_bio_submit_read(args->bio);
else
args->last_block_in_bio = blocks[blocks_per_page - 1];
out:
@@ -294,7 +320,7 @@ out:
confused:
if (args->bio)
- args->bio = mpage_bio_submit(args->bio);
+ args->bio = mpage_bio_submit_read(args->bio);
if (!folio_test_uptodate(folio))
block_read_full_folio(folio, args->get_block);
else
@@ -356,7 +382,7 @@ void mpage_readahead(struct readahead_control *rac, get_block_t get_block)
args.bio = do_mpage_readpage(&args);
}
if (args.bio)
- mpage_bio_submit(args.bio);
+ mpage_bio_submit_read(args.bio);
}
EXPORT_SYMBOL(mpage_readahead);
@@ -373,7 +399,7 @@ int mpage_read_folio(struct folio *folio, get_block_t get_block)
args.bio = do_mpage_readpage(&args);
if (args.bio)
- mpage_bio_submit(args.bio);
+ mpage_bio_submit_read(args.bio);
return 0;
}
EXPORT_SYMBOL(mpage_read_folio);
@@ -577,7 +603,7 @@ page_is_mapped:
* This page will go to BIO. Do we need to send this BIO off first?
*/
if (bio && mpd->last_block_in_bio != blocks[0] - 1)
- bio = mpage_bio_submit(bio);
+ bio = mpage_bio_submit_write(bio);
alloc_new:
if (bio == NULL) {
@@ -596,7 +622,7 @@ alloc_new:
wbc_account_cgroup_owner(wbc, &folio->page, folio_size(folio));
length = first_unmapped << blkbits;
if (!bio_add_folio(bio, folio, length, 0)) {
- bio = mpage_bio_submit(bio);
+ bio = mpage_bio_submit_write(bio);
goto alloc_new;
}
@@ -606,7 +632,7 @@ alloc_new:
folio_start_writeback(folio);
folio_unlock(folio);
if (boundary || (first_unmapped != blocks_per_page)) {
- bio = mpage_bio_submit(bio);
+ bio = mpage_bio_submit_write(bio);
if (boundary_block) {
write_boundary_block(boundary_bdev,
boundary_block, 1 << blkbits);
@@ -618,7 +644,7 @@ alloc_new:
confused:
if (bio)
- bio = mpage_bio_submit(bio);
+ bio = mpage_bio_submit_write(bio);
/*
* The caller has a ref on the inode, so *mapping is stable
@@ -652,7 +678,7 @@ mpage_writepages(struct address_space *mapping,
blk_start_plug(&plug);
ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
if (mpd.bio)
- mpage_bio_submit(mpd.bio);
+ mpage_bio_submit_write(mpd.bio);
blk_finish_plug(&plug);
return ret;
}
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index e3d754a9e1b0..3404707ddbe7 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -349,8 +349,8 @@ int netfs_write_begin(struct netfs_inode *ctx,
retry:
folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
mapping_gfp_mask(mapping));
- if (!folio)
- return -ENOMEM;
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
if (ctx->ops->check_write_begin) {
/* Allow the netfs (eg. ceph) to flush conflicts. */
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 2474cbc30712..f0edf5a36237 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -328,8 +328,8 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
start:
folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, FGP_WRITEBEGIN,
mapping_gfp_mask(mapping));
- if (!folio)
- return -ENOMEM;
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
*pagep = &folio->page;
ret = nfs_flush_incompatible(file, folio);
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 41ccd43cd979..5cf30827f244 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -259,10 +259,10 @@ repeat:
NILFS_PAGE_BUG(&folio->page, "inconsistent dirty state");
dfolio = filemap_grab_folio(dmap, folio->index);
- if (unlikely(!dfolio)) {
+ if (unlikely(IS_ERR(dfolio))) {
/* No empty page is added to the page cache */
- err = -ENOMEM;
folio_unlock(folio);
+ err = PTR_ERR(dfolio);
break;
}
if (unlikely(!folio_buffers(folio)))
@@ -311,7 +311,7 @@ repeat:
folio_lock(folio);
dfolio = filemap_lock_folio(dmap, index);
- if (dfolio) {
+ if (!IS_ERR(dfolio)) {
/* overwrite existing folio in the destination cache */
WARN_ON(folio_test_dirty(dfolio));
nilfs_copy_page(&dfolio->page, &folio->page, 0);
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index aefdf1d3be7c..9014bbcc8031 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -244,7 +244,7 @@ static void orangefs_readahead(struct readahead_control *rac)
struct iov_iter iter;
struct inode *inode = rac->mapping->host;
struct xarray *i_pages;
- struct page *page;
+ struct folio *folio;
loff_t new_start = readahead_pos(rac);
int ret;
size_t new_len = 0;
@@ -275,9 +275,10 @@ static void orangefs_readahead(struct readahead_control *rac)
ret = 0;
/* clean up. */
- while ((page = readahead_page(rac))) {
- page_endio(page, false, ret);
- put_page(page);
+ while ((folio = readahead_folio(rac))) {
+ if (!ret)
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
}
}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 5e0e0ccd47aa..96a6a08c8235 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -96,6 +96,7 @@
#include <linux/time_namespace.h>
#include <linux/resctrl.h>
#include <linux/cn_proc.h>
+#include <linux/ksm.h>
#include <trace/events/oom.h>
#include "internal.h"
#include "fd.h"
@@ -3207,6 +3208,8 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns,
mm = get_task_mm(task);
if (mm) {
seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items);
+ seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages);
+ seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm));
mmput(mm);
}
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 71157ee35c1a..25b44b303b35 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -24,7 +24,7 @@
#include <linux/memblock.h>
#include <linux/init.h>
#include <linux/slab.h>
-#include <linux/uaccess.h>
+#include <linux/uio.h>
#include <asm/io.h>
#include <linux/list.h>
#include <linux/ioport.h>
@@ -307,10 +307,9 @@ static void append_kcore_note(char *notes, size_t *i, const char *name,
*i = ALIGN(*i + descsz, 4);
}
-static ssize_t
-read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
+static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
{
- char *buf = file->private_data;
+ loff_t *fpos = &iocb->ki_pos;
size_t phdrs_offset, notes_offset, data_offset;
size_t page_offline_frozen = 1;
size_t phdrs_len, notes_len;
@@ -318,6 +317,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
size_t tsz;
int nphdr;
unsigned long start;
+ size_t buflen = iov_iter_count(iter);
size_t orig_buflen = buflen;
int ret = 0;
@@ -356,12 +356,11 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
};
tsz = min_t(size_t, buflen, sizeof(struct elfhdr) - *fpos);
- if (copy_to_user(buffer, (char *)&ehdr + *fpos, tsz)) {
+ if (copy_to_iter((char *)&ehdr + *fpos, tsz, iter) != tsz) {
ret = -EFAULT;
goto out;
}
- buffer += tsz;
buflen -= tsz;
*fpos += tsz;
}
@@ -398,15 +397,14 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
}
tsz = min_t(size_t, buflen, phdrs_offset + phdrs_len - *fpos);
- if (copy_to_user(buffer, (char *)phdrs + *fpos - phdrs_offset,
- tsz)) {
+ if (copy_to_iter((char *)phdrs + *fpos - phdrs_offset, tsz,
+ iter) != tsz) {
kfree(phdrs);
ret = -EFAULT;
goto out;
}
kfree(phdrs);
- buffer += tsz;
buflen -= tsz;
*fpos += tsz;
}
@@ -448,14 +446,13 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
min(vmcoreinfo_size, notes_len - i));
tsz = min_t(size_t, buflen, notes_offset + notes_len - *fpos);
- if (copy_to_user(buffer, notes + *fpos - notes_offset, tsz)) {
+ if (copy_to_iter(notes + *fpos - notes_offset, tsz, iter) != tsz) {
kfree(notes);
ret = -EFAULT;
goto out;
}
kfree(notes);
- buffer += tsz;
buflen -= tsz;
*fpos += tsz;
}
@@ -497,7 +494,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
}
if (!m) {
- if (clear_user(buffer, tsz)) {
+ if (iov_iter_zero(tsz, iter) != tsz) {
ret = -EFAULT;
goto out;
}
@@ -506,16 +503,33 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
switch (m->type) {
case KCORE_VMALLOC:
- vread(buf, (char *)start, tsz);
- /* we have to zero-fill user buffer even if no read */
- if (copy_to_user(buffer, buf, tsz)) {
- ret = -EFAULT;
- goto out;
+ {
+ const char *src = (char *)start;
+ size_t read = 0, left = tsz;
+
+ /*
+ * vmalloc uses spinlocks, so we optimistically try to
+ * read memory. If this fails, fault pages in and try
+ * again until we are done.
+ */
+ while (true) {
+ read += vread_iter(iter, src, left);
+ if (read == tsz)
+ break;
+
+ src += read;
+ left -= read;
+
+ if (fault_in_iov_iter_writeable(iter, left)) {
+ ret = -EFAULT;
+ goto out;
+ }
}
break;
+ }
case KCORE_USER:
/* User page is handled prior to normal kernel page: */
- if (copy_to_user(buffer, (char *)start, tsz)) {
+ if (copy_to_iter((char *)start, tsz, iter) != tsz) {
ret = -EFAULT;
goto out;
}
@@ -531,7 +545,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
*/
if (!page || PageOffline(page) ||
is_page_hwpoison(page) || !pfn_is_ram(pfn)) {
- if (clear_user(buffer, tsz)) {
+ if (iov_iter_zero(tsz, iter) != tsz) {
ret = -EFAULT;
goto out;
}
@@ -541,24 +555,17 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
case KCORE_VMEMMAP:
case KCORE_TEXT:
/*
- * Using bounce buffer to bypass the
- * hardened user copy kernel text checks.
+ * We use _copy_to_iter() to bypass usermode hardening
+ * which would otherwise prevent this operation.
*/
- if (copy_from_kernel_nofault(buf, (void *)start, tsz)) {
- if (clear_user(buffer, tsz)) {
- ret = -EFAULT;
- goto out;
- }
- } else {
- if (copy_to_user(buffer, buf, tsz)) {
- ret = -EFAULT;
- goto out;
- }
+ if (_copy_to_iter((char *)start, tsz, iter) != tsz) {
+ ret = -EFAULT;
+ goto out;
}
break;
default:
pr_warn_once("Unhandled KCORE type: %d\n", m->type);
- if (clear_user(buffer, tsz)) {
+ if (iov_iter_zero(tsz, iter) != tsz) {
ret = -EFAULT;
goto out;
}
@@ -566,7 +573,6 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
skip:
buflen -= tsz;
*fpos += tsz;
- buffer += tsz;
start += tsz;
tsz = (buflen > PAGE_SIZE ? PAGE_SIZE : buflen);
}
@@ -589,10 +595,6 @@ static int open_kcore(struct inode *inode, struct file *filp)
if (ret)
return ret;
- filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL);
- if (!filp->private_data)
- return -ENOMEM;
-
if (kcore_need_update)
kcore_update_ram();
if (i_size_read(inode) != proc_root_kcore->size) {
@@ -603,16 +605,9 @@ static int open_kcore(struct inode *inode, struct file *filp)
return 0;
}
-static int release_kcore(struct inode *inode, struct file *file)
-{
- kfree(file->private_data);
- return 0;
-}
-
static const struct proc_ops kcore_proc_ops = {
- .proc_read = read_kcore,
+ .proc_read_iter = read_kcore_iter,
.proc_open = open_kcore,
- .proc_release = release_kcore,
.proc_lseek = default_llseek,
};
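The kcore conversion is an instance of the generic ->read_iter() pattern: data is pushed through copy_to_iter(), holes through iov_iter_zero(), and the per-open bounce buffer disappears. A simplified sketch of the shape (illustrative, not the kcore code):

static const char example_data[] = "hello from read_iter\n";

static ssize_t example_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	size_t avail = sizeof(example_data);
	size_t want;

	if (iocb->ki_pos >= avail)
		return 0;
	want = min_t(size_t, iov_iter_count(iter), avail - iocb->ki_pos);
	/* a short copy means the user buffer faulted */
	if (copy_to_iter(example_data + iocb->ki_pos, want, iter) != want)
		return -EFAULT;
	iocb->ki_pos += want;
	return want;
}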
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 440960110a42..b43d0bd42762 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -6,6 +6,7 @@
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/mmzone.h>
+#include <linux/memblock.h>
#include <linux/proc_fs.h>
#include <linux/percpu.h>
#include <linux/seq_file.h>
@@ -131,6 +132,18 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "VmallocChunk: ", 0ul);
show_val_kb(m, "Percpu: ", pcpu_nr_pages());
+#ifdef CONFIG_MEMTEST
+ if (early_memtest_done) {
+ unsigned long early_memtest_bad_size_kb;
+
+ early_memtest_bad_size_kb = early_memtest_bad_size>>10;
+ if (early_memtest_bad_size && !early_memtest_bad_size_kb)
+ early_memtest_bad_size_kb = 1;
+ /* When 0 is reported, it means there actually was a successful test */
+ seq_printf(m, "EarlyMemtestBad: %5lu kB\n", early_memtest_bad_size_kb);
+ }
+#endif
+
#ifdef CONFIG_MEMORY_FAILURE
seq_printf(m, "HardwareCorrupted: %5lu kB\n",
atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10));
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 6a96e1713fd5..cb49479acd2e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -782,7 +782,6 @@ static void smap_gather_stats(struct vm_area_struct *vma,
if (start >= vma->vm_end)
return;
-#ifdef CONFIG_SHMEM
if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
/*
* For shared or readonly shmem mappings we know that all
@@ -803,7 +802,7 @@ static void smap_gather_stats(struct vm_area_struct *vma,
ops = &smaps_shmem_walk_ops;
}
}
-#endif
+
/* mmap_lock is held in m_start */
if (!start)
walk_page_vma(vma, ops, mss);
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 2f67516bb9bf..9fbb9b5256f7 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -70,7 +70,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
/* make various checks */
order = get_order(newsize);
- if (unlikely(order >= MAX_ORDER))
+ if (unlikely(order > MAX_ORDER))
return -EFBIG;
ret = inode_newsize_ok(inode, newsize);
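This one-character change reflects Kirill Shutemov's MAX_ORDER redefinition from the summary above: the constant is now inclusive, i.e. the largest valid allocation order rather than one past it. In sketch form:

static struct folio *example_alloc(unsigned int order)
{
	/* Old meaning: valid orders were 0 .. MAX_ORDER - 1.
	 * New meaning: valid orders are  0 .. MAX_ORDER. */
	if (order > MAX_ORDER)			/* was: order >= MAX_ORDER */
		return NULL;
	return folio_alloc(GFP_KERNEL, order);	/* order == MAX_ORDER is now legal */
}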
diff --git a/fs/super.c b/fs/super.c
index 04bc62ab7dfe..34afe411cf2b 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -54,7 +54,7 @@ static char *sb_writers_name[SB_FREEZE_LEVELS] = {
* One thing we have to be careful of with a per-sb shrinker is that we don't
* drop the last active reference to the superblock from within the shrinker.
* If that happens we could trigger unregistering the shrinker from within the
- * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we
+ * shrinker path and that leads to deadlock on the shrinker_mutex. Hence we
* take a passive reference to the superblock to avoid this from occurring.
*/
static unsigned long super_cache_scan(struct shrinker *shrink,
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 28e942feacc8..0fd96d6e39ce 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -123,6 +123,21 @@ static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
return ctx->features & UFFD_FEATURE_INITIALIZED;
}
+/*
+ * Whether WP_UNPOPULATED is enabled on the uffd context. It is only
+ * meaningful when userfaultfd_wp()==true on the vma and when it's
+ * anonymous.
+ */
+bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)
+{
+ struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
+
+ if (!ctx)
+ return false;
+
+ return ctx->features & UFFD_FEATURE_WP_UNPOPULATED;
+}
+
static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
vm_flags_t flags)
{
@@ -1644,7 +1659,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
/* Reset ptes for the whole vma range if wr-protected */
if (userfaultfd_wp(vma))
- uffd_wp_range(mm, vma, start, vma_end - start, false);
+ uffd_wp_range(vma, start, vma_end - start, false);
new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
@@ -1729,6 +1744,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
struct uffdio_copy uffdio_copy;
struct uffdio_copy __user *user_uffdio_copy;
struct userfaultfd_wake_range range;
+ uffd_flags_t flags = 0;
user_uffdio_copy = (struct uffdio_copy __user *) arg;
@@ -1755,10 +1771,12 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
goto out;
if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
goto out;
+ if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
+ flags |= MFILL_ATOMIC_WP;
if (mmget_not_zero(ctx->mm)) {
- ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
- uffdio_copy.len, &ctx->mmap_changing,
- uffdio_copy.mode);
+ ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
+ uffdio_copy.len, &ctx->mmap_changing,
+ flags);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -1808,9 +1826,9 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
goto out;
if (mmget_not_zero(ctx->mm)) {
- ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
- uffdio_zeropage.range.len,
- &ctx->mmap_changing);
+ ret = mfill_atomic_zeropage(ctx->mm, uffdio_zeropage.range.start,
+ uffdio_zeropage.range.len,
+ &ctx->mmap_changing);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -1890,6 +1908,7 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
struct uffdio_continue uffdio_continue;
struct uffdio_continue __user *user_uffdio_continue;
struct userfaultfd_wake_range range;
+ uffd_flags_t flags = 0;
user_uffdio_continue = (struct uffdio_continue __user *)arg;
@@ -1914,13 +1933,16 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
uffdio_continue.range.start) {
goto out;
}
- if (uffdio_continue.mode & ~UFFDIO_CONTINUE_MODE_DONTWAKE)
+ if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE |
+ UFFDIO_CONTINUE_MODE_WP))
goto out;
+ if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WP)
+ flags |= MFILL_ATOMIC_WP;
if (mmget_not_zero(ctx->mm)) {
- ret = mcopy_continue(ctx->mm, uffdio_continue.range.start,
- uffdio_continue.range.len,
- &ctx->mmap_changing);
+ ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start,
+ uffdio_continue.range.len,
+ &ctx->mmap_changing, flags);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -1988,6 +2010,7 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
#endif
#ifndef CONFIG_PTE_MARKER_UFFD_WP
uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
+ uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
#endif
uffdio_api.ioctls = UFFD_API_IOCTLS;
ret = -EFAULT;
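From userspace, the new UFFDIO_CONTINUE_MODE_WP mode resolves a minor fault while leaving the PTE write-protected, so a subsequent write still traps. A sketch, assuming uffd, addr and len come from an already-established userfaultfd registration:

#include <sys/ioctl.h>
#include <linux/userfaultfd.h>

static int example_continue_wp(int uffd, unsigned long addr, unsigned long len)
{
	struct uffdio_continue uc = {
		.range = { .start = addr, .len = len },
		.mode  = UFFDIO_CONTINUE_MODE_WP,	/* new in this pull */
	};
	return ioctl(uffd, UFFDIO_CONTINUE, &uc);
}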
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 54c774af6e1c..15d1e5a7c2d3 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -286,8 +286,7 @@ xfs_buf_free_pages(
if (bp->b_pages[i])
__free_page(bp->b_pages[i]);
}
- if (current->reclaim_state)
- current->reclaim_state->reclaimed_slab += bp->b_page_count;
+ mm_account_reclaimed_pages(bp->b_page_count);
if (bp->b_pages != bp->b_page_array)
kmem_free(bp->b_pages);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 863289aaa441..aede746541f8 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1389,25 +1389,10 @@ xfs_filemap_pfn_mkwrite(
return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
}
-static vm_fault_t
-xfs_filemap_map_pages(
- struct vm_fault *vmf,
- pgoff_t start_pgoff,
- pgoff_t end_pgoff)
-{
- struct inode *inode = file_inode(vmf->vma->vm_file);
- vm_fault_t ret;
-
- xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- ret = filemap_map_pages(vmf, start_pgoff, end_pgoff);
- xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- return ret;
-}
-
static const struct vm_operations_struct xfs_file_vm_ops = {
.fault = xfs_filemap_fault,
.huge_fault = xfs_filemap_huge_fault,
- .map_pages = xfs_filemap_map_pages,
+ .map_pages = filemap_map_pages,
.page_mkwrite = xfs_filemap_page_mkwrite,
.pfn_mkwrite = xfs_filemap_pfn_mkwrite,
};