summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/cache.c2
-rw-r--r--fs/Kconfig7
-rw-r--r--fs/affs/file.c77
-rw-r--r--fs/affs/symlink.c12
-rw-r--r--fs/afs/internal.h2
-rw-r--r--fs/btrfs/misc.h2
-rw-r--r--fs/buffer.c36
-rw-r--r--fs/cachefiles/namei.c2
-rw-r--r--fs/ceph/cache.c2
-rw-r--r--fs/dax.c33
-rw-r--r--fs/drop_caches.c2
-rw-r--r--fs/erofs/data.c6
-rw-r--r--fs/exec.c1
-rw-r--r--fs/ext2/balloc.c2
-rw-r--r--fs/ext2/file.c2
-rw-r--r--fs/ext4/ext4.h2
-rw-r--r--fs/ext4/file.c11
-rw-r--r--fs/ext4/inode.c4
-rw-r--r--fs/ext4/move_extent.c19
-rw-r--r--fs/fuse/dax.c20
-rw-r--r--fs/hugetlbfs/inode.c57
-rw-r--r--fs/jbd2/journal.c35
-rw-r--r--fs/nfs/fscache.c3
-rw-r--r--fs/ntfs3/inode.c10
-rw-r--r--fs/ocfs2/file.c7
-rw-r--r--fs/proc/base.c1
-rw-r--r--fs/proc/meminfo.c13
-rw-r--r--fs/proc/task_mmu.c26
-rw-r--r--fs/proc/task_nommu.c15
-rw-r--r--fs/smb/client/fscache.c2
-rw-r--r--fs/splice.c3
-rw-r--r--fs/udf/file.c6
-rw-r--r--fs/ufs/util.h6
-rw-r--r--fs/userfaultfd.c140
-rw-r--r--fs/xfs/xfs_file.c24
-rw-r--r--fs/xfs/xfs_trace.h20
36 files changed, 306 insertions, 306 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index cebba4eaa0b5..12c0ae29f185 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -68,6 +68,8 @@ void v9fs_cache_inode_get_cookie(struct inode *inode)
&path, sizeof(path),
&version, sizeof(version),
i_size_read(&v9inode->netfs.inode));
+ if (v9inode->netfs.cache)
+ mapping_set_release_always(inode->i_mapping);
p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n",
inode, v9fs_inode_cookie(v9inode));
diff --git a/fs/Kconfig b/fs/Kconfig
index 7da21f563192..4f8bd14df0df 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -169,6 +169,7 @@ source "fs/sysfs/Kconfig"
config TMPFS
bool "Tmpfs virtual memory file system support (former shm fs)"
depends on SHMEM
+ select MEMFD_CREATE
help
Tmpfs is a file system which keeps all files in virtual memory.
@@ -252,6 +253,7 @@ config HUGETLBFS
bool "HugeTLB file system support"
depends on X86 || IA64 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN
depends on (SYSFS || SYSCTL)
+ select MEMFD_CREATE
help
hugetlbfs is a filesystem backing for HugeTLB pages, based on
ramfs. For architectures that support it, say Y here and read
@@ -264,7 +266,7 @@ config HUGETLB_PAGE
config HUGETLB_PAGE_OPTIMIZE_VMEMMAP
def_bool HUGETLB_PAGE
- depends on ARCH_WANT_OPTIMIZE_VMEMMAP
+ depends on ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
depends on SPARSEMEM_VMEMMAP
config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON
@@ -276,9 +278,6 @@ config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON
enable HVO by default. It can be disabled via hugetlb_free_vmemmap=off
(boot command line) or hugetlb_optimize_vmemmap (sysctl).
-config MEMFD_CREATE
- def_bool TMPFS || HUGETLBFS
-
config ARCH_HAS_GIGANTIC_PAGE
bool
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 472e2bdd5349..04c018e19602 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -523,21 +523,20 @@ affs_getemptyblk_ino(struct inode *inode, int block)
return ERR_PTR(err);
}
-static int
-affs_do_readpage_ofs(struct page *page, unsigned to, int create)
+static int affs_do_read_folio_ofs(struct folio *folio, size_t to, int create)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct super_block *sb = inode->i_sb;
struct buffer_head *bh;
- unsigned pos = 0;
- u32 bidx, boff, bsize;
+ size_t pos = 0;
+ size_t bidx, boff, bsize;
u32 tmp;
- pr_debug("%s(%lu, %ld, 0, %d)\n", __func__, inode->i_ino,
- page->index, to);
- BUG_ON(to > PAGE_SIZE);
+ pr_debug("%s(%lu, %ld, 0, %zu)\n", __func__, inode->i_ino,
+ folio->index, to);
+ BUG_ON(to > folio_size(folio));
bsize = AFFS_SB(sb)->s_data_blksize;
- tmp = page->index << PAGE_SHIFT;
+ tmp = folio_pos(folio);
bidx = tmp / bsize;
boff = tmp % bsize;
@@ -547,7 +546,7 @@ affs_do_readpage_ofs(struct page *page, unsigned to, int create)
return PTR_ERR(bh);
tmp = min(bsize - boff, to - pos);
BUG_ON(pos + tmp > to || tmp > bsize);
- memcpy_to_page(page, pos, AFFS_DATA(bh) + boff, tmp);
+ memcpy_to_folio(folio, pos, AFFS_DATA(bh) + boff, tmp);
affs_brelse(bh);
bidx++;
pos += tmp;
@@ -627,25 +626,23 @@ out:
return PTR_ERR(bh);
}
-static int
-affs_read_folio_ofs(struct file *file, struct folio *folio)
+static int affs_read_folio_ofs(struct file *file, struct folio *folio)
{
- struct page *page = &folio->page;
- struct inode *inode = page->mapping->host;
- u32 to;
+ struct inode *inode = folio->mapping->host;
+ size_t to;
int err;
- pr_debug("%s(%lu, %ld)\n", __func__, inode->i_ino, page->index);
- to = PAGE_SIZE;
- if (((page->index + 1) << PAGE_SHIFT) > inode->i_size) {
- to = inode->i_size & ~PAGE_MASK;
- memset(page_address(page) + to, 0, PAGE_SIZE - to);
+ pr_debug("%s(%lu, %ld)\n", __func__, inode->i_ino, folio->index);
+ to = folio_size(folio);
+ if (folio_pos(folio) + to > inode->i_size) {
+ to = inode->i_size - folio_pos(folio);
+ folio_zero_segment(folio, to, folio_size(folio));
}
- err = affs_do_readpage_ofs(page, to, 0);
+ err = affs_do_read_folio_ofs(folio, to, 0);
if (!err)
- SetPageUptodate(page);
- unlock_page(page);
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
return err;
}
@@ -654,7 +651,7 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
struct page **pagep, void **fsdata)
{
struct inode *inode = mapping->host;
- struct page *page;
+ struct folio *folio;
pgoff_t index;
int err = 0;
@@ -670,19 +667,20 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
}
index = pos >> PAGE_SHIFT;
- page = grab_cache_page_write_begin(mapping, index);
- if (!page)
- return -ENOMEM;
- *pagep = page;
+ folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
+ mapping_gfp_mask(mapping));
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
+ *pagep = &folio->page;
- if (PageUptodate(page))
+ if (folio_test_uptodate(folio))
return 0;
/* XXX: inefficient but safe in the face of short writes */
- err = affs_do_readpage_ofs(page, PAGE_SIZE, 1);
+ err = affs_do_read_folio_ofs(folio, folio_size(folio), 1);
if (err) {
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
}
return err;
}
@@ -691,6 +689,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
+ struct folio *folio = page_folio(page);
struct inode *inode = mapping->host;
struct super_block *sb = inode->i_sb;
struct buffer_head *bh, *prev_bh;
@@ -704,18 +703,18 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
to = from + len;
/*
* XXX: not sure if this can handle short copies (len < copied), but
- * we don't have to, because the page should always be uptodate here,
+ * we don't have to, because the folio should always be uptodate here,
* due to write_begin.
*/
pr_debug("%s(%lu, %llu, %llu)\n", __func__, inode->i_ino, pos,
pos + len);
bsize = AFFS_SB(sb)->s_data_blksize;
- data = page_address(page);
+ data = folio_address(folio);
bh = NULL;
written = 0;
- tmp = (page->index << PAGE_SHIFT) + from;
+ tmp = (folio->index << PAGE_SHIFT) + from;
bidx = tmp / bsize;
boff = tmp % bsize;
if (boff) {
@@ -807,11 +806,11 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
from += tmp;
bidx++;
}
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
done:
affs_brelse(bh);
- tmp = (page->index << PAGE_SHIFT) + from;
+ tmp = (folio->index << PAGE_SHIFT) + from;
if (tmp > inode->i_size)
inode->i_size = AFFS_I(inode)->mmu_private = tmp;
@@ -822,8 +821,8 @@ done:
}
err_first_bh:
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return written;
diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c
index 31d6446dc166..094aec8d17b8 100644
--- a/fs/affs/symlink.c
+++ b/fs/affs/symlink.c
@@ -13,10 +13,9 @@
static int affs_symlink_read_folio(struct file *file, struct folio *folio)
{
- struct page *page = &folio->page;
struct buffer_head *bh;
- struct inode *inode = page->mapping->host;
- char *link = page_address(page);
+ struct inode *inode = folio->mapping->host;
+ char *link = folio_address(folio);
struct slink_front *lf;
int i, j;
char c;
@@ -58,12 +57,11 @@ static int affs_symlink_read_folio(struct file *file, struct folio *folio)
}
link[i] = '\0';
affs_brelse(bh);
- SetPageUptodate(page);
- unlock_page(page);
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
return 0;
fail:
- SetPageError(page);
- unlock_page(page);
+ folio_unlock(folio);
return -EIO;
}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 9d3d64921106..da73b97e19a9 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -681,6 +681,8 @@ static inline void afs_vnode_set_cache(struct afs_vnode *vnode,
{
#ifdef CONFIG_AFS_FSCACHE
vnode->netfs.cache = cookie;
+ if (cookie)
+ mapping_set_release_always(vnode->netfs.inode.i_mapping);
#endif
}
diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h
index 005751a12911..40f2d9f1a17a 100644
--- a/fs/btrfs/misc.h
+++ b/fs/btrfs/misc.h
@@ -8,8 +8,6 @@
#include <linux/math64.h>
#include <linux/rbtree.h>
-#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len))
-
/*
* Enumerate bits using enum autoincrement. Define the @name as the n-th bit.
*/
diff --git a/fs/buffer.c b/fs/buffer.c
index 084a6ade108a..0f17c36922e6 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1539,21 +1539,6 @@ void invalidate_bh_lrus_cpu(void)
bh_lru_unlock();
}
-void set_bh_page(struct buffer_head *bh,
- struct page *page, unsigned long offset)
-{
- bh->b_page = page;
- BUG_ON(offset >= PAGE_SIZE);
- if (PageHighMem(page))
- /*
- * This catches illegal uses and preserves the offset:
- */
- bh->b_data = (char *)(0 + offset);
- else
- bh->b_data = page_address(page) + offset;
-}
-EXPORT_SYMBOL(set_bh_page);
-
void folio_set_bh(struct buffer_head *bh, struct folio *folio,
unsigned long offset)
{
@@ -2180,8 +2165,7 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
}
EXPORT_SYMBOL(__block_write_begin);
-static int __block_commit_write(struct inode *inode, struct folio *folio,
- size_t from, size_t to)
+static void __block_commit_write(struct folio *folio, size_t from, size_t to)
{
size_t block_start, block_end;
bool partial = false;
@@ -2216,7 +2200,6 @@ static int __block_commit_write(struct inode *inode, struct folio *folio,
*/
if (!partial)
folio_mark_uptodate(folio);
- return 0;
}
/*
@@ -2253,7 +2236,6 @@ int block_write_end(struct file *file, struct address_space *mapping,
struct page *page, void *fsdata)
{
struct folio *folio = page_folio(page);
- struct inode *inode = mapping->host;
size_t start = pos - folio_pos(folio);
if (unlikely(copied < len)) {
@@ -2277,7 +2259,7 @@ int block_write_end(struct file *file, struct address_space *mapping,
flush_dcache_folio(folio);
/* This could be a short (even 0-length) commit */
- __block_commit_write(inode, folio, start, start + copied);
+ __block_commit_write(folio, start, start + copied);
return copied;
}
@@ -2598,12 +2580,10 @@ int cont_write_begin(struct file *file, struct address_space *mapping,
}
EXPORT_SYMBOL(cont_write_begin);
-int block_commit_write(struct page *page, unsigned from, unsigned to)
+void block_commit_write(struct page *page, unsigned from, unsigned to)
{
struct folio *folio = page_folio(page);
- struct inode *inode = folio->mapping->host;
- __block_commit_write(inode, folio, from, to);
- return 0;
+ __block_commit_write(folio, from, to);
}
EXPORT_SYMBOL(block_commit_write);
@@ -2649,11 +2629,11 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
end = size - folio_pos(folio);
ret = __block_write_begin_int(folio, 0, end, get_block, NULL);
- if (!ret)
- ret = __block_commit_write(inode, folio, 0, end);
-
- if (unlikely(ret < 0))
+ if (unlikely(ret))
goto out_unlock;
+
+ __block_commit_write(folio, 0, end);
+
folio_mark_dirty(folio);
folio_wait_stable(folio);
return 0;
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index d9d22d0ec38a..7bf7a5fcc045 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -585,6 +585,8 @@ static bool cachefiles_open_file(struct cachefiles_object *object,
if (ret < 0)
goto check_failed;
+ clear_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &object->cookie->flags);
+
object->file = file;
/* Always update the atime on an object we've just looked up (this is
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 177d8e8d73fe..de1dee46d3df 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -36,6 +36,8 @@ void ceph_fscache_register_inode_cookie(struct inode *inode)
&ci->i_vino, sizeof(ci->i_vino),
&ci->i_version, sizeof(ci->i_version),
i_size_read(inode));
+ if (ci->netfs.cache)
+ mapping_set_release_always(inode->i_mapping);
}
void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info *ci)
diff --git a/fs/dax.c b/fs/dax.c
index 906ecbd541a3..8fafecbe42b1 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -30,17 +30,6 @@
#define CREATE_TRACE_POINTS
#include <trace/events/fs_dax.h>
-static inline unsigned int pe_order(enum page_entry_size pe_size)
-{
- if (pe_size == PE_SIZE_PTE)
- return PAGE_SHIFT - PAGE_SHIFT;
- if (pe_size == PE_SIZE_PMD)
- return PMD_SHIFT - PAGE_SHIFT;
- if (pe_size == PE_SIZE_PUD)
- return PUD_SHIFT - PAGE_SHIFT;
- return ~0;
-}
-
/* We choose 4096 entries - same as per-zone page wait tables */
#define DAX_WAIT_TABLE_BITS 12
#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
@@ -49,9 +38,6 @@ static inline unsigned int pe_order(enum page_entry_size pe_size)
#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
#define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT)
-/* The order of a PMD entry */
-#define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT)
-
static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
static int __init init_dax_wait_table(void)
@@ -1908,7 +1894,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
/**
* dax_iomap_fault - handle a page fault on a DAX file
* @vmf: The description of the fault
- * @pe_size: Size of the page to fault in
+ * @order: Order of the page to fault in
* @pfnp: PFN to insert for synchronous faults if fsync is required
* @iomap_errp: Storage for detailed error code in case of error
* @ops: Iomap ops passed from the file system
@@ -1918,17 +1904,15 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
* has done all the necessary locking for page fault to proceed
* successfully.
*/
-vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
+vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order,
pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops)
{
- switch (pe_size) {
- case PE_SIZE_PTE:
+ if (order == 0)
return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops);
- case PE_SIZE_PMD:
+ else if (order == PMD_ORDER)
return dax_iomap_pmd_fault(vmf, pfnp, ops);
- default:
+ else
return VM_FAULT_FALLBACK;
- }
}
EXPORT_SYMBOL_GPL(dax_iomap_fault);
@@ -1979,19 +1963,18 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
/**
* dax_finish_sync_fault - finish synchronous page fault
* @vmf: The description of the fault
- * @pe_size: Size of entry to be inserted
+ * @order: Order of entry to be inserted
* @pfn: PFN to insert
*
* This function ensures that the file range touched by the page fault is
* stored persistently on the media and handles inserting of appropriate page
* table entry.
*/
-vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
- enum page_entry_size pe_size, pfn_t pfn)
+vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order,
+ pfn_t pfn)
{
int err;
loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
- unsigned int order = pe_order(pe_size);
size_t len = PAGE_SIZE << order;
err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index e619c31b6bd9..b9575957a7c2 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -10,6 +10,7 @@
#include <linux/writeback.h>
#include <linux/sysctl.h>
#include <linux/gfp.h>
+#include <linux/swap.h>
#include "internal.h"
/* A global variable is a bit ugly, but it keeps the code simple */
@@ -59,6 +60,7 @@ int drop_caches_sysctl_handler(struct ctl_table *table, int write,
static int stfu;
if (sysctl_drop_caches & 1) {
+ lru_add_drain_all();
iterate_supers(drop_pagecache_sb, NULL);
count_vm_event(DROP_PAGECACHE);
}
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index db5e4b7636ec..0c2c99c58b5e 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -413,14 +413,14 @@ const struct address_space_operations erofs_raw_access_aops = {
#ifdef CONFIG_FS_DAX
static vm_fault_t erofs_dax_huge_fault(struct vm_fault *vmf,
- enum page_entry_size pe_size)
+ unsigned int order)
{
- return dax_iomap_fault(vmf, pe_size, NULL, NULL, &erofs_iomap_ops);
+ return dax_iomap_fault(vmf, order, NULL, NULL, &erofs_iomap_ops);
}
static vm_fault_t erofs_dax_fault(struct vm_fault *vmf)
{
- return erofs_dax_huge_fault(vmf, PE_SIZE_PTE);
+ return erofs_dax_huge_fault(vmf, 0);
}
static const struct vm_operations_struct erofs_dax_vm_ops = {
diff --git a/fs/exec.c b/fs/exec.c
index 1a827d55ba94..0b9484358a49 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -701,6 +701,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
if (vma != vma_next(&vmi))
return -EFAULT;
+ vma_iter_prev_range(&vmi);
/*
* cover the whole range: [new_start, old_end)
*/
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index eca60b747c6b..c8049c90323d 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -36,8 +36,6 @@
*/
-#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
-
struct ext2_group_desc * ext2_get_group_desc(struct super_block * sb,
unsigned int block_group,
struct buffer_head ** bh)
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 0b4c91c62e1f..1039e5bf90af 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -103,7 +103,7 @@ static vm_fault_t ext2_dax_fault(struct vm_fault *vmf)
}
filemap_invalidate_lock_shared(inode->i_mapping);
- ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, NULL, &ext2_iomap_ops);
+ ret = dax_iomap_fault(vmf, 0, NULL, NULL, &ext2_iomap_ops);
filemap_invalidate_unlock_shared(inode->i_mapping);
if (write)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 1e2259d9967d..481491e892df 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3780,8 +3780,6 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
}
-#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
-
/* For ioend & aio unwritten conversion wait queues */
#define EXT4_WQ_HASH_SZ 37
#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index c457c8517f0f..2dc3f8301225 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -723,8 +723,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
}
#ifdef CONFIG_FS_DAX
-static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf,
- enum page_entry_size pe_size)
+static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, unsigned int order)
{
int error = 0;
vm_fault_t result;
@@ -740,7 +739,7 @@ static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf,
* read-only.
*
* We check for VM_SHARED rather than vmf->cow_page since the latter is
- * unset for pe_size != PE_SIZE_PTE (i.e. only in do_cow_fault); for
+ * unset for order != 0 (i.e. only in do_cow_fault); for
* other sizes, dax_iomap_fault will handle splitting / fallback so that
* we eventually come back with a COW page.
*/
@@ -764,7 +763,7 @@ retry:
} else {
filemap_invalidate_lock_shared(mapping);
}
- result = dax_iomap_fault(vmf, pe_size, &pfn, &error, &ext4_iomap_ops);
+ result = dax_iomap_fault(vmf, order, &pfn, &error, &ext4_iomap_ops);
if (write) {
ext4_journal_stop(handle);
@@ -773,7 +772,7 @@ retry:
goto retry;
/* Handling synchronous page fault? */
if (result & VM_FAULT_NEEDDSYNC)
- result = dax_finish_sync_fault(vmf, pe_size, pfn);
+ result = dax_finish_sync_fault(vmf, order, pfn);
filemap_invalidate_unlock_shared(mapping);
sb_end_pagefault(sb);
} else {
@@ -785,7 +784,7 @@ retry:
static vm_fault_t ext4_dax_fault(struct vm_fault *vmf)
{
- return ext4_dax_huge_fault(vmf, PE_SIZE_PTE);
+ return ext4_dax_huge_fault(vmf, 0);
}
static const struct vm_operations_struct ext4_dax_vm_ops = {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 6683076ecb2f..d3f581ced672 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1569,7 +1569,7 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
if (folio->index < mpd->first_page)
continue;
- if (folio->index + folio_nr_pages(folio) - 1 > end)
+ if (folio_next_index(folio) - 1 > end)
continue;
BUG_ON(!folio_test_locked(folio));
BUG_ON(folio_test_writeback(folio));
@@ -2455,7 +2455,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
if (mpd->map.m_len == 0)
mpd->first_page = folio->index;
- mpd->next_page = folio->index + folio_nr_pages(folio);
+ mpd->next_page = folio_next_index(folio);
/*
* Writeout when we cannot modify metadata is simple.
* Just submit the page. For data=journal mode we
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index b5af2fc03b2f..18a9e7c47975 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -340,10 +340,8 @@ again:
ext4_double_up_write_data_sem(orig_inode, donor_inode);
goto data_copy;
}
- if ((folio_has_private(folio[0]) &&
- !filemap_release_folio(folio[0], 0)) ||
- (folio_has_private(folio[1]) &&
- !filemap_release_folio(folio[1], 0))) {
+ if (!filemap_release_folio(folio[0], 0) ||
+ !filemap_release_folio(folio[1], 0)) {
*err = -EBUSY;
goto drop_data_sem;
}
@@ -362,10 +360,8 @@ data_copy:
/* At this point all buffers in range are uptodate, old mapping layout
* is no longer required, try to drop it now. */
- if ((folio_has_private(folio[0]) &&
- !filemap_release_folio(folio[0], 0)) ||
- (folio_has_private(folio[1]) &&
- !filemap_release_folio(folio[1], 0))) {
+ if (!filemap_release_folio(folio[0], 0) ||
+ !filemap_release_folio(folio[1], 0)) {
*err = -EBUSY;
goto unlock_folios;
}
@@ -392,14 +388,11 @@ data_copy:
for (i = 0; i < block_len_in_page; i++) {
*err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0);
if (*err < 0)
- break;
+ goto repair_branches;
bh = bh->b_this_page;
}
- if (!*err)
- *err = block_commit_write(&folio[0]->page, from, from + replaced_size);
- if (unlikely(*err < 0))
- goto repair_branches;
+ block_commit_write(&folio[0]->page, from, from + replaced_size);
/* Even in case of data=writeback it is reasonable to pin
* inode to transaction, to prevent unexpected data loss */
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index 8e74f278a3f6..23904a6a9a96 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -784,8 +784,8 @@ static int fuse_dax_writepages(struct address_space *mapping,
return dax_writeback_mapping_range(mapping, fc->dax->dev, wbc);
}
-static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf,
- enum page_entry_size pe_size, bool write)
+static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf, unsigned int order,
+ bool write)
{
vm_fault_t ret;
struct inode *inode = file_inode(vmf->vma->vm_file);
@@ -809,7 +809,7 @@ retry:
* to populate page cache or access memory we are trying to free.
*/
filemap_invalidate_lock_shared(inode->i_mapping);
- ret = dax_iomap_fault(vmf, pe_size, &pfn, &error, &fuse_iomap_ops);
+ ret = dax_iomap_fault(vmf, order, &pfn, &error, &fuse_iomap_ops);
if ((ret & VM_FAULT_ERROR) && error == -EAGAIN) {
error = 0;
retry = true;
@@ -818,7 +818,7 @@ retry:
}
if (ret & VM_FAULT_NEEDDSYNC)
- ret = dax_finish_sync_fault(vmf, pe_size, pfn);
+ ret = dax_finish_sync_fault(vmf, order, pfn);
filemap_invalidate_unlock_shared(inode->i_mapping);
if (write)
@@ -829,24 +829,22 @@ retry:
static vm_fault_t fuse_dax_fault(struct vm_fault *vmf)
{
- return __fuse_dax_fault(vmf, PE_SIZE_PTE,
- vmf->flags & FAULT_FLAG_WRITE);
+ return __fuse_dax_fault(vmf, 0, vmf->flags & FAULT_FLAG_WRITE);
}
-static vm_fault_t fuse_dax_huge_fault(struct vm_fault *vmf,
- enum page_entry_size pe_size)
+static vm_fault_t fuse_dax_huge_fault(struct vm_fault *vmf, unsigned int order)
{
- return __fuse_dax_fault(vmf, pe_size, vmf->flags & FAULT_FLAG_WRITE);
+ return __fuse_dax_fault(vmf, order, vmf->flags & FAULT_FLAG_WRITE);
}
static vm_fault_t fuse_dax_page_mkwrite(struct vm_fault *vmf)
{
- return __fuse_dax_fault(vmf, PE_SIZE_PTE, true);
+ return __fuse_dax_fault(vmf, 0, true);
}
static vm_fault_t fuse_dax_pfn_mkwrite(struct vm_fault *vmf)
{
- return __fuse_dax_fault(vmf, PE_SIZE_PTE, true);
+ return __fuse_dax_fault(vmf, 0, true);
}
static const struct vm_operations_struct fuse_dax_vm_ops = {
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 93d3bcfd4fc8..316c4cebd3f3 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -283,6 +283,41 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
#endif
/*
+ * Someone wants to read @bytes from a HWPOISON hugetlb @page from @offset.
+ * Returns the maximum number of bytes one can read without touching the 1st raw
+ * HWPOISON subpage.
+ *
+ * The implementation borrows the iteration logic from copy_page_to_iter*.
+ */
+static size_t adjust_range_hwpoison(struct page *page, size_t offset, size_t bytes)
+{
+ size_t n = 0;
+ size_t res = 0;
+
+ /* First subpage to start the loop. */
+ page += offset / PAGE_SIZE;
+ offset %= PAGE_SIZE;
+ while (1) {
+ if (is_raw_hwpoison_page_in_hugepage(page))
+ break;
+
+ /* Safe to read n bytes without touching HWPOISON subpage. */
+ n = min(bytes, (size_t)PAGE_SIZE - offset);
+ res += n;
+ bytes -= n;
+ if (!bytes || !n)
+ break;
+ offset += n;
+ if (offset == PAGE_SIZE) {
+ page++;
+ offset = 0;
+ }
+ }
+
+ return res;
+}
+
+/*
* Support for read() - Find the page attached to f_mapping and copy out the
* data. This provides functionality similar to filemap_read().
*/
@@ -300,7 +335,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
while (iov_iter_count(to)) {
struct page *page;
- size_t nr, copied;
+ size_t nr, copied, want;
/* nr is the maximum number of bytes to copy from this page */
nr = huge_page_size(h);
@@ -328,16 +363,26 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
} else {
unlock_page(page);
- if (PageHWPoison(page)) {
- put_page(page);
- retval = -EIO;
- break;
+ if (!PageHWPoison(page))
+ want = nr;
+ else {
+ /*
+ * Adjust how many bytes safe to read without
+ * touching the 1st raw HWPOISON subpage after
+ * offset.
+ */
+ want = adjust_range_hwpoison(page, offset, nr);
+ if (want == 0) {
+ put_page(page);
+ retval = -EIO;
+ break;
+ }
}
/*
* We have the page, copy it to user space buffer.
*/
- copied = copy_page_to_iter(page, offset, nr, to);
+ copied = copy_page_to_iter(page, offset, want, to);
put_page(page);
}
offset += copied;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index fbce16fedaa4..1b5a45ab62b0 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -341,7 +341,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
int do_escape = 0;
char *mapped_data;
struct buffer_head *new_bh;
- struct page *new_page;
+ struct folio *new_folio;
unsigned int new_offset;
struct buffer_head *bh_in = jh2bh(jh_in);
journal_t *journal = transaction->t_journal;
@@ -370,14 +370,14 @@ repeat:
*/
if (jh_in->b_frozen_data) {
done_copy_out = 1;
- new_page = virt_to_page(jh_in->b_frozen_data);
- new_offset = offset_in_page(jh_in->b_frozen_data);
+ new_folio = virt_to_folio(jh_in->b_frozen_data);
+ new_offset = offset_in_folio(new_folio, jh_in->b_frozen_data);
} else {
- new_page = jh2bh(jh_in)->b_page;
- new_offset = offset_in_page(jh2bh(jh_in)->b_data);
+ new_folio = jh2bh(jh_in)->b_folio;
+ new_offset = offset_in_folio(new_folio, jh2bh(jh_in)->b_data);
}
- mapped_data = kmap_atomic(new_page);
+ mapped_data = kmap_local_folio(new_folio, new_offset);
/*
* Fire data frozen trigger if data already wasn't frozen. Do this
* before checking for escaping, as the trigger may modify the magic
@@ -385,18 +385,17 @@ repeat:
* data in the buffer.
*/
if (!done_copy_out)
- jbd2_buffer_frozen_trigger(jh_in, mapped_data + new_offset,
+ jbd2_buffer_frozen_trigger(jh_in, mapped_data,
jh_in->b_triggers);
/*
* Check for escaping
*/
- if (*((__be32 *)(mapped_data + new_offset)) ==
- cpu_to_be32(JBD2_MAGIC_NUMBER)) {
+ if (*((__be32 *)mapped_data) == cpu_to_be32(JBD2_MAGIC_NUMBER)) {
need_copy_out = 1;
do_escape = 1;
}
- kunmap_atomic(mapped_data);
+ kunmap_local(mapped_data);
/*
* Do we need to do a data copy?
@@ -417,12 +416,10 @@ repeat:
}
jh_in->b_frozen_data = tmp;
- mapped_data = kmap_atomic(new_page);
- memcpy(tmp, mapped_data + new_offset, bh_in->b_size);
- kunmap_atomic(mapped_data);
+ memcpy_from_folio(tmp, new_folio, new_offset, bh_in->b_size);
- new_page = virt_to_page(tmp);
- new_offset = offset_in_page(tmp);
+ new_folio = virt_to_folio(tmp);
+ new_offset = offset_in_folio(new_folio, tmp);
done_copy_out = 1;
/*
@@ -438,12 +435,12 @@ repeat:
* copying, we can finally do so.
*/
if (do_escape) {
- mapped_data = kmap_atomic(new_page);
- *((unsigned int *)(mapped_data + new_offset)) = 0;
- kunmap_atomic(mapped_data);
+ mapped_data = kmap_local_folio(new_folio, new_offset);
+ *((unsigned int *)mapped_data) = 0;
+ kunmap_local(mapped_data);
}
- set_bh_page(new_bh, new_page, new_offset);
+ folio_set_bh(new_bh, new_folio, new_offset);
new_bh->b_size = bh_in->b_size;
new_bh->b_bdev = journal->j_dev;
new_bh->b_blocknr = blocknr;
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 8c35d88a84b1..b05717fe0d4e 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -180,6 +180,9 @@ void nfs_fscache_init_inode(struct inode *inode)
&auxdata, /* aux_data */
sizeof(auxdata),
i_size_read(inode));
+
+ if (netfs_inode(inode)->cache)
+ mapping_set_release_always(inode->i_mapping);
}
/*
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index 4123e126c4d0..eb2ed0701495 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -556,7 +556,7 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo,
struct super_block *sb = inode->i_sb;
struct ntfs_sb_info *sbi = sb->s_fs_info;
struct ntfs_inode *ni = ntfs_i(inode);
- struct page *page = bh->b_page;
+ struct folio *folio = bh->b_folio;
u8 cluster_bits = sbi->cluster_bits;
u32 block_size = sb->s_blocksize;
u64 bytes, lbo, valid;
@@ -571,7 +571,7 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo,
if (is_resident(ni)) {
ni_lock(ni);
- err = attr_data_read_resident(ni, page);
+ err = attr_data_read_resident(ni, &folio->page);
ni_unlock(ni);
if (!err)
@@ -644,17 +644,17 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo,
*/
bytes = block_size;
- if (page) {
+ if (folio) {
u32 voff = valid - vbo;
bh->b_size = block_size;
off = vbo & (PAGE_SIZE - 1);
- set_bh_page(bh, page, off);
+ folio_set_bh(bh, folio, off);
err = bh_read(bh, 0);
if (err < 0)
goto out;
- zero_user_segment(page, off + voff, off + block_size);
+ folio_zero_segment(folio, off + voff, off + block_size);
}
}
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 3b91b4cc7c6a..c45596c25c66 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -810,12 +810,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
/* must not update i_size! */
- ret = block_commit_write(page, block_start + 1,
- block_start + 1);
- if (ret < 0)
- mlog_errno(ret);
- else
- ret = 0;
+ block_commit_write(page, block_start + 1, block_start + 1);
}
/*
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 7576effe8d52..02e1cf03d94b 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3207,6 +3207,7 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns,
mm = get_task_mm(task);
if (mm) {
seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items);
+ seq_printf(m, "ksm_zero_pages %lu\n", mm->ksm_zero_pages);
seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages);
seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm));
mmput(mm);
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 8dca4d6d96c7..45af9a989d40 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -17,6 +17,7 @@
#ifdef CONFIG_CMA
#include <linux/cma.h>
#endif
+#include <linux/zswap.h>
#include <asm/page.h>
#include "internal.h"
@@ -132,17 +133,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "VmallocChunk: ", 0ul);
show_val_kb(m, "Percpu: ", pcpu_nr_pages());
-#ifdef CONFIG_MEMTEST
- if (early_memtest_done) {
- unsigned long early_memtest_bad_size_kb;
-
- early_memtest_bad_size_kb = early_memtest_bad_size>>10;
- if (early_memtest_bad_size && !early_memtest_bad_size_kb)
- early_memtest_bad_size_kb = 1;
- /* When 0 is reported, it means there actually was a successful test */
- seq_printf(m, "EarlyMemtestBad: %5lu kB\n", early_memtest_bad_size_kb);
- }
-#endif
+ memtest_report_meminfo(m);
#ifdef CONFIG_MEMORY_FAILURE
seq_printf(m, "HardwareCorrupted: %5lu kB\n",
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index fafff1bd34cd..15ddf4653a19 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -236,21 +236,6 @@ static int do_maps_open(struct inode *inode, struct file *file,
sizeof(struct proc_maps_private));
}
-/*
- * Indicate if the VMA is a stack for the given task; for
- * /proc/PID/maps that is the stack of the main task.
- */
-static int is_stack(struct vm_area_struct *vma)
-{
- /*
- * We make no effort to guess what a given thread considers to be
- * its "stack". It's not even well-defined for programs written
- * languages like Go.
- */
- return vma->vm_start <= vma->vm_mm->start_stack &&
- vma->vm_end >= vma->vm_mm->start_stack;
-}
-
static void show_vma_header_prefix(struct seq_file *m,
unsigned long start, unsigned long end,
vm_flags_t flags, unsigned long long pgoff,
@@ -327,13 +312,12 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
goto done;
}
- if (vma->vm_start <= mm->brk &&
- vma->vm_end >= mm->start_brk) {
+ if (vma_is_initial_heap(vma)) {
name = "[heap]";
goto done;
}
- if (is_stack(vma)) {
+ if (vma_is_initial_stack(vma)) {
name = "[stack]";
goto done;
}
@@ -871,7 +855,7 @@ static int show_smap(struct seq_file *m, void *v)
__show_smap(m, &mss, false);
- seq_printf(m, "THPeligible: %d\n",
+ seq_printf(m, "THPeligible: %8u\n",
hugepage_vma_check(vma, vma->vm_flags, true, false, true));
if (arch_pkeys_enabled())
@@ -1975,9 +1959,9 @@ static int show_numa_map(struct seq_file *m, void *v)
if (file) {
seq_puts(m, " file=");
seq_file_path(m, file, "\n\t= ");
- } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
+ } else if (vma_is_initial_heap(vma)) {
seq_puts(m, " heap");
- } else if (is_stack(vma)) {
+ } else if (vma_is_initial_stack(vma)) {
seq_puts(m, " stack");
}
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 2c8b62265981..a8ac0dd8041e 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -121,19 +121,6 @@ unsigned long task_statm(struct mm_struct *mm,
return size;
}
-static int is_stack(struct vm_area_struct *vma)
-{
- struct mm_struct *mm = vma->vm_mm;
-
- /*
- * We make no effort to guess what a given thread considers to be
- * its "stack". It's not even well-defined for programs written
- * languages like Go.
- */
- return vma->vm_start <= mm->start_stack &&
- vma->vm_end >= mm->start_stack;
-}
-
/*
* display a single VMA to a sequenced file
*/
@@ -171,7 +158,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
if (file) {
seq_pad(m, ' ');
seq_file_path(m, file, "");
- } else if (mm && is_stack(vma)) {
+ } else if (mm && vma_is_initial_stack(vma)) {
seq_pad(m, ' ');
seq_puts(m, "[stack]");
}
diff --git a/fs/smb/client/fscache.c b/fs/smb/client/fscache.c
index 8f6909d633da..3677525ee993 100644
--- a/fs/smb/client/fscache.c
+++ b/fs/smb/client/fscache.c
@@ -108,6 +108,8 @@ void cifs_fscache_get_inode_cookie(struct inode *inode)
&cifsi->uniqueid, sizeof(cifsi->uniqueid),
&cd, sizeof(cd),
i_size_read(&cifsi->netfs.inode));
+ if (cifsi->netfs.cache)
+ mapping_set_release_always(inode->i_mapping);
}
void cifs_fscache_unuse_inode_cookie(struct inode *inode, bool update)
diff --git a/fs/splice.c b/fs/splice.c
index 02631013b09f..d983d375ff11 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -83,8 +83,7 @@ static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe,
*/
folio_wait_writeback(folio);
- if (folio_has_private(folio) &&
- !filemap_release_folio(folio, GFP_KERNEL))
+ if (!filemap_release_folio(folio, GFP_KERNEL))
goto out_unlock;
/*
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 243840dc83ad..0292d75e60cc 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -63,13 +63,13 @@ static vm_fault_t udf_page_mkwrite(struct vm_fault *vmf)
else
end = PAGE_SIZE;
err = __block_write_begin(page, 0, end, udf_get_block);
- if (!err)
- err = block_commit_write(page, 0, end);
- if (err < 0) {
+ if (err) {
unlock_page(page);
ret = block_page_mkwrite_return(err);
goto out_unlock;
}
+
+ block_commit_write(page, 0, end);
out_dirty:
set_page_dirty(page);
wait_for_stable_page(page);
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index 4931bec1a01c..89247193d96d 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -11,12 +11,6 @@
#include <linux/fs.h>
#include "swab.h"
-
-/*
- * some useful macros
- */
-#define in_range(b,first,len) ((b)>=(first)&&(b)<(first)+(len))
-
/*
* functions used for retyping
*/
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 7cecd49e078b..56eaae9dac1a 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -277,17 +277,16 @@ static inline struct uffd_msg userfault_msg(unsigned long address,
* hugepmd ranges.
*/
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
- struct vm_area_struct *vma,
- unsigned long address,
- unsigned long flags,
- unsigned long reason)
+ struct vm_fault *vmf,
+ unsigned long reason)
{
+ struct vm_area_struct *vma = vmf->vma;
pte_t *ptep, pte;
bool ret = true;
- mmap_assert_locked(ctx->mm);
+ assert_fault_locked(vmf);
- ptep = hugetlb_walk(vma, address, vma_mmu_pagesize(vma));
+ ptep = hugetlb_walk(vma, vmf->address, vma_mmu_pagesize(vma));
if (!ptep)
goto out;
@@ -308,10 +307,8 @@ out:
}
#else
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
- struct vm_area_struct *vma,
- unsigned long address,
- unsigned long flags,
- unsigned long reason)
+ struct vm_fault *vmf,
+ unsigned long reason)
{
return false; /* should never get here */
}
@@ -325,11 +322,11 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
* threads.
*/
static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
- unsigned long address,
- unsigned long flags,
+ struct vm_fault *vmf,
unsigned long reason)
{
struct mm_struct *mm = ctx->mm;
+ unsigned long address = vmf->address;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
@@ -338,7 +335,7 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
pte_t ptent;
bool ret = true;
- mmap_assert_locked(mm);
+ assert_fault_locked(vmf);
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
@@ -427,20 +424,16 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
*
* We also don't do userfault handling during
* coredumping. hugetlbfs has the special
- * follow_hugetlb_page() to skip missing pages in the
+ * hugetlb_follow_page_mask() to skip missing pages in the
* FOLL_DUMP case, anon memory also checks for FOLL_DUMP with
* the no_page_table() helper in follow_page_mask(), but the
* shmem_vm_ops->fault method is invoked even during
- * coredumping without mmap_lock and it ends up here.
+ * coredumping and it ends up here.
*/
if (current->flags & (PF_EXITING|PF_DUMPCORE))
goto out;
- /*
- * Coredumping runs without mmap_lock so we can only check that
- * the mmap_lock is held, if PF_DUMPCORE was not set.
- */
- mmap_assert_locked(mm);
+ assert_fault_locked(vmf);
ctx = vma->vm_userfaultfd_ctx.ctx;
if (!ctx)
@@ -556,15 +549,12 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
spin_unlock_irq(&ctx->fault_pending_wqh.lock);
if (!is_vm_hugetlb_page(vma))
- must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
- reason);
+ must_wait = userfaultfd_must_wait(ctx, vmf, reason);
else
- must_wait = userfaultfd_huge_must_wait(ctx, vma,
- vmf->address,
- vmf->flags, reason);
+ must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason);
if (is_vm_hugetlb_page(vma))
hugetlb_vma_unlock_read(vma);
- mmap_read_unlock(mm);
+ release_fault_lock(vmf);
if (likely(must_wait && !READ_ONCE(ctx->released))) {
wake_up_poll(&ctx->fd_wqh, EPOLLIN);
@@ -667,6 +657,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
mmap_write_lock(mm);
for_each_vma(vmi, vma) {
if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
+ vma_start_write(vma);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
userfaultfd_set_vm_flags(vma,
vma->vm_flags & ~__VM_UFFD_FLAGS);
@@ -702,6 +693,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
octx = vma->vm_userfaultfd_ctx.ctx;
if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
+ vma_start_write(vma);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
return 0;
@@ -783,6 +775,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
atomic_inc(&ctx->mmap_changing);
} else {
/* Drop uffd context if remap feature not enabled */
+ vma_start_write(vma);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
}
@@ -940,6 +933,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
prev = vma;
}
+ vma_start_write(vma);
userfaultfd_set_vm_flags(vma, new_flags);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
}
@@ -1289,13 +1283,11 @@ static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
__wake_userfault(ctx, range);
}
-static __always_inline int validate_range(struct mm_struct *mm,
- __u64 start, __u64 len)
+static __always_inline int validate_unaligned_range(
+ struct mm_struct *mm, __u64 start, __u64 len)
{
__u64 task_size = mm->task_size;
- if (start & ~PAGE_MASK)
- return -EINVAL;
if (len & ~PAGE_MASK)
return -EINVAL;
if (!len)
@@ -1306,9 +1298,20 @@ static __always_inline int validate_range(struct mm_struct *mm,
return -EINVAL;
if (len > task_size - start)
return -EINVAL;
+ if (start + len <= start)
+ return -EINVAL;
return 0;
}
+static __always_inline int validate_range(struct mm_struct *mm,
+ __u64 start, __u64 len)
+{
+ if (start & ~PAGE_MASK)
+ return -EINVAL;
+
+ return validate_unaligned_range(mm, start, len);
+}
+
static int userfaultfd_register(struct userfaultfd_ctx *ctx,
unsigned long arg)
{
@@ -1502,6 +1505,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
* the next vma was merged into the current one and
* the current one has not been updated yet.
*/
+ vma_start_write(vma);
userfaultfd_set_vm_flags(vma, new_flags);
vma->vm_userfaultfd_ctx.ctx = ctx;
@@ -1685,6 +1689,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
* the next vma was merged into the current one and
* the current one has not been updated yet.
*/
+ vma_start_write(vma);
userfaultfd_set_vm_flags(vma, new_flags);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
@@ -1757,17 +1762,15 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
sizeof(uffdio_copy)-sizeof(__s64)))
goto out;
+ ret = validate_unaligned_range(ctx->mm, uffdio_copy.src,
+ uffdio_copy.len);
+ if (ret)
+ goto out;
ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
if (ret)
goto out;
- /*
- * double check for wraparound just in case. copy_from_user()
- * will later check uffdio_copy.src + uffdio_copy.len to fit
- * in the userland range.
- */
+
ret = -EINVAL;
- if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src)
- goto out;
if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
goto out;
if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
@@ -1927,11 +1930,6 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
goto out;
ret = -EINVAL;
- /* double check for wraparound just in case. */
- if (uffdio_continue.range.start + uffdio_continue.range.len <=
- uffdio_continue.range.start) {
- goto out;
- }
if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE |
UFFDIO_CONTINUE_MODE_WP))
goto out;
@@ -1965,6 +1963,61 @@ out:
return ret;
}
+static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long arg)
+{
+ __s64 ret;
+ struct uffdio_poison uffdio_poison;
+ struct uffdio_poison __user *user_uffdio_poison;
+ struct userfaultfd_wake_range range;
+
+ user_uffdio_poison = (struct uffdio_poison __user *)arg;
+
+ ret = -EAGAIN;
+ if (atomic_read(&ctx->mmap_changing))
+ goto out;
+
+ ret = -EFAULT;
+ if (copy_from_user(&uffdio_poison, user_uffdio_poison,
+ /* don't copy the output fields */
+ sizeof(uffdio_poison) - (sizeof(__s64))))
+ goto out;
+
+ ret = validate_range(ctx->mm, uffdio_poison.range.start,
+ uffdio_poison.range.len);
+ if (ret)
+ goto out;
+
+ ret = -EINVAL;
+ if (uffdio_poison.mode & ~UFFDIO_POISON_MODE_DONTWAKE)
+ goto out;
+
+ if (mmget_not_zero(ctx->mm)) {
+ ret = mfill_atomic_poison(ctx->mm, uffdio_poison.range.start,
+ uffdio_poison.range.len,
+ &ctx->mmap_changing, 0);
+ mmput(ctx->mm);
+ } else {
+ return -ESRCH;
+ }
+
+ if (unlikely(put_user(ret, &user_uffdio_poison->updated)))
+ return -EFAULT;
+ if (ret < 0)
+ goto out;
+
+ /* len == 0 would wake all */
+ BUG_ON(!ret);
+ range.len = ret;
+ if (!(uffdio_poison.mode & UFFDIO_POISON_MODE_DONTWAKE)) {
+ range.start = uffdio_poison.range.start;
+ wake_userfault(ctx, &range);
+ }
+ ret = range.len == uffdio_poison.range.len ? 0 : -EAGAIN;
+
+out:
+ return ret;
+}
+
static inline unsigned int uffd_ctx_features(__u64 user_features)
{
/*
@@ -2066,6 +2119,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
case UFFDIO_CONTINUE:
ret = userfaultfd_continue(ctx, arg);
break;
+ case UFFDIO_POISON:
+ ret = userfaultfd_poison(ctx, arg);
+ break;
}
return ret;
}
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 4f502219ae4f..203700278ddb 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1287,11 +1287,11 @@ xfs_file_llseek(
static inline vm_fault_t
xfs_dax_fault(
struct vm_fault *vmf,
- enum page_entry_size pe_size,
+ unsigned int order,
bool write_fault,
pfn_t *pfn)
{
- return dax_iomap_fault(vmf, pe_size, pfn, NULL,
+ return dax_iomap_fault(vmf, order, pfn, NULL,
(write_fault && !vmf->cow_page) ?
&xfs_dax_write_iomap_ops :
&xfs_read_iomap_ops);
@@ -1300,7 +1300,7 @@ xfs_dax_fault(
static inline vm_fault_t
xfs_dax_fault(
struct vm_fault *vmf,
- enum page_entry_size pe_size,
+ unsigned int order,
bool write_fault,
pfn_t *pfn)
{
@@ -1322,14 +1322,14 @@ xfs_dax_fault(
static vm_fault_t
__xfs_filemap_fault(
struct vm_fault *vmf,
- enum page_entry_size pe_size,
+ unsigned int order,
bool write_fault)
{
struct inode *inode = file_inode(vmf->vma->vm_file);
struct xfs_inode *ip = XFS_I(inode);
vm_fault_t ret;
- trace_xfs_filemap_fault(ip, pe_size, write_fault);
+ trace_xfs_filemap_fault(ip, order, write_fault);
if (write_fault) {
sb_start_pagefault(inode->i_sb);
@@ -1340,9 +1340,9 @@ __xfs_filemap_fault(
pfn_t pfn;
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- ret = xfs_dax_fault(vmf, pe_size, write_fault, &pfn);
+ ret = xfs_dax_fault(vmf, order, write_fault, &pfn);
if (ret & VM_FAULT_NEEDDSYNC)
- ret = dax_finish_sync_fault(vmf, pe_size, pfn);
+ ret = dax_finish_sync_fault(vmf, order, pfn);
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
} else {
if (write_fault) {
@@ -1373,7 +1373,7 @@ xfs_filemap_fault(
struct vm_fault *vmf)
{
/* DAX can shortcut the normal fault path on write faults! */
- return __xfs_filemap_fault(vmf, PE_SIZE_PTE,
+ return __xfs_filemap_fault(vmf, 0,
IS_DAX(file_inode(vmf->vma->vm_file)) &&
xfs_is_write_fault(vmf));
}
@@ -1381,13 +1381,13 @@ xfs_filemap_fault(
static vm_fault_t
xfs_filemap_huge_fault(
struct vm_fault *vmf,
- enum page_entry_size pe_size)
+ unsigned int order)
{
if (!IS_DAX(file_inode(vmf->vma->vm_file)))
return VM_FAULT_FALLBACK;
/* DAX can shortcut the normal fault path on write faults! */
- return __xfs_filemap_fault(vmf, pe_size,
+ return __xfs_filemap_fault(vmf, order,
xfs_is_write_fault(vmf));
}
@@ -1395,7 +1395,7 @@ static vm_fault_t
xfs_filemap_page_mkwrite(
struct vm_fault *vmf)
{
- return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
+ return __xfs_filemap_fault(vmf, 0, true);
}
/*
@@ -1408,7 +1408,7 @@ xfs_filemap_pfn_mkwrite(
struct vm_fault *vmf)
{
- return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
+ return __xfs_filemap_fault(vmf, 0, true);
}
static const struct vm_operations_struct xfs_file_vm_ops = {
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index f3cc204bb4bf..fd789e00dfd6 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -802,36 +802,28 @@ DEFINE_INODE_EVENT(xfs_inode_inactivating);
* ring buffer. Somehow this was only worth mentioning in the ftrace sample
* code.
*/
-TRACE_DEFINE_ENUM(PE_SIZE_PTE);
-TRACE_DEFINE_ENUM(PE_SIZE_PMD);
-TRACE_DEFINE_ENUM(PE_SIZE_PUD);
-
TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED);
TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW);
TRACE_EVENT(xfs_filemap_fault,
- TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size,
- bool write_fault),
- TP_ARGS(ip, pe_size, write_fault),
+ TP_PROTO(struct xfs_inode *ip, unsigned int order, bool write_fault),
+ TP_ARGS(ip, order, write_fault),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
- __field(enum page_entry_size, pe_size)
+ __field(unsigned int, order)
__field(bool, write_fault)
),
TP_fast_assign(
__entry->dev = VFS_I(ip)->i_sb->s_dev;
__entry->ino = ip->i_ino;
- __entry->pe_size = pe_size;
+ __entry->order = order;
__entry->write_fault = write_fault;
),
- TP_printk("dev %d:%d ino 0x%llx %s write_fault %d",
+ TP_printk("dev %d:%d ino 0x%llx order %u write_fault %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
- __print_symbolic(__entry->pe_size,
- { PE_SIZE_PTE, "PTE" },
- { PE_SIZE_PMD, "PMD" },
- { PE_SIZE_PUD, "PUD" }),
+ __entry->order,
__entry->write_fault)
)