summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/ABI/testing/sysfs-kernel-mm-damon29
-rw-r--r--Documentation/admin-guide/cgroup-v1/memory.rst13
-rw-r--r--Documentation/admin-guide/mm/damon/reclaim.rst9
-rw-r--r--Documentation/admin-guide/mm/damon/usage.rst48
-rw-r--r--Documentation/admin-guide/mm/hugetlbpage.rst6
-rw-r--r--Documentation/admin-guide/mm/idle_page_tracking.rst2
-rw-r--r--Documentation/admin-guide/mm/numaperf.rst8
-rw-r--r--Documentation/admin-guide/mm/pagemap.rst4
-rw-r--r--Documentation/core-api/pin_user_pages.rst2
-rw-r--r--Documentation/dev-tools/kasan.rst17
-rw-r--r--Documentation/mm/multigen_lru.rst8
-rw-r--r--Documentation/mm/page_owner.rst2
-rw-r--r--Documentation/mm/slub.rst2
-rw-r--r--Documentation/translations/zh_CN/mm/page_owner.rst2
-rw-r--r--MAINTAINERS9
-rw-r--r--arch/arm64/kernel/vdso.c6
-rw-r--r--arch/powerpc/kernel/vdso.c4
-rw-r--r--arch/powerpc/platforms/book3s/vas-api.c2
-rw-r--r--arch/powerpc/platforms/pseries/vas.c3
-rw-r--r--arch/riscv/kernel/vdso.c6
-rw-r--r--arch/s390/kernel/vdso.c4
-rw-r--r--arch/s390/mm/gmap.c2
-rw-r--r--arch/x86/entry/vdso/vma.c4
-rw-r--r--arch/x86/include/asm/pgtable.h24
-rw-r--r--drivers/android/binder_alloc.c2
-rw-r--r--drivers/block/zram/zram_drv.c8
-rw-r--r--drivers/char/mem.c2
-rw-r--r--drivers/misc/open-dice.c14
-rw-r--r--fs/buffer.c54
-rw-r--r--fs/cramfs/inode.c2
-rw-r--r--fs/ext4/move_extent.c46
-rw-r--r--fs/gfs2/glops.c2
-rw-r--r--fs/gfs2/log.c2
-rw-r--r--fs/gfs2/meta_io.c2
-rw-r--r--fs/hugetlbfs/inode.c28
-rw-r--r--fs/jbd2/commit.c33
-rw-r--r--fs/jbd2/journal.c3
-rw-r--r--fs/mpage.c16
-rw-r--r--fs/nilfs2/btnode.c2
-rw-r--r--fs/nilfs2/btree.c2
-rw-r--r--fs/nilfs2/gcinode.c2
-rw-r--r--fs/nilfs2/mdt.c4
-rw-r--r--fs/nilfs2/segment.c2
-rw-r--r--fs/ntfs3/inode.c33
-rw-r--r--fs/ocfs2/journal.c16
-rw-r--r--fs/proc/task_nommu.c2
-rw-r--r--fs/ramfs/file-nommu.c2
-rw-r--r--fs/reiserfs/journal.c4
-rw-r--r--fs/reiserfs/tail_conversion.c2
-rw-r--r--fs/romfs/mmap-nommu.c2
-rw-r--r--fs/userfaultfd.c26
-rw-r--r--include/asm-generic/hugetlb.h16
-rw-r--r--include/linux/buffer_head.h5
-rw-r--r--include/linux/damon.h51
-rw-r--r--include/linux/fs.h2
-rw-r--r--include/linux/highmem.h9
-rw-r--r--include/linux/hugetlb.h73
-rw-r--r--include/linux/jbd2.h2
-rw-r--r--include/linux/kasan.h36
-rw-r--r--include/linux/maple_tree.h6
-rw-r--r--include/linux/memcontrol.h50
-rw-r--r--include/linux/mm.h140
-rw-r--r--include/linux/mm_inline.h36
-rw-r--r--include/linux/mm_types.h77
-rw-r--r--include/linux/mmzone.h131
-rw-r--r--include/linux/pagewalk.h11
-rw-r--r--include/linux/pid_namespace.h19
-rw-r--r--include/linux/slab_def.h2
-rw-r--r--include/linux/slub_def.h2
-rw-r--r--include/linux/string.h1
-rw-r--r--include/linux/swap.h4
-rw-r--r--include/linux/swapops.h6
-rw-r--r--include/linux/userfaultfd_k.h2
-rw-r--r--include/linux/writeback.h2
-rw-r--r--include/trace/events/cma.h32
-rw-r--r--include/uapi/linux/fcntl.h1
-rw-r--r--include/uapi/linux/memfd.h4
-rw-r--r--io_uring/io_uring.c2
-rw-r--r--kernel/pid_namespace.c5
-rw-r--r--kernel/pid_sysctl.h60
-rw-r--r--lib/maple_tree.c115
-rw-r--r--lib/test_vmalloc.c8
-rw-r--r--mm/Kconfig2
-rw-r--r--mm/Kconfig.debug2
-rw-r--r--mm/cma.c2
-rw-r--r--mm/compaction.c1
-rw-r--r--mm/damon/core.c39
-rw-r--r--mm/damon/ops-common.c38
-rw-r--r--mm/damon/ops-common.h2
-rw-r--r--mm/damon/paddr.c163
-rw-r--r--mm/damon/reclaim.c19
-rw-r--r--mm/damon/sysfs-schemes.c373
-rw-r--r--mm/damon/vaddr.c68
-rw-r--r--mm/debug_vm_pgtable.c102
-rw-r--r--mm/fadvise.c5
-rw-r--r--mm/hmm.c15
-rw-r--r--mm/huge_memory.c64
-rw-r--r--mm/hugetlb.c114
-rw-r--r--mm/internal.h19
-rw-r--r--mm/kasan/common.c18
-rw-r--r--mm/kasan/hw_tags.c60
-rw-r--r--mm/kasan/kasan.h27
-rw-r--r--mm/madvise.c100
-rw-r--r--mm/memcontrol.c78
-rw-r--r--mm/memfd.c56
-rw-r--r--mm/memory-failure.c2
-rw-r--r--mm/memory.c55
-rw-r--r--mm/mempolicy.c35
-rw-r--r--mm/migrate.c25
-rw-r--r--mm/mprotect.c105
-rw-r--r--mm/nommu.c62
-rw-r--r--mm/page-writeback.c59
-rw-r--r--mm/page_alloc.c60
-rw-r--r--mm/page_idle.c47
-rw-r--r--mm/page_io.c1
-rw-r--r--mm/page_reporting.c6
-rw-r--r--mm/page_vma_mapped.c9
-rw-r--r--mm/pagewalk.c6
-rw-r--r--mm/rmap.c68
-rw-r--r--mm/shmem.c17
-rw-r--r--mm/slab.c2
-rw-r--r--mm/slab.h8
-rw-r--r--mm/slab_common.c1
-rw-r--r--mm/slub.c2
-rw-r--r--mm/swap.c26
-rw-r--r--mm/swap_state.c59
-rw-r--r--mm/swapfile.c7
-rw-r--r--mm/userfaultfd.c49
-rw-r--r--mm/util.c24
-rw-r--r--mm/vmalloc.c85
-rw-r--r--mm/vmscan.c738
-rw-r--r--mm/workingset.c7
-rw-r--r--mm/z3fold.c2
-rw-r--r--mm/zsmalloc.c3
-rw-r--r--net/ipv4/tcp.c7
-rw-r--r--security/apparmor/policy_unpack.c11
-rw-r--r--tools/cgroup/memcg_shrinker.py3
-rw-r--r--tools/mm/.gitignore (renamed from tools/vm/.gitignore)0
-rw-r--r--tools/mm/Makefile (renamed from tools/vm/Makefile)0
-rw-r--r--tools/mm/page-types.c (renamed from tools/vm/page-types.c)0
-rw-r--r--tools/mm/page_owner_sort.c (renamed from tools/vm/page_owner_sort.c)65
-rw-r--r--tools/mm/slabinfo-gnuplot.sh (renamed from tools/vm/slabinfo-gnuplot.sh)0
-rw-r--r--tools/mm/slabinfo.c (renamed from tools/vm/slabinfo.c)0
-rw-r--r--tools/testing/radix-tree/maple.c18
-rw-r--r--tools/testing/selftests/Makefile2
-rw-r--r--tools/testing/selftests/damon/sysfs.sh29
-rwxr-xr-xtools/testing/selftests/kselftest_deps.sh6
-rw-r--r--tools/testing/selftests/memfd/fuse_test.c1
-rw-r--r--tools/testing/selftests/memfd/memfd_test.c341
-rw-r--r--tools/testing/selftests/mm/.gitignore (renamed from tools/testing/selftests/vm/.gitignore)0
-rw-r--r--tools/testing/selftests/mm/Makefile (renamed from tools/testing/selftests/vm/Makefile)8
-rw-r--r--tools/testing/selftests/mm/charge_reserved_hugetlb.sh (renamed from tools/testing/selftests/vm/charge_reserved_hugetlb.sh)0
-rw-r--r--tools/testing/selftests/mm/check_config.sh (renamed from tools/testing/selftests/vm/check_config.sh)0
-rw-r--r--tools/testing/selftests/mm/compaction_test.c (renamed from tools/testing/selftests/vm/compaction_test.c)0
-rw-r--r--tools/testing/selftests/mm/config (renamed from tools/testing/selftests/vm/config)0
-rw-r--r--tools/testing/selftests/mm/cow.c (renamed from tools/testing/selftests/vm/cow.c)231
-rw-r--r--tools/testing/selftests/mm/gup_test.c (renamed from tools/testing/selftests/vm/gup_test.c)0
-rw-r--r--tools/testing/selftests/mm/hmm-tests.c (renamed from tools/testing/selftests/vm/hmm-tests.c)0
-rw-r--r--tools/testing/selftests/mm/hugepage-mmap.c (renamed from tools/testing/selftests/vm/hugepage-mmap.c)0
-rw-r--r--tools/testing/selftests/mm/hugepage-mremap.c (renamed from tools/testing/selftests/vm/hugepage-mremap.c)0
-rw-r--r--tools/testing/selftests/mm/hugepage-shm.c (renamed from tools/testing/selftests/vm/hugepage-shm.c)0
-rw-r--r--tools/testing/selftests/mm/hugepage-vmemmap.c (renamed from tools/testing/selftests/vm/hugepage-vmemmap.c)0
-rw-r--r--tools/testing/selftests/mm/hugetlb-madvise.c (renamed from tools/testing/selftests/vm/hugetlb-madvise.c)0
-rw-r--r--tools/testing/selftests/mm/hugetlb_reparenting_test.sh (renamed from tools/testing/selftests/vm/hugetlb_reparenting_test.sh)0
-rw-r--r--tools/testing/selftests/mm/khugepaged.c (renamed from tools/testing/selftests/vm/khugepaged.c)0
-rw-r--r--tools/testing/selftests/mm/ksm_functional_tests.c (renamed from tools/testing/selftests/vm/ksm_functional_tests.c)2
-rw-r--r--tools/testing/selftests/mm/ksm_tests.c (renamed from tools/testing/selftests/vm/ksm_tests.c)0
-rw-r--r--tools/testing/selftests/mm/madv_populate.c (renamed from tools/testing/selftests/vm/madv_populate.c)0
-rw-r--r--tools/testing/selftests/mm/map_fixed_noreplace.c (renamed from tools/testing/selftests/vm/map_fixed_noreplace.c)0
-rw-r--r--tools/testing/selftests/mm/map_hugetlb.c (renamed from tools/testing/selftests/vm/map_hugetlb.c)0
-rw-r--r--tools/testing/selftests/mm/map_populate.c (renamed from tools/testing/selftests/vm/map_populate.c)0
-rw-r--r--tools/testing/selftests/mm/memfd_secret.c (renamed from tools/testing/selftests/vm/memfd_secret.c)0
-rw-r--r--tools/testing/selftests/mm/migration.c (renamed from tools/testing/selftests/vm/migration.c)0
-rw-r--r--tools/testing/selftests/mm/mlock-random-test.c (renamed from tools/testing/selftests/vm/mlock-random-test.c)0
-rw-r--r--tools/testing/selftests/mm/mlock2-tests.c (renamed from tools/testing/selftests/vm/mlock2-tests.c)0
-rw-r--r--tools/testing/selftests/mm/mlock2.h (renamed from tools/testing/selftests/vm/mlock2.h)0
-rw-r--r--tools/testing/selftests/mm/mrelease_test.c (renamed from tools/testing/selftests/vm/mrelease_test.c)0
-rw-r--r--tools/testing/selftests/mm/mremap_dontunmap.c (renamed from tools/testing/selftests/vm/mremap_dontunmap.c)0
-rw-r--r--tools/testing/selftests/mm/mremap_test.c (renamed from tools/testing/selftests/vm/mremap_test.c)119
-rw-r--r--tools/testing/selftests/mm/on-fault-limit.c (renamed from tools/testing/selftests/vm/on-fault-limit.c)0
-rw-r--r--tools/testing/selftests/mm/pkey-helpers.h (renamed from tools/testing/selftests/vm/pkey-helpers.h)0
-rw-r--r--tools/testing/selftests/mm/pkey-powerpc.h (renamed from tools/testing/selftests/vm/pkey-powerpc.h)0
-rw-r--r--tools/testing/selftests/mm/pkey-x86.h (renamed from tools/testing/selftests/vm/pkey-x86.h)0
-rw-r--r--tools/testing/selftests/mm/protection_keys.c (renamed from tools/testing/selftests/vm/protection_keys.c)0
-rw-r--r--[-rwxr-xr-x]tools/testing/selftests/mm/run_vmtests.sh (renamed from tools/testing/selftests/vm/run_vmtests.sh)0
-rw-r--r--tools/testing/selftests/mm/settings (renamed from tools/testing/selftests/vm/settings)0
-rw-r--r--tools/testing/selftests/mm/soft-dirty.c (renamed from tools/testing/selftests/vm/soft-dirty.c)0
-rw-r--r--tools/testing/selftests/mm/split_huge_page_test.c (renamed from tools/testing/selftests/vm/split_huge_page_test.c)0
-rw-r--r--[-rwxr-xr-x]tools/testing/selftests/mm/test_hmm.sh (renamed from tools/testing/selftests/vm/test_hmm.sh)0
-rw-r--r--[-rwxr-xr-x]tools/testing/selftests/mm/test_vmalloc.sh (renamed from tools/testing/selftests/vm/test_vmalloc.sh)0
-rw-r--r--tools/testing/selftests/mm/thuge-gen.c (renamed from tools/testing/selftests/vm/thuge-gen.c)0
-rw-r--r--tools/testing/selftests/mm/transhuge-stress.c (renamed from tools/testing/selftests/vm/transhuge-stress.c)0
-rw-r--r--tools/testing/selftests/mm/userfaultfd.c (renamed from tools/testing/selftests/vm/userfaultfd.c)0
-rw-r--r--tools/testing/selftests/mm/util.h (renamed from tools/testing/selftests/vm/util.h)0
-rw-r--r--tools/testing/selftests/mm/va_128TBswitch.c (renamed from tools/testing/selftests/vm/va_128TBswitch.c)0
-rw-r--r--[-rwxr-xr-x]tools/testing/selftests/mm/va_128TBswitch.sh (renamed from tools/testing/selftests/vm/va_128TBswitch.sh)0
-rw-r--r--tools/testing/selftests/mm/virtual_address_range.c (renamed from tools/testing/selftests/vm/virtual_address_range.c)0
-rw-r--r--tools/testing/selftests/mm/vm_util.c (renamed from tools/testing/selftests/vm/vm_util.c)0
-rw-r--r--tools/testing/selftests/mm/vm_util.h (renamed from tools/testing/selftests/vm/vm_util.h)0
-rw-r--r--tools/testing/selftests/mm/write_hugetlb_memory.sh (renamed from tools/testing/selftests/vm/write_hugetlb_memory.sh)0
-rw-r--r--tools/testing/selftests/mm/write_to_hugetlbfs.c (renamed from tools/testing/selftests/vm/write_to_hugetlbfs.c)0
201 files changed, 3744 insertions, 1475 deletions
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon
index 13397b853692..2744f21b5a6b 100644
--- a/Documentation/ABI/testing/sysfs-kernel-mm-damon
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon
@@ -258,6 +258,35 @@ Contact: SeongJae Park <sj@kernel.org>
Description: Writing to and reading from this file sets and gets the low
watermark of the scheme in permil.
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters/nr_filters
+Date: Dec 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing a number 'N' to this file creates the number of
+ directories for setting filters of the scheme named '0' to
+ 'N-1' under the filters/ directory.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters/<F>/type
+Date: Dec 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing to and reading from this file sets and gets the type of
+ the memory of the interest. 'anon' for anonymous pages, or
+ 'memcg' for specific memory cgroup can be written and read.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters/<F>/memcg_path
+Date: Dec 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: If 'memcg' is written to the 'type' file, writing to and
+ reading from this file sets and gets the path to the memory
+ cgroup of the interest.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters/<F>/matching
+Date: Dec 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing 'Y' or 'N' to this file sets whether to filter out
+ pages that do or do not match to the 'type' and 'memcg_path',
+ respectively. Filter out means the action of the scheme will
+ not be applied to.
+
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/stats/nr_tried
Date: Mar 2022
Contact: SeongJae Park <sj@kernel.org>
diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst
index 60370f2c67b9..258e45cc3b2d 100644
--- a/Documentation/admin-guide/cgroup-v1/memory.rst
+++ b/Documentation/admin-guide/cgroup-v1/memory.rst
@@ -86,6 +86,8 @@ Brief summary of control files.
memory.swappiness set/show swappiness parameter of vmscan
(See sysctl's vm.swappiness)
memory.move_charge_at_immigrate set/show controls of moving charges
+ This knob is deprecated and shouldn't be
+ used.
memory.oom_control set/show oom controls.
memory.numa_stat show the number of memory usage per numa
node
@@ -717,8 +719,15 @@ NOTE2:
It is recommended to set the soft limit always below the hard limit,
otherwise the hard limit will take precedence.
-8. Move charges at task migration
-=================================
+8. Move charges at task migration (DEPRECATED!)
+===============================================
+
+THIS IS DEPRECATED!
+
+It's expensive and unreliable! It's better practice to launch workload
+tasks directly from inside their target cgroup. Use dedicated workload
+cgroups to allow fine-grained policy adjustments without having to
+move physical pages between control domains.
Users can move charges associated with a task along with task migration, that
is, uncharge task's pages from the old cgroup and charge them to the new cgroup.
diff --git a/Documentation/admin-guide/mm/damon/reclaim.rst b/Documentation/admin-guide/mm/damon/reclaim.rst
index 4f1479a11e63..ff335e96e0d8 100644
--- a/Documentation/admin-guide/mm/damon/reclaim.rst
+++ b/Documentation/admin-guide/mm/damon/reclaim.rst
@@ -205,6 +205,15 @@ The end physical address of memory region that DAMON_RECLAIM will do work
against. That is, DAMON_RECLAIM will find cold memory regions in this region
and reclaims. By default, biggest System RAM is used as the region.
+skip_anon
+---------
+
+Skip anonymous pages reclamation.
+
+If this parameter is set as ``Y``, DAMON_RECLAIM does not reclaim anonymous
+pages. By default, ``N``.
+
+
kdamond_pid
-----------
diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst
index 1a5b6b71efa1..3d82ca6a17ff 100644
--- a/Documentation/admin-guide/mm/damon/usage.rst
+++ b/Documentation/admin-guide/mm/damon/usage.rst
@@ -87,6 +87,8 @@ comma (","). ::
│ │ │ │ │ │ │ quotas/ms,bytes,reset_interval_ms
│ │ │ │ │ │ │ │ weights/sz_permil,nr_accesses_permil,age_permil
│ │ │ │ │ │ │ watermarks/metric,interval_us,high,mid,low
+ │ │ │ │ │ │ │ filters/nr_filters
+ │ │ │ │ │ │ │ │ 0/type,matching,memcg_id
│ │ │ │ │ │ │ stats/nr_tried,sz_tried,nr_applied,sz_applied,qt_exceeds
│ │ │ │ │ │ │ tried_regions/
│ │ │ │ │ │ │ │ 0/start,end,nr_accesses,age
@@ -151,6 +153,8 @@ number (``N``) to the file creates the number of child directories named as
moment, only one context per kdamond is supported, so only ``0`` or ``1`` can
be written to the file.
+.. _sysfs_contexts:
+
contexts/<N>/
-------------
@@ -268,8 +272,8 @@ schemes/<N>/
------------
In each scheme directory, five directories (``access_pattern``, ``quotas``,
-``watermarks``, ``stats``, and ``tried_regions``) and one file (``action``)
-exist.
+``watermarks``, ``filters``, ``stats``, and ``tried_regions``) and one file
+(``action``) exist.
The ``action`` file is for setting and getting what action you want to apply to
memory regions having specific access pattern of the interest. The keywords
@@ -347,6 +351,46 @@ as below.
The ``interval`` should written in microseconds unit.
+schemes/<N>/filters/
+--------------------
+
+Users could know something more than the kernel for specific types of memory.
+In the case, users could do their own management for the memory and hence
+doesn't want DAMOS bothers that. Users could limit DAMOS by setting the access
+pattern of the scheme and/or the monitoring regions for the purpose, but that
+can be inefficient in some cases. In such cases, users could set non-access
+pattern driven filters using files in this directory.
+
+In the beginning, this directory has only one file, ``nr_filters``. Writing a
+number (``N``) to the file creates the number of child directories named ``0``
+to ``N-1``. Each directory represents each filter. The filters are evaluated
+in the numeric order.
+
+Each filter directory contains three files, namely ``type``, ``matcing``, and
+``memcg_path``. You can write one of two special keywords, ``anon`` for
+anonymous pages, or ``memcg`` for specific memory cgroup filtering. In case of
+the memory cgroup filtering, you can specify the memory cgroup of the interest
+by writing the path of the memory cgroup from the cgroups mount point to
+``memcg_path`` file. You can write ``Y`` or ``N`` to ``matching`` file to
+filter out pages that does or does not match to the type, respectively. Then,
+the scheme's action will not be applied to the pages that specified to be
+filtered out.
+
+For example, below restricts a DAMOS action to be applied to only non-anonymous
+pages of all memory cgroups except ``/having_care_already``.::
+
+ # echo 2 > nr_filters
+ # # filter out anonymous pages
+ echo anon > 0/type
+ echo Y > 0/matching
+ # # further filter out all cgroups except one at '/having_care_already'
+ echo memcg > 1/type
+ echo /having_care_already > 1/memcg_path
+ echo N > 1/matching
+
+Note that filters could be ignored depend on the running DAMON operations set
+`implementation <sysfs_contexts>`.
+
.. _sysfs_schemes_stats:
schemes/<N>/stats/
diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst b/Documentation/admin-guide/mm/hugetlbpage.rst
index 19f27c0d92e0..a969a2c742b2 100644
--- a/Documentation/admin-guide/mm/hugetlbpage.rst
+++ b/Documentation/admin-guide/mm/hugetlbpage.rst
@@ -461,13 +461,13 @@ Examples
.. _map_hugetlb:
``map_hugetlb``
- see tools/testing/selftests/vm/map_hugetlb.c
+ see tools/testing/selftests/mm/map_hugetlb.c
``hugepage-shm``
- see tools/testing/selftests/vm/hugepage-shm.c
+ see tools/testing/selftests/mm/hugepage-shm.c
``hugepage-mmap``
- see tools/testing/selftests/vm/hugepage-mmap.c
+ see tools/testing/selftests/mm/hugepage-mmap.c
The `libhugetlbfs`_ library provides a wide range of userspace tools
to help with huge page usability, environment setup, and control.
diff --git a/Documentation/admin-guide/mm/idle_page_tracking.rst b/Documentation/admin-guide/mm/idle_page_tracking.rst
index df9394fb39c2..19492064278c 100644
--- a/Documentation/admin-guide/mm/idle_page_tracking.rst
+++ b/Documentation/admin-guide/mm/idle_page_tracking.rst
@@ -65,7 +65,7 @@ workload one should:
are not reclaimable, he or she can filter them out using
``/proc/kpageflags``.
-The page-types tool in the tools/vm directory can be used to assist in this.
+The page-types tool in the tools/mm directory can be used to assist in this.
If the tool is run initially with the appropriate option, it will mark all the
queried pages as idle. Subsequent runs of the tool can then show which pages have
their idle flag cleared in the interim.
diff --git a/Documentation/admin-guide/mm/numaperf.rst b/Documentation/admin-guide/mm/numaperf.rst
index 166697325947..544a6d16c801 100644
--- a/Documentation/admin-guide/mm/numaperf.rst
+++ b/Documentation/admin-guide/mm/numaperf.rst
@@ -1,6 +1,9 @@
.. _numaperf:
-=============
+=======================
+NUMA Memory Performance
+=======================
+
NUMA Locality
=============
@@ -61,7 +64,6 @@ that are CPUs and hence suitable for generic task scheduling, and
IO initiators such as GPUs and NICs. Unlike access class 0, only
nodes containing CPUs are considered.
-================
NUMA Performance
================
@@ -96,7 +98,6 @@ for the platform.
Access class 1 takes the same form but only includes values for CPU to
memory activity.
-==========
NUMA Cache
==========
@@ -170,7 +171,6 @@ The "size" is the number of bytes provided by this cache level.
The "write_policy" will be 0 for write-back, and non-zero for
write-through caching.
-========
See Also
========
diff --git a/Documentation/admin-guide/mm/pagemap.rst b/Documentation/admin-guide/mm/pagemap.rst
index 6e2e416af783..ceb5da3172ba 100644
--- a/Documentation/admin-guide/mm/pagemap.rst
+++ b/Documentation/admin-guide/mm/pagemap.rst
@@ -46,7 +46,7 @@ There are four components to pagemap:
* ``/proc/kpagecount``. This file contains a 64-bit count of the number of
times each page is mapped, indexed by PFN.
-The page-types tool in the tools/vm directory can be used to query the
+The page-types tool in the tools/mm directory can be used to query the
number of times a page is mapped.
* ``/proc/kpageflags``. This file contains a 64-bit set of flags for each
@@ -173,7 +173,7 @@ LRU related page flags
14 - SWAPBACKED
The page is backed by swap/RAM.
-The page-types tool in the tools/vm directory can be used to query the
+The page-types tool in the tools/mm directory can be used to query the
above flags.
Using pagemap to do something useful
diff --git a/Documentation/core-api/pin_user_pages.rst b/Documentation/core-api/pin_user_pages.rst
index b18416f4500f..facafbdecb95 100644
--- a/Documentation/core-api/pin_user_pages.rst
+++ b/Documentation/core-api/pin_user_pages.rst
@@ -221,7 +221,7 @@ Unit testing
============
This file::
- tools/testing/selftests/vm/gup_test.c
+ tools/testing/selftests/mm/gup_test.c
has the following new calls to exercise the new pin*() wrapper functions:
diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst
index 5c93ab915049..e66916a483cd 100644
--- a/Documentation/dev-tools/kasan.rst
+++ b/Documentation/dev-tools/kasan.rst
@@ -140,6 +140,23 @@ disabling KASAN altogether or controlling its features:
- ``kasan.vmalloc=off`` or ``=on`` disables or enables tagging of vmalloc
allocations (default: ``on``).
+- ``kasan.page_alloc.sample=<sampling interval>`` makes KASAN tag only every
+ Nth page_alloc allocation with the order equal or greater than
+ ``kasan.page_alloc.sample.order``, where N is the value of the ``sample``
+ parameter (default: ``1``, or tag every such allocation).
+ This parameter is intended to mitigate the performance overhead introduced
+ by KASAN.
+ Note that enabling this parameter makes Hardware Tag-Based KASAN skip checks
+ of allocations chosen by sampling and thus miss bad accesses to these
+ allocations. Use the default value for accurate bug detection.
+
+- ``kasan.page_alloc.sample.order=<minimum page order>`` specifies the minimum
+ order of allocations that are affected by sampling (default: ``3``).
+ Only applies when ``kasan.page_alloc.sample`` is set to a value greater
+ than ``1``.
+ This parameter is intended to allow sampling only large page_alloc
+ allocations, which is the biggest source of the performance overhead.
+
Error reports
~~~~~~~~~~~~~
diff --git a/Documentation/mm/multigen_lru.rst b/Documentation/mm/multigen_lru.rst
index d7062c6a8946..d8f721f98868 100644
--- a/Documentation/mm/multigen_lru.rst
+++ b/Documentation/mm/multigen_lru.rst
@@ -89,15 +89,15 @@ variables are monotonically increasing.
Generation numbers are truncated into ``order_base_2(MAX_NR_GENS+1)``
bits in order to fit into the gen counter in ``folio->flags``. Each
-truncated generation number is an index to ``lrugen->lists[]``. The
+truncated generation number is an index to ``lrugen->folios[]``. The
sliding window technique is used to track at least ``MIN_NR_GENS`` and
at most ``MAX_NR_GENS`` generations. The gen counter stores a value
within ``[1, MAX_NR_GENS]`` while a page is on one of
-``lrugen->lists[]``; otherwise it stores zero.
+``lrugen->folios[]``; otherwise it stores zero.
Each generation is divided into multiple tiers. A page accessed ``N``
times through file descriptors is in tier ``order_base_2(N)``. Unlike
-generations, tiers do not have dedicated ``lrugen->lists[]``. In
+generations, tiers do not have dedicated ``lrugen->folios[]``. In
contrast to moving across generations, which requires the LRU lock,
moving across tiers only involves atomic operations on
``folio->flags`` and therefore has a negligible cost. A feedback loop
@@ -127,7 +127,7 @@ page mapped by this PTE to ``(max_seq%MAX_NR_GENS)+1``.
Eviction
--------
The eviction consumes old generations. Given an ``lruvec``, it
-increments ``min_seq`` when ``lrugen->lists[]`` indexed by
+increments ``min_seq`` when ``lrugen->folios[]`` indexed by
``min_seq%MAX_NR_GENS`` becomes empty. To select a type and a tier to
evict from, it first compares ``min_seq[]`` to select the older type.
If both types are equally old, it selects the one whose first tier has
diff --git a/Documentation/mm/page_owner.rst b/Documentation/mm/page_owner.rst
index 127514955a5e..5df26c0a0c1f 100644
--- a/Documentation/mm/page_owner.rst
+++ b/Documentation/mm/page_owner.rst
@@ -61,7 +61,7 @@ Usage
1) Build user-space helper::
- cd tools/vm
+ cd tools/mm
make page_owner_sort
2) Enable page owner: add "page_owner=on" to boot cmdline.
diff --git a/Documentation/mm/slub.rst b/Documentation/mm/slub.rst
index 7f652216dabe..3ffa7eded251 100644
--- a/Documentation/mm/slub.rst
+++ b/Documentation/mm/slub.rst
@@ -21,7 +21,7 @@ slabs that have data in them. See "slabinfo -h" for more options when
running the command. ``slabinfo`` can be compiled with
::
- gcc -o slabinfo tools/vm/slabinfo.c
+ gcc -o slabinfo tools/mm/slabinfo.c
Some of the modes of operation of ``slabinfo`` require that slub debugging
be enabled on the command line. F.e. no tracking information will be
diff --git a/Documentation/translations/zh_CN/mm/page_owner.rst b/Documentation/translations/zh_CN/mm/page_owner.rst
index 21a6a0837d42..4d3b2c33e4ef 100644
--- a/Documentation/translations/zh_CN/mm/page_owner.rst
+++ b/Documentation/translations/zh_CN/mm/page_owner.rst
@@ -62,7 +62,7 @@ page owner在默认情况下是ç¦ç”¨çš„。所以,如果你想使用它,你é
1) 构建用户空间的帮助::
- cd tools/vm
+ cd tools/mm
make page_owner_sort
2) å¯ç”¨page owner: 添加 "page_owner=on" 到 boot cmdline.
diff --git a/MAINTAINERS b/MAINTAINERS
index 123216b76534..8ac1472bea34 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9466,7 +9466,7 @@ F: Documentation/mm/hmm.rst
F: include/linux/hmm*
F: lib/test_hmm*
F: mm/hmm*
-F: tools/testing/selftests/vm/*hmm*
+F: tools/testing/selftests/mm/*hmm*
HOST AP DRIVER
M: Jouni Malinen <j@w1.fi>
@@ -13474,7 +13474,7 @@ M: Andrew Morton <akpm@linux-foundation.org>
L: linux-mm@kvack.org
S: Maintained
W: http://www.linux-mm.org
-T: git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
T: quilt git://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new
F: include/linux/gfp.h
F: include/linux/gfp_types.h
@@ -13483,7 +13483,8 @@ F: include/linux/mm.h
F: include/linux/mmzone.h
F: include/linux/pagewalk.h
F: mm/
-F: tools/testing/selftests/vm/
+F: tools/mm/
+F: tools/testing/selftests/mm/
VMALLOC
M: Andrew Morton <akpm@linux-foundation.org>
@@ -13492,7 +13493,7 @@ R: Christoph Hellwig <hch@infradead.org>
L: linux-mm@kvack.org
S: Maintained
W: http://www.linux-mm.org
-T: git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
F: include/linux/vmalloc.h
F: mm/vmalloc.c
diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c
index e59a32aa0c49..0119dc91abb5 100644
--- a/arch/arm64/kernel/vdso.c
+++ b/arch/arm64/kernel/vdso.c
@@ -138,13 +138,11 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
mmap_read_lock(mm);
for_each_vma(vmi, vma) {
- unsigned long size = vma->vm_end - vma->vm_start;
-
if (vma_is_special_mapping(vma, vdso_info[VDSO_ABI_AA64].dm))
- zap_page_range(vma, vma->vm_start, size);
+ zap_vma_pages(vma);
#ifdef CONFIG_COMPAT_VDSO
if (vma_is_special_mapping(vma, vdso_info[VDSO_ABI_AA32].dm))
- zap_page_range(vma, vma->vm_start, size);
+ zap_vma_pages(vma);
#endif
}
diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index 507f8228f983..7a2ff9010f17 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -120,10 +120,8 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
mmap_read_lock(mm);
for_each_vma(vmi, vma) {
- unsigned long size = vma->vm_end - vma->vm_start;
-
if (vma_is_special_mapping(vma, &vvar_spec))
- zap_page_range(vma, vma->vm_start, size);
+ zap_vma_pages(vma);
}
mmap_read_unlock(mm);
diff --git a/arch/powerpc/platforms/book3s/vas-api.c b/arch/powerpc/platforms/book3s/vas-api.c
index eb5bed333750..9580e8e12165 100644
--- a/arch/powerpc/platforms/book3s/vas-api.c
+++ b/arch/powerpc/platforms/book3s/vas-api.c
@@ -414,7 +414,7 @@ static vm_fault_t vas_mmap_fault(struct vm_fault *vmf)
/*
* When the LPAR lost credits due to core removal or during
* migration, invalidate the existing mapping for the current
- * paste addresses and set windows in-active (zap_page_range in
+ * paste addresses and set windows in-active (zap_vma_pages in
* reconfig_close_windows()).
* New mapping will be done later after migration or new credits
* available. So continue to receive faults if the user space
diff --git a/arch/powerpc/platforms/pseries/vas.c b/arch/powerpc/platforms/pseries/vas.c
index 4ad6e510d405..559112312810 100644
--- a/arch/powerpc/platforms/pseries/vas.c
+++ b/arch/powerpc/platforms/pseries/vas.c
@@ -760,8 +760,7 @@ static int reconfig_close_windows(struct vas_caps *vcap, int excess_creds,
* is done before the original mmap() and after the ioctl.
*/
if (vma)
- zap_page_range(vma, vma->vm_start,
- vma->vm_end - vma->vm_start);
+ zap_vma_pages(vma);
mmap_write_unlock(task_ref->mm);
mutex_unlock(&task_ref->mmap_mutex);
diff --git a/arch/riscv/kernel/vdso.c b/arch/riscv/kernel/vdso.c
index e410275918ac..5c30212d8d1c 100644
--- a/arch/riscv/kernel/vdso.c
+++ b/arch/riscv/kernel/vdso.c
@@ -124,13 +124,11 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
mmap_read_lock(mm);
for_each_vma(vmi, vma) {
- unsigned long size = vma->vm_end - vma->vm_start;
-
if (vma_is_special_mapping(vma, vdso_info.dm))
- zap_page_range(vma, vma->vm_start, size);
+ zap_vma_pages(vma);
#ifdef CONFIG_COMPAT
if (vma_is_special_mapping(vma, compat_vdso_info.dm))
- zap_page_range(vma, vma->vm_start, size);
+ zap_vma_pages(vma);
#endif
}
diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c
index ff7bf4432229..bbaefd84f15e 100644
--- a/arch/s390/kernel/vdso.c
+++ b/arch/s390/kernel/vdso.c
@@ -59,11 +59,9 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
mmap_read_lock(mm);
for_each_vma(vmi, vma) {
- unsigned long size = vma->vm_end - vma->vm_start;
-
if (!vma_is_special_mapping(vma, &vvar_mapping))
continue;
- zap_page_range(vma, vma->vm_start, size);
+ zap_vma_pages(vma);
break;
}
mmap_read_unlock(mm);
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 74e1d873dce0..69af6cdf1a2a 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -722,7 +722,7 @@ void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
if (is_vm_hugetlb_page(vma))
continue;
size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
- zap_page_range(vma, vmaddr, size);
+ zap_page_range_single(vma, vmaddr, size, NULL);
}
mmap_read_unlock(gmap->mm);
}
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index b8f3f9b9e53c..ec5e4d2048cb 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -113,10 +113,8 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
mmap_read_lock(mm);
for_each_vma(vmi, vma) {
- unsigned long size = vma->vm_end - vma->vm_start;
-
if (vma_is_special_mapping(vma, &vvar_mapping))
- zap_page_range(vma, vma->vm_start, size);
+ zap_vma_pages(vma);
}
mmap_read_unlock(mm);
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 0564edd24ffb..1c843395a8b3 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -289,6 +289,11 @@ static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear)
return native_make_pte(v & ~clear);
}
+static inline pte_t pte_wrprotect(pte_t pte)
+{
+ return pte_clear_flags(pte, _PAGE_RW);
+}
+
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
static inline int pte_uffd_wp(pte_t pte)
{
@@ -313,7 +318,7 @@ static inline int pte_uffd_wp(pte_t pte)
static inline pte_t pte_mkuffd_wp(pte_t pte)
{
- return pte_set_flags(pte, _PAGE_UFFD_WP);
+ return pte_wrprotect(pte_set_flags(pte, _PAGE_UFFD_WP));
}
static inline pte_t pte_clear_uffd_wp(pte_t pte)
@@ -332,11 +337,6 @@ static inline pte_t pte_mkold(pte_t pte)
return pte_clear_flags(pte, _PAGE_ACCESSED);
}
-static inline pte_t pte_wrprotect(pte_t pte)
-{
- return pte_clear_flags(pte, _PAGE_RW);
-}
-
static inline pte_t pte_mkexec(pte_t pte)
{
return pte_clear_flags(pte, _PAGE_NX);
@@ -401,6 +401,11 @@ static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
return native_make_pmd(v & ~clear);
}
+static inline pmd_t pmd_wrprotect(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_RW);
+}
+
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
static inline int pmd_uffd_wp(pmd_t pmd)
{
@@ -409,7 +414,7 @@ static inline int pmd_uffd_wp(pmd_t pmd)
static inline pmd_t pmd_mkuffd_wp(pmd_t pmd)
{
- return pmd_set_flags(pmd, _PAGE_UFFD_WP);
+ return pmd_wrprotect(pmd_set_flags(pmd, _PAGE_UFFD_WP));
}
static inline pmd_t pmd_clear_uffd_wp(pmd_t pmd)
@@ -428,11 +433,6 @@ static inline pmd_t pmd_mkclean(pmd_t pmd)
return pmd_clear_flags(pmd, _PAGE_DIRTY);
}
-static inline pmd_t pmd_wrprotect(pmd_t pmd)
-{
- return pmd_clear_flags(pmd, _PAGE_RW);
-}
-
static inline pmd_t pmd_mkdirty(pmd_t pmd)
{
return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c
index 4ad42b0f75cd..55a3c3c2409f 100644
--- a/drivers/android/binder_alloc.c
+++ b/drivers/android/binder_alloc.c
@@ -1019,7 +1019,7 @@ enum lru_status binder_alloc_free_page(struct list_head *item,
if (vma) {
trace_binder_unmap_user_start(alloc, index);
- zap_page_range(vma, page_addr, PAGE_SIZE);
+ zap_page_range_single(vma, page_addr, PAGE_SIZE, NULL);
trace_binder_unmap_user_end(alloc, index);
}
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index e290d6d97047..5d1088a645e3 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -190,7 +190,7 @@ static inline bool valid_io_request(struct zram *zram,
end = start + (size >> SECTOR_SHIFT);
bound = zram->disksize >> SECTOR_SHIFT;
- /* out of range range */
+ /* out of range */
if (unlikely(start >= bound || end > bound || start > end))
return false;
@@ -1140,7 +1140,7 @@ static ssize_t recomp_algorithm_store(struct device *dev,
while (*args) {
args = next_arg(args, &param, &val);
- if (!*val)
+ if (!val || !*val)
return -EINVAL;
if (!strcmp(param, "algo")) {
@@ -1824,7 +1824,7 @@ static ssize_t recompress_store(struct device *dev,
while (*args) {
args = next_arg(args, &param, &val);
- if (!*val)
+ if (!val || !*val)
return -EINVAL;
if (!strcmp(param, "type")) {
@@ -2385,7 +2385,7 @@ static int zram_add(void)
zram->disk->private_data = zram;
snprintf(zram->disk->disk_name, 16, "zram%d", device_id);
- /* Actual capacity set using syfs (/sys/block/zram<id>/disksize */
+ /* Actual capacity set using sysfs (/sys/block/zram<id>/disksize */
set_capacity(zram->disk, 0);
/* zram devices sort of resembles non-rotational disks */
blk_queue_flag_set(QUEUE_FLAG_NONROT, zram->disk->queue);
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 83bf2a4dcb57..ffb101d349f0 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -343,7 +343,7 @@ static unsigned zero_mmap_capabilities(struct file *file)
/* can't do an in-place private mapping if there's no MMU */
static inline int private_mapping_ok(struct vm_area_struct *vma)
{
- return vma->vm_flags & VM_MAYSHARE;
+ return is_nommu_shared_mapping(vma->vm_flags);
}
#else
diff --git a/drivers/misc/open-dice.c b/drivers/misc/open-dice.c
index c61be3404c6f..9dda47b3fd70 100644
--- a/drivers/misc/open-dice.c
+++ b/drivers/misc/open-dice.c
@@ -90,15 +90,13 @@ static int open_dice_mmap(struct file *filp, struct vm_area_struct *vma)
{
struct open_dice_drvdata *drvdata = to_open_dice_drvdata(filp);
- /* Do not allow userspace to modify the underlying data. */
- if ((vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_SHARED))
- return -EPERM;
-
- /* Ensure userspace cannot acquire VM_WRITE + VM_SHARED later. */
- if (vma->vm_flags & VM_WRITE)
- vma->vm_flags &= ~VM_MAYSHARE;
- else if (vma->vm_flags & VM_SHARED)
+ if (vma->vm_flags & VM_MAYSHARE) {
+ /* Do not allow userspace to modify the underlying data. */
+ if (vma->vm_flags & VM_WRITE)
+ return -EPERM;
+ /* Ensure userspace cannot acquire VM_WRITE later. */
vma->vm_flags &= ~VM_MAYWRITE;
+ }
/* Create write-combine mapping so all clients observe a wipe. */
vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
diff --git a/fs/buffer.c b/fs/buffer.c
index d9c6d1fbb6dd..7e42d67bcaad 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -60,7 +60,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
inline void touch_buffer(struct buffer_head *bh)
{
trace_block_touch_buffer(bh);
- mark_page_accessed(bh->b_page);
+ folio_mark_accessed(bh->b_folio);
}
EXPORT_SYMBOL(touch_buffer);
@@ -246,18 +246,18 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
unsigned long flags;
struct buffer_head *first;
struct buffer_head *tmp;
- struct page *page;
- int page_uptodate = 1;
+ struct folio *folio;
+ int folio_uptodate = 1;
BUG_ON(!buffer_async_read(bh));
- page = bh->b_page;
+ folio = bh->b_folio;
if (uptodate) {
set_buffer_uptodate(bh);
} else {
clear_buffer_uptodate(bh);
buffer_io_error(bh, ", async page read");
- SetPageError(page);
+ folio_set_error(folio);
}
/*
@@ -265,14 +265,14 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
* two buffer heads end IO at almost the same time and both
* decide that the page is now completely done.
*/
- first = page_buffers(page);
+ first = folio_buffers(folio);
spin_lock_irqsave(&first->b_uptodate_lock, flags);
clear_buffer_async_read(bh);
unlock_buffer(bh);
tmp = bh;
do {
if (!buffer_uptodate(tmp))
- page_uptodate = 0;
+ folio_uptodate = 0;
if (buffer_async_read(tmp)) {
BUG_ON(!buffer_locked(tmp));
goto still_busy;
@@ -285,9 +285,9 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
* If all of the buffers are uptodate then we can set the page
* uptodate.
*/
- if (page_uptodate)
- SetPageUptodate(page);
- unlock_page(page);
+ if (folio_uptodate)
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
return;
still_busy:
@@ -321,7 +321,7 @@ static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate)
{
/* Decrypt if needed */
if (uptodate &&
- fscrypt_inode_uses_fs_layer_crypto(bh->b_page->mapping->host)) {
+ fscrypt_inode_uses_fs_layer_crypto(bh->b_folio->mapping->host)) {
struct decrypt_bh_ctx *ctx = kmalloc(sizeof(*ctx), GFP_ATOMIC);
if (ctx) {
@@ -344,21 +344,21 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
unsigned long flags;
struct buffer_head *first;
struct buffer_head *tmp;
- struct page *page;
+ struct folio *folio;
BUG_ON(!buffer_async_write(bh));
- page = bh->b_page;
+ folio = bh->b_folio;
if (uptodate) {
set_buffer_uptodate(bh);
} else {
buffer_io_error(bh, ", lost async page write");
mark_buffer_write_io_error(bh);
clear_buffer_uptodate(bh);
- SetPageError(page);
+ folio_set_error(folio);
}
- first = page_buffers(page);
+ first = folio_buffers(folio);
spin_lock_irqsave(&first->b_uptodate_lock, flags);
clear_buffer_async_write(bh);
@@ -372,7 +372,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
tmp = tmp->b_this_page;
}
spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
- end_page_writeback(page);
+ folio_end_writeback(folio);
return;
still_busy:
@@ -570,7 +570,7 @@ void write_boundary_block(struct block_device *bdev,
void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
{
struct address_space *mapping = inode->i_mapping;
- struct address_space *buffer_mapping = bh->b_page->mapping;
+ struct address_space *buffer_mapping = bh->b_folio->mapping;
mark_buffer_dirty(bh);
if (!mapping->private_data) {
@@ -1073,7 +1073,7 @@ __getblk_slow(struct block_device *bdev, sector_t block,
* and then attach the address_space's inode to its superblock's dirty
* inode list.
*
- * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
+ * mark_buffer_dirty() is atomic. It takes bh->b_folio->mapping->private_lock,
* i_pages lock and mapping->host->i_lock.
*/
void mark_buffer_dirty(struct buffer_head *bh)
@@ -1095,16 +1095,16 @@ void mark_buffer_dirty(struct buffer_head *bh)
}
if (!test_set_buffer_dirty(bh)) {
- struct page *page = bh->b_page;
+ struct folio *folio = bh->b_folio;
struct address_space *mapping = NULL;
- lock_page_memcg(page);
- if (!TestSetPageDirty(page)) {
- mapping = page_mapping(page);
+ folio_memcg_lock(folio);
+ if (!folio_test_set_dirty(folio)) {
+ mapping = folio->mapping;
if (mapping)
- __set_page_dirty(page, mapping, 0);
+ __folio_mark_dirty(folio, mapping, 0);
}
- unlock_page_memcg(page);
+ folio_memcg_unlock(folio);
if (mapping)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}
@@ -1117,8 +1117,8 @@ void mark_buffer_write_io_error(struct buffer_head *bh)
set_buffer_write_io_error(bh);
/* FIXME: do we need to set this in both places? */
- if (bh->b_page && bh->b_page->mapping)
- mapping_set_error(bh->b_page->mapping, -EIO);
+ if (bh->b_folio && bh->b_folio->mapping)
+ mapping_set_error(bh->b_folio->mapping, -EIO);
if (bh->b_assoc_map)
mapping_set_error(bh->b_assoc_map, -EIO);
rcu_read_lock();
@@ -1154,7 +1154,7 @@ void __bforget(struct buffer_head *bh)
{
clear_buffer_dirty(bh);
if (bh->b_assoc_map) {
- struct address_space *buffer_mapping = bh->b_page->mapping;
+ struct address_space *buffer_mapping = bh->b_folio->mapping;
spin_lock(&buffer_mapping->private_lock);
list_del_init(&bh->b_assoc_buffers);
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 61ccf7722fc3..50e4e060db68 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -437,7 +437,7 @@ bailout:
static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma)
{
- return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS;
+ return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -ENOSYS;
}
static unsigned long cramfs_physmem_get_unmapped_area(struct file *file,
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 8dbb87edf24c..2de9829aed63 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -110,22 +110,23 @@ out:
}
/**
- * mext_page_double_lock - Grab and lock pages on both @inode1 and @inode2
+ * mext_folio_double_lock - Grab and lock folio on both @inode1 and @inode2
*
* @inode1: the inode structure
* @inode2: the inode structure
- * @index1: page index
- * @index2: page index
- * @page: result page vector
+ * @index1: folio index
+ * @index2: folio index
+ * @folio: result folio vector
*
- * Grab two locked pages for inode's by inode order
+ * Grab two locked folio for inode's by inode order
*/
static int
-mext_page_double_lock(struct inode *inode1, struct inode *inode2,
- pgoff_t index1, pgoff_t index2, struct page *page[2])
+mext_folio_double_lock(struct inode *inode1, struct inode *inode2,
+ pgoff_t index1, pgoff_t index2, struct folio *folio[2])
{
struct address_space *mapping[2];
unsigned int flags;
+ unsigned fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE;
BUG_ON(!inode1 || !inode2);
if (inode1 < inode2) {
@@ -138,28 +139,30 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2,
}
flags = memalloc_nofs_save();
- page[0] = grab_cache_page_write_begin(mapping[0], index1);
- if (!page[0]) {
+ folio[0] = __filemap_get_folio(mapping[0], index1, fgp_flags,
+ mapping_gfp_mask(mapping[0]));
+ if (!folio[0]) {
memalloc_nofs_restore(flags);
return -ENOMEM;
}
- page[1] = grab_cache_page_write_begin(mapping[1], index2);
+ folio[1] = __filemap_get_folio(mapping[1], index2, fgp_flags,
+ mapping_gfp_mask(mapping[1]));
memalloc_nofs_restore(flags);
- if (!page[1]) {
- unlock_page(page[0]);
- put_page(page[0]);
+ if (!folio[1]) {
+ folio_unlock(folio[0]);
+ folio_put(folio[0]);
return -ENOMEM;
}
/*
- * grab_cache_page_write_begin() may not wait on page's writeback if
+ * __filemap_get_folio() may not wait on folio's writeback if
* BDI not demand that. But it is reasonable to be very conservative
- * here and explicitly wait on page's writeback
+ * here and explicitly wait on folio's writeback
*/
- wait_on_page_writeback(page[0]);
- wait_on_page_writeback(page[1]);
+ folio_wait_writeback(folio[0]);
+ folio_wait_writeback(folio[1]);
if (inode1 > inode2)
- swap(page[0], page[1]);
+ swap(folio[0], folio[1]);
return 0;
}
@@ -252,7 +255,6 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
int block_len_in_page, int unwritten, int *err)
{
struct inode *orig_inode = file_inode(o_filp);
- struct page *pagep[2] = {NULL, NULL};
struct folio *folio[2] = {NULL, NULL};
handle_t *handle;
ext4_lblk_t orig_blk_offset, donor_blk_offset;
@@ -303,8 +305,8 @@ again:
replaced_size = data_size;
- *err = mext_page_double_lock(orig_inode, donor_inode, orig_page_offset,
- donor_page_offset, pagep);
+ *err = mext_folio_double_lock(orig_inode, donor_inode, orig_page_offset,
+ donor_page_offset, folio);
if (unlikely(*err < 0))
goto stop_journal;
/*
@@ -314,8 +316,6 @@ again:
* hold page's lock, if it is still the case data copy is not
* necessary, just swap data blocks between orig and donor.
*/
- folio[0] = page_folio(pagep[0]);
- folio[1] = page_folio(pagep[1]);
VM_BUG_ON_FOLIO(folio_test_large(folio[0]), folio[0]);
VM_BUG_ON_FOLIO(folio_test_large(folio[1]), folio[1]);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index d78b61ecc1cd..081422644ec5 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -39,7 +39,7 @@ static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh)
"AIL buffer %p: blocknr %llu state 0x%08lx mapping %p page "
"state 0x%lx\n",
bh, (unsigned long long)bh->b_blocknr, bh->b_state,
- bh->b_page->mapping, bh->b_page->flags);
+ bh->b_folio->mapping, bh->b_folio->flags);
fs_err(sdp, "AIL glock %u:%llu mapping %p\n",
gl->gl_name.ln_type, gl->gl_name.ln_number,
gfs2_glock2aspace(gl));
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 723639376ae2..1fcc829f02ab 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -127,7 +127,7 @@ __acquires(&sdp->sd_ail_lock)
continue;
gl = bd->bd_gl;
list_move(&bd->bd_ail_st_list, &tr->tr_ail1_list);
- mapping = bh->b_page->mapping;
+ mapping = bh->b_folio->mapping;
if (!mapping)
continue;
spin_unlock(&sdp->sd_ail_lock);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 3c41b864ee5b..924361fa510b 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -334,7 +334,7 @@ int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh)
void gfs2_remove_from_journal(struct buffer_head *bh, int meta)
{
- struct address_space *mapping = bh->b_page->mapping;
+ struct address_space *mapping = bh->b_folio->mapping;
struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping);
struct gfs2_bufdata *bd = bh->b_private;
struct gfs2_trans *tr = current->journal_info;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 790d2727141a..48f1a8ad2243 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -388,9 +388,7 @@ static bool hugetlb_vma_maps_page(struct vm_area_struct *vma,
{
pte_t *ptep, pte;
- ptep = huge_pte_offset(vma->vm_mm, addr,
- huge_page_size(hstate_vma(vma)));
-
+ ptep = hugetlb_walk(vma, addr, huge_page_size(hstate_vma(vma)));
if (!ptep)
return false;
@@ -412,10 +410,12 @@ static bool hugetlb_vma_maps_page(struct vm_area_struct *vma,
*/
static unsigned long vma_offset_start(struct vm_area_struct *vma, pgoff_t start)
{
+ unsigned long offset = 0;
+
if (vma->vm_pgoff < start)
- return (start - vma->vm_pgoff) << PAGE_SHIFT;
- else
- return 0;
+ offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
+
+ return vma->vm_start + offset;
}
static unsigned long vma_offset_end(struct vm_area_struct *vma, pgoff_t end)
@@ -457,7 +457,7 @@ retry:
v_start = vma_offset_start(vma, start);
v_end = vma_offset_end(vma, end);
- if (!hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page))
+ if (!hugetlb_vma_maps_page(vma, v_start, page))
continue;
if (!hugetlb_vma_trylock_write(vma)) {
@@ -473,8 +473,8 @@ retry:
break;
}
- unmap_hugepage_range(vma, vma->vm_start + v_start, v_end,
- NULL, ZAP_FLAG_DROP_MARKER);
+ unmap_hugepage_range(vma, v_start, v_end, NULL,
+ ZAP_FLAG_DROP_MARKER);
hugetlb_vma_unlock_write(vma);
}
@@ -507,10 +507,9 @@ retry:
*/
v_start = vma_offset_start(vma, start);
v_end = vma_offset_end(vma, end);
- if (hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page))
- unmap_hugepage_range(vma, vma->vm_start + v_start,
- v_end, NULL,
- ZAP_FLAG_DROP_MARKER);
+ if (hugetlb_vma_maps_page(vma, v_start, page))
+ unmap_hugepage_range(vma, v_start, v_end, NULL,
+ ZAP_FLAG_DROP_MARKER);
kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
hugetlb_vma_unlock_write(vma);
@@ -540,8 +539,7 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
v_start = vma_offset_start(vma, start);
v_end = vma_offset_end(vma, end);
- unmap_hugepage_range(vma, vma->vm_start + v_start, v_end,
- NULL, zap_flags);
+ unmap_hugepage_range(vma, v_start, v_end, NULL, zap_flags);
/*
* Note that vma lock only exists for shared/non-private
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 4810438b7856..b33155dd7001 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -63,16 +63,12 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
static void release_buffer_page(struct buffer_head *bh)
{
struct folio *folio;
- struct page *page;
if (buffer_dirty(bh))
goto nope;
if (atomic_read(&bh->b_count) != 1)
goto nope;
- page = bh->b_page;
- if (!page)
- goto nope;
- folio = page_folio(page);
+ folio = bh->b_folio;
if (folio->mapping)
goto nope;
@@ -181,31 +177,6 @@ static int journal_wait_on_commit_record(journal_t *journal,
return ret;
}
-/*
- * write the filemap data using writepage() address_space_operations.
- * We don't do block allocation here even for delalloc. We don't
- * use writepages() because with delayed allocation we may be doing
- * block allocation in writepages().
- */
-int jbd2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode)
-{
- struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_ALL,
- .nr_to_write = mapping->nrpages * 2,
- .range_start = jinode->i_dirty_start,
- .range_end = jinode->i_dirty_end,
- };
-
- /*
- * submit the inode data buffers. We use writepage
- * instead of writepages. Because writepages can do
- * block allocation with delalloc. We need to write
- * only allocated blocks here.
- */
- return generic_writepages(mapping, &wbc);
-}
-
/* Send all the data buffers related to an inode */
int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode)
{
@@ -1040,7 +1011,7 @@ restart_loop:
* already detached from the mapping and buffers cannot
* get reused.
*/
- mapping = READ_ONCE(bh->b_page->mapping);
+ mapping = READ_ONCE(bh->b_folio->mapping);
if (mapping && !sb_is_blkdev_sb(mapping->host->i_sb)) {
clear_buffer_mapped(bh);
clear_buffer_new(bh);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 2696f43e7239..e80c781731f8 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -89,7 +89,6 @@ EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
EXPORT_SYMBOL(jbd2_journal_force_commit);
EXPORT_SYMBOL(jbd2_journal_inode_ranged_write);
EXPORT_SYMBOL(jbd2_journal_inode_ranged_wait);
-EXPORT_SYMBOL(jbd2_journal_submit_inode_data_buffers);
EXPORT_SYMBOL(jbd2_journal_finish_inode_data_buffers);
EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
@@ -2938,7 +2937,7 @@ repeat:
} else {
J_ASSERT_BH(bh,
(atomic_read(&bh->b_count) > 0) ||
- (bh->b_page && bh->b_page->mapping));
+ (bh->b_folio && bh->b_folio->mapping));
if (!new_jh) {
jbd_unlock_bh_journal_head(bh);
diff --git a/fs/mpage.c b/fs/mpage.c
index 0f8ae954a579..b8e7975159bc 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -198,7 +198,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
/*
* Then do more get_blocks calls until we are done with this folio.
*/
- map_bh->b_page = &folio->page;
+ map_bh->b_folio = folio;
while (page_block < blocks_per_page) {
map_bh->b_state = 0;
map_bh->b_size = 0;
@@ -524,6 +524,12 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
*/
BUG_ON(!PageUptodate(page));
block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
+ /*
+ * Whole page beyond EOF? Skip allocating blocks to avoid leaking
+ * space.
+ */
+ if (block_in_file >= (i_size + (1 << blkbits) - 1) >> blkbits)
+ goto page_is_mapped;
last_block = (i_size - 1) >> blkbits;
map_bh.b_page = page;
for (page_block = 0; page_block < blocks_per_page; ) {
@@ -641,14 +647,6 @@ out:
*
* This is a library function, which implements the writepages()
* address_space_operation.
- *
- * If a page is already under I/O, generic_writepages() skips it, even
- * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
- * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
- * and msync() need to guarantee that all the data which was dirty at the time
- * the call was made get new I/O started against them. If wbc->sync_mode is
- * WB_SYNC_ALL then we were called for data integrity and we must wait for
- * existing IO to complete.
*/
int
mpage_writepages(struct address_space *mapping,
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index e74fda212620..e956f886a1a1 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -188,7 +188,7 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc,
struct page *opage = obh->b_page;
lock_page(opage);
retry:
- /* BUG_ON(oldkey != obh->b_page->index); */
+ /* BUG_ON(oldkey != obh->b_folio->index); */
if (unlikely(oldkey != opage->index))
NILFS_PAGE_BUG(opage,
"invalid oldkey %lld (newkey=%lld)",
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 40ce92a332fe..b5f997e5e670 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -398,7 +398,7 @@ int nilfs_btree_broken_node_block(struct buffer_head *bh)
if (buffer_nilfs_checked(bh))
return 0;
- inode = bh->b_page->mapping->host;
+ inode = bh->b_folio->mapping->host;
ret = nilfs_btree_node_broken((struct nilfs_btree_node *)bh->b_data,
bh->b_size, inode, bh->b_blocknr);
if (likely(!ret))
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index b0d22ff24b67..48fe71d309cb 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -140,7 +140,7 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh)
{
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
- struct inode *inode = bh->b_page->mapping->host;
+ struct inode *inode = bh->b_folio->mapping->host;
nilfs_err(inode->i_sb,
"I/O error reading %s block for GC (ino=%lu, vblocknr=%llu)",
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index cbf4fa60eea2..19c8158605ed 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -563,7 +563,7 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh)
struct page *page;
int blkbits = inode->i_blkbits;
- page = grab_cache_page(shadow->inode->i_mapping, bh->b_page->index);
+ page = grab_cache_page(shadow->inode->i_mapping, bh->b_folio->index);
if (!page)
return -ENOMEM;
@@ -595,7 +595,7 @@ nilfs_mdt_get_frozen_buffer(struct inode *inode, struct buffer_head *bh)
struct page *page;
int n;
- page = find_lock_page(shadow->inode->i_mapping, bh->b_page->index);
+ page = find_lock_page(shadow->inode->i_mapping, bh->b_folio->index);
if (page) {
if (page_has_buffers(page)) {
n = bh_offset(bh) >> inode->i_blkbits;
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 76c3bd88b858..f7a14ed12a66 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1581,7 +1581,7 @@ nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci,
nblocks = le32_to_cpu(finfo->fi_nblocks);
ndatablk = le32_to_cpu(finfo->fi_ndatablk);
- inode = bh->b_page->mapping->host;
+ inode = bh->b_folio->mapping->host;
if (mode == SC_LSEG_DSYNC)
sc_op = &nilfs_sc_dsync_ops;
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index 20b953871574..6b50b6e32378 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -832,32 +832,29 @@ out:
return err;
}
-static int ntfs_writepage(struct page *page, struct writeback_control *wbc)
+static int ntfs_resident_writepage(struct page *page,
+ struct writeback_control *wbc, void *data)
{
- struct address_space *mapping = page->mapping;
- struct inode *inode = mapping->host;
- struct ntfs_inode *ni = ntfs_i(inode);
- int err;
+ struct address_space *mapping = data;
+ struct ntfs_inode *ni = ntfs_i(mapping->host);
+ int ret;
- if (is_resident(ni)) {
- ni_lock(ni);
- err = attr_data_write_resident(ni, page);
- ni_unlock(ni);
- if (err != E_NTFS_NONRESIDENT) {
- unlock_page(page);
- return err;
- }
- }
+ ni_lock(ni);
+ ret = attr_data_write_resident(ni, page);
+ ni_unlock(ni);
- return block_write_full_page(page, ntfs_get_block, wbc);
+ if (ret != E_NTFS_NONRESIDENT)
+ unlock_page(page);
+ mapping_set_error(mapping, ret);
+ return ret;
}
static int ntfs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
- /* Redirect call to 'ntfs_writepage' for resident files. */
if (is_resident(ntfs_i(mapping->host)))
- return generic_writepages(mapping, wbc);
+ return write_cache_pages(mapping, wbc, ntfs_resident_writepage,
+ mapping);
return mpage_writepages(mapping, wbc, ntfs_get_block);
}
@@ -2066,13 +2063,13 @@ const struct inode_operations ntfs_link_inode_operations = {
const struct address_space_operations ntfs_aops = {
.read_folio = ntfs_read_folio,
.readahead = ntfs_readahead,
- .writepage = ntfs_writepage,
.writepages = ntfs_writepages,
.write_begin = ntfs_write_begin,
.write_end = ntfs_write_end,
.direct_IO = ntfs_direct_IO,
.bmap = ntfs_bmap,
.dirty_folio = block_dirty_folio,
+ .migrate_folio = buffer_migrate_folio,
.invalidate_folio = block_invalidate_folio,
};
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 3fb98b4569a2..25d8072ccfce 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -15,6 +15,7 @@
#include <linux/time.h>
#include <linux/random.h>
#include <linux/delay.h>
+#include <linux/writeback.h>
#include <cluster/masklog.h>
@@ -841,6 +842,19 @@ bail:
return status;
}
+static int ocfs2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode)
+{
+ struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = mapping->nrpages * 2,
+ .range_start = jinode->i_dirty_start,
+ .range_end = jinode->i_dirty_end,
+ };
+
+ return filemap_fdatawrite_wbc(mapping, &wbc);
+}
+
int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty)
{
int status = -1;
@@ -910,7 +924,7 @@ int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty)
journal->j_journal = j_journal;
journal->j_journal->j_submit_inode_data_buffers =
- jbd2_journal_submit_inode_data_buffers;
+ ocfs2_journal_submit_inode_data_buffers;
journal->j_journal->j_finish_inode_data_buffers =
jbd2_journal_finish_inode_data_buffers;
journal->j_inode = inode;
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 2fd06f52b6a4..0ec35072a8e5 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -38,7 +38,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
}
if (atomic_read(&mm->mm_count) > 1 ||
- vma->vm_flags & VM_MAYSHARE) {
+ is_nommu_shared_mapping(vma->vm_flags)) {
sbytes += size;
} else {
bytes += size;
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index cb240eac5036..cd4537692751 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -264,7 +264,7 @@ out:
*/
static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma)
{
- if (!(vma->vm_flags & (VM_SHARED | VM_MAYSHARE)))
+ if (!is_nommu_shared_mapping(vma->vm_flags))
return -ENOSYS;
file_accessed(file);
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 9f62da7471c9..9ce4ec296b74 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -601,7 +601,7 @@ static int journal_list_still_alive(struct super_block *s,
*/
static void release_buffer_page(struct buffer_head *bh)
{
- struct folio *folio = page_folio(bh->b_page);
+ struct folio *folio = bh->b_folio;
if (!folio->mapping && folio_trylock(folio)) {
folio_get(folio);
put_bh(bh);
@@ -866,7 +866,7 @@ loop_next:
* will ever write the buffer. We're safe if we write the
* page one last time after freeing the journal header.
*/
- if (buffer_dirty(bh) && unlikely(bh->b_page->mapping == NULL)) {
+ if (buffer_dirty(bh) && unlikely(bh->b_folio->mapping == NULL)) {
spin_unlock(lock);
write_dirty_buffer(bh, 0);
spin_lock(lock);
diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c
index b0ae088dffc7..2cec61af2a9e 100644
--- a/fs/reiserfs/tail_conversion.c
+++ b/fs/reiserfs/tail_conversion.c
@@ -177,7 +177,7 @@ void reiserfs_unmap_buffer(struct buffer_head *bh)
* BUG() on attempt to write not mapped buffer
*/
if ((!list_empty(&bh->b_assoc_buffers) || bh->b_private) && bh->b_page) {
- struct inode *inode = bh->b_page->mapping->host;
+ struct inode *inode = bh->b_folio->mapping->host;
struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
spin_lock(&j->j_dirty_buffers_lock);
list_del_init(&bh->b_assoc_buffers);
diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c
index 2c4a23113fb5..4578dc45e50a 100644
--- a/fs/romfs/mmap-nommu.c
+++ b/fs/romfs/mmap-nommu.c
@@ -63,7 +63,7 @@ static unsigned long romfs_get_unmapped_area(struct file *file,
*/
static int romfs_mmap(struct file *file, struct vm_area_struct *vma)
{
- return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS;
+ return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -ENOSYS;
}
static unsigned romfs_mmap_capabilities(struct file *file)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index cc694846617a..15a5bf765d43 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -252,14 +252,12 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
unsigned long flags,
unsigned long reason)
{
- struct mm_struct *mm = ctx->mm;
pte_t *ptep, pte;
bool ret = true;
- mmap_assert_locked(mm);
-
- ptep = huge_pte_offset(mm, address, vma_mmu_pagesize(vma));
+ mmap_assert_locked(ctx->mm);
+ ptep = hugetlb_walk(vma, address, vma_mmu_pagesize(vma));
if (!ptep)
goto out;
@@ -391,7 +389,8 @@ static inline unsigned int userfaultfd_get_blocking_state(unsigned int flags)
*/
vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
{
- struct mm_struct *mm = vmf->vma->vm_mm;
+ struct vm_area_struct *vma = vmf->vma;
+ struct mm_struct *mm = vma->vm_mm;
struct userfaultfd_ctx *ctx;
struct userfaultfd_wait_queue uwq;
vm_fault_t ret = VM_FAULT_SIGBUS;
@@ -418,7 +417,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
*/
mmap_assert_locked(mm);
- ctx = vmf->vma->vm_userfaultfd_ctx.ctx;
+ ctx = vma->vm_userfaultfd_ctx.ctx;
if (!ctx)
goto out;
@@ -508,6 +507,15 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
blocking_state = userfaultfd_get_blocking_state(vmf->flags);
+ /*
+ * Take the vma lock now, in order to safely call
+ * userfaultfd_huge_must_wait() later. Since acquiring the
+ * (sleepable) vma lock can modify the current task state, that
+ * must be before explicitly calling set_current_state().
+ */
+ if (is_vm_hugetlb_page(vma))
+ hugetlb_vma_lock_read(vma);
+
spin_lock_irq(&ctx->fault_pending_wqh.lock);
/*
* After the __add_wait_queue the uwq is visible to userland
@@ -522,13 +530,15 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
set_current_state(blocking_state);
spin_unlock_irq(&ctx->fault_pending_wqh.lock);
- if (!is_vm_hugetlb_page(vmf->vma))
+ if (!is_vm_hugetlb_page(vma))
must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
reason);
else
- must_wait = userfaultfd_huge_must_wait(ctx, vmf->vma,
+ must_wait = userfaultfd_huge_must_wait(ctx, vma,
vmf->address,
vmf->flags, reason);
+ if (is_vm_hugetlb_page(vma))
+ hugetlb_vma_unlock_read(vma);
mmap_read_unlock(mm);
if (likely(must_wait && !READ_ONCE(ctx->released))) {
diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h
index a57d667addd2..d7f6335d3999 100644
--- a/include/asm-generic/hugetlb.h
+++ b/include/asm-generic/hugetlb.h
@@ -25,6 +25,13 @@ static inline pte_t huge_pte_mkwrite(pte_t pte)
return pte_mkwrite(pte);
}
+#ifndef __HAVE_ARCH_HUGE_PTE_WRPROTECT
+static inline pte_t huge_pte_wrprotect(pte_t pte)
+{
+ return pte_wrprotect(pte);
+}
+#endif
+
static inline pte_t huge_pte_mkdirty(pte_t pte)
{
return pte_mkdirty(pte);
@@ -37,7 +44,7 @@ static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot)
static inline pte_t huge_pte_mkuffd_wp(pte_t pte)
{
- return pte_mkuffd_wp(pte);
+ return huge_pte_wrprotect(pte_mkuffd_wp(pte));
}
static inline pte_t huge_pte_clear_uffd_wp(pte_t pte)
@@ -104,13 +111,6 @@ static inline int huge_pte_none_mostly(pte_t pte)
return huge_pte_none(pte) || is_pte_marker(pte);
}
-#ifndef __HAVE_ARCH_HUGE_PTE_WRPROTECT
-static inline pte_t huge_pte_wrprotect(pte_t pte)
-{
- return pte_wrprotect(pte);
-}
-#endif
-
#ifndef __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE
static inline int prepare_hugepage_range(struct file *file,
unsigned long addr, unsigned long len)
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 33fa5e94aa80..8f14dca5fed7 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -61,7 +61,10 @@ typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate);
struct buffer_head {
unsigned long b_state; /* buffer state bitmap (see above) */
struct buffer_head *b_this_page;/* circular list of page's buffers */
- struct page *b_page; /* the page this bh is mapped to */
+ union {
+ struct page *b_page; /* the page this bh is mapped to */
+ struct folio *b_folio; /* the folio this bh is mapped to */
+ };
sector_t b_blocknr; /* start block number */
size_t b_size; /* size of mapping */
diff --git a/include/linux/damon.h b/include/linux/damon.h
index ad15a5b88e3a..7907918ad2e0 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -8,6 +8,7 @@
#ifndef _DAMON_H_
#define _DAMON_H_
+#include <linux/memcontrol.h>
#include <linux/mutex.h>
#include <linux/time64.h>
#include <linux/types.h>
@@ -216,6 +217,39 @@ struct damos_stat {
};
/**
+ * enum damos_filter_type - Type of memory for &struct damos_filter
+ * @DAMOS_FILTER_TYPE_ANON: Anonymous pages.
+ * @DAMOS_FILTER_TYPE_MEMCG: Specific memcg's pages.
+ * @NR_DAMOS_FILTER_TYPES: Number of filter types.
+ */
+enum damos_filter_type {
+ DAMOS_FILTER_TYPE_ANON,
+ DAMOS_FILTER_TYPE_MEMCG,
+ NR_DAMOS_FILTER_TYPES,
+};
+
+/**
+ * struct damos_filter - DAMOS action target memory filter.
+ * @type: Type of the page.
+ * @matching: If the matching page should filtered out or in.
+ * @memcg_id: Memcg id of the question if @type is DAMOS_FILTER_MEMCG.
+ * @list: List head for siblings.
+ *
+ * Before applying the &damos->action to a memory region, DAMOS checks if each
+ * page of the region matches to this and avoid applying the action if so.
+ * Note that the check support is up to &struct damon_operations
+ * implementation.
+ */
+struct damos_filter {
+ enum damos_filter_type type;
+ bool matching;
+ union {
+ unsigned short memcg_id;
+ };
+ struct list_head list;
+};
+
+/**
* struct damos_access_pattern - Target access pattern of the given scheme.
* @min_sz_region: Minimum size of target regions.
* @max_sz_region: Maximum size of target regions.
@@ -239,6 +273,7 @@ struct damos_access_pattern {
* @action: &damo_action to be applied to the target regions.
* @quota: Control the aggressiveness of this scheme.
* @wmarks: Watermarks for automated (in)activation of this scheme.
+ * @filters: Additional set of &struct damos_filter for &action.
* @stat: Statistics of this scheme.
* @list: List head for siblings.
*
@@ -254,6 +289,10 @@ struct damos_access_pattern {
* If all schemes that registered to a &struct damon_ctx are inactive, DAMON
* stops monitoring and just repeatedly checks the watermarks.
*
+ * Before applying the &action to a memory region, &struct damon_operations
+ * implementation could check pages of the region and skip &action to respect
+ * &filters
+ *
* After applying the &action to each region, &stat_count and &stat_sz is
* updated to reflect the number of regions and total size of regions that the
* &action is applied.
@@ -263,6 +302,7 @@ struct damos {
enum damos_action action;
struct damos_quota quota;
struct damos_watermarks wmarks;
+ struct list_head filters;
struct damos_stat stat;
struct list_head list;
};
@@ -516,6 +556,12 @@ static inline unsigned long damon_sz_region(struct damon_region *r)
#define damon_for_each_scheme_safe(s, next, ctx) \
list_for_each_entry_safe(s, next, &(ctx)->schemes, list)
+#define damos_for_each_filter(f, scheme) \
+ list_for_each_entry(f, &(scheme)->filters, list)
+
+#define damos_for_each_filter_safe(f, next, scheme) \
+ list_for_each_entry_safe(f, next, &(scheme)->filters, list)
+
#ifdef CONFIG_DAMON
struct damon_region *damon_new_region(unsigned long start, unsigned long end);
@@ -536,6 +582,11 @@ void damon_destroy_region(struct damon_region *r, struct damon_target *t);
int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
unsigned int nr_ranges);
+struct damos_filter *damos_new_filter(enum damos_filter_type type,
+ bool matching);
+void damos_add_filter(struct damos *s, struct damos_filter *f);
+void damos_destroy_filter(struct damos_filter *f);
+
struct damos *damon_new_scheme(struct damos_access_pattern *pattern,
enum damos_action action, struct damos_quota *quota,
struct damos_watermarks *wmarks);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c1769a2c5d70..d353c262d669 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -166,6 +166,8 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
/* File supports DIRECT IO */
#define FMODE_CAN_ODIRECT ((__force fmode_t)0x400000)
+#define FMODE_NOREUSE ((__force fmode_t)0x800000)
+
/* File was opened by fanotify and shouldn't generate fanotify events */
#define FMODE_NONOTIFY ((__force fmode_t)0x4000000)
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 44242268f53b..d7097b8158f2 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -86,8 +86,8 @@ static inline void kmap_flush_unused(void);
* virtual address of the direct mapping. Only real highmem pages are
* temporarily mapped.
*
- * While it is significantly faster than kmap() for the higmem case it
- * comes with restrictions about the pointer validity.
+ * While kmap_local_page() is significantly faster than kmap() for the highmem
+ * case it comes with restrictions about the pointer validity.
*
* On HIGHMEM enabled systems mapping a highmem page has the side effect of
* disabling migration in order to keep the virtual address stable across
@@ -119,9 +119,8 @@ static inline void *kmap_local_page(struct page *page);
* virtual address of the direct mapping. Only real highmem pages are
* temporarily mapped.
*
- * While it is significantly faster than kmap() for the higmem case it
- * comes with restrictions about the pointer validity. Only use when really
- * necessary.
+ * While it is significantly faster than kmap() for the highmem case it
+ * comes with restrictions about the pointer validity.
*
* On HIGHMEM enabled systems mapping a highmem page has the side effect of
* disabling migration in order to keep the virtual address stable across
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index db194e2ba69f..7d6413c3b8f5 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -2,6 +2,7 @@
#ifndef _LINUX_HUGETLB_H
#define _LINUX_HUGETLB_H
+#include <linux/mm.h>
#include <linux/mm_types.h>
#include <linux/mmdebug.h>
#include <linux/fs.h>
@@ -193,6 +194,43 @@ extern struct list_head huge_boot_pages;
pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, unsigned long sz);
+/*
+ * huge_pte_offset(): Walk the hugetlb pgtable until the last level PTE.
+ * Returns the pte_t* if found, or NULL if the address is not mapped.
+ *
+ * IMPORTANT: we should normally not directly call this function, instead
+ * this is only a common interface to implement arch-specific
+ * walker. Please use hugetlb_walk() instead, because that will attempt to
+ * verify the locking for you.
+ *
+ * Since this function will walk all the pgtable pages (including not only
+ * high-level pgtable page, but also PUD entry that can be unshared
+ * concurrently for VM_SHARED), the caller of this function should be
+ * responsible of its thread safety. One can follow this rule:
+ *
+ * (1) For private mappings: pmd unsharing is not possible, so holding the
+ * mmap_lock for either read or write is sufficient. Most callers
+ * already hold the mmap_lock, so normally, no special action is
+ * required.
+ *
+ * (2) For shared mappings: pmd unsharing is possible (so the PUD-ranged
+ * pgtable page can go away from under us! It can be done by a pmd
+ * unshare with a follow up munmap() on the other process), then we
+ * need either:
+ *
+ * (2.1) hugetlb vma lock read or write held, to make sure pmd unshare
+ * won't happen upon the range (it also makes sure the pte_t we
+ * read is the right and stable one), or,
+ *
+ * (2.2) hugetlb mapping i_mmap_rwsem lock held read or write, to make
+ * sure even if unshare happened the racy unmap() will wait until
+ * i_mmap_rwsem is released.
+ *
+ * Option (2.1) is the safest, which guarantees pte stability from pmd
+ * sharing pov, until the vma lock released. Option (2.2) doesn't protect
+ * a concurrent pmd unshare, but it makes sure the pgtable page is safe to
+ * access.
+ */
pte_t *huge_pte_offset(struct mm_struct *mm,
unsigned long addr, unsigned long sz);
unsigned long hugetlb_mask_last_page(struct hstate *h);
@@ -211,7 +249,7 @@ void hugetlb_vma_lock_release(struct kref *kref);
int pmd_huge(pmd_t pmd);
int pud_huge(pud_t pud);
-unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
+long hugetlb_change_protection(struct vm_area_struct *vma,
unsigned long address, unsigned long end, pgprot_t newprot,
unsigned long cp_flags);
@@ -400,7 +438,7 @@ static inline void move_hugetlb_state(struct folio *old_folio,
{
}
-static inline unsigned long hugetlb_change_protection(
+static inline long hugetlb_change_protection(
struct vm_area_struct *vma, unsigned long address,
unsigned long end, pgprot_t newprot,
unsigned long cp_flags)
@@ -1210,4 +1248,35 @@ bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr);
#define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end)
#endif
+static inline bool __vma_shareable_lock(struct vm_area_struct *vma)
+{
+ return (vma->vm_flags & VM_MAYSHARE) && vma->vm_private_data;
+}
+
+/*
+ * Safe version of huge_pte_offset() to check the locks. See comments
+ * above huge_pte_offset().
+ */
+static inline pte_t *
+hugetlb_walk(struct vm_area_struct *vma, unsigned long addr, unsigned long sz)
+{
+#if defined(CONFIG_HUGETLB_PAGE) && \
+ defined(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) && defined(CONFIG_LOCKDEP)
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ /*
+ * If pmd sharing possible, locking needed to safely walk the
+ * hugetlb pgtables. More information can be found at the comment
+ * above huge_pte_offset() in the same file.
+ *
+ * NOTE: lockdep_is_held() is only defined with CONFIG_LOCKDEP.
+ */
+ if (__vma_shareable_lock(vma))
+ WARN_ON_ONCE(!lockdep_is_held(&vma_lock->rw_sema) &&
+ !lockdep_is_held(
+ &vma->vm_file->f_mapping->i_mmap_rwsem));
+#endif
+ return huge_pte_offset(vma->vm_mm, addr, sz);
+}
+
#endif /* _LINUX_HUGETLB_H */
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 2170e0cc279d..5962072a4b19 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1570,8 +1570,6 @@ extern int jbd2_journal_inode_ranged_write(handle_t *handle,
extern int jbd2_journal_inode_ranged_wait(handle_t *handle,
struct jbd2_inode *inode, loff_t start_byte,
loff_t length);
-extern int jbd2_journal_submit_inode_data_buffers(
- struct jbd2_inode *jinode);
extern int jbd2_journal_finish_inode_data_buffers(
struct jbd2_inode *jinode);
extern int jbd2_journal_begin_ordered_truncate(journal_t *journal,
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 96c9d56e5510..f7ef70661ce2 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -96,15 +96,6 @@ static inline bool kasan_has_integrated_init(void)
}
#ifdef CONFIG_KASAN
-
-struct kasan_cache {
-#ifdef CONFIG_KASAN_GENERIC
- int alloc_meta_offset;
- int free_meta_offset;
-#endif
- bool is_kmalloc;
-};
-
void __kasan_unpoison_range(const void *addr, size_t size);
static __always_inline void kasan_unpoison_range(const void *addr, size_t size)
{
@@ -120,19 +111,13 @@ static __always_inline void kasan_poison_pages(struct page *page,
__kasan_poison_pages(page, order, init);
}
-void __kasan_unpoison_pages(struct page *page, unsigned int order, bool init);
-static __always_inline void kasan_unpoison_pages(struct page *page,
+bool __kasan_unpoison_pages(struct page *page, unsigned int order, bool init);
+static __always_inline bool kasan_unpoison_pages(struct page *page,
unsigned int order, bool init)
{
if (kasan_enabled())
- __kasan_unpoison_pages(page, order, init);
-}
-
-void __kasan_cache_create_kmalloc(struct kmem_cache *cache);
-static __always_inline void kasan_cache_create_kmalloc(struct kmem_cache *cache)
-{
- if (kasan_enabled())
- __kasan_cache_create_kmalloc(cache);
+ return __kasan_unpoison_pages(page, order, init);
+ return false;
}
void __kasan_poison_slab(struct slab *slab);
@@ -249,9 +234,11 @@ static __always_inline bool kasan_check_byte(const void *addr)
static inline void kasan_unpoison_range(const void *address, size_t size) {}
static inline void kasan_poison_pages(struct page *page, unsigned int order,
bool init) {}
-static inline void kasan_unpoison_pages(struct page *page, unsigned int order,
- bool init) {}
-static inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) {}
+static inline bool kasan_unpoison_pages(struct page *page, unsigned int order,
+ bool init)
+{
+ return false;
+}
static inline void kasan_poison_slab(struct slab *slab) {}
static inline void kasan_unpoison_object_data(struct kmem_cache *cache,
void *object) {}
@@ -302,6 +289,11 @@ static inline void kasan_unpoison_task_stack(struct task_struct *task) {}
#ifdef CONFIG_KASAN_GENERIC
+struct kasan_cache {
+ int alloc_meta_offset;
+ int free_meta_offset;
+};
+
size_t kasan_metadata_size(struct kmem_cache *cache, bool in_object);
slab_flags_t kasan_never_merge(void);
void kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index e594db58a0f1..815a27661517 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -12,7 +12,6 @@
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
/* #define CONFIG_MAPLE_RCU_DISABLED */
-/* #define CONFIG_DEBUG_MAPLE_TREE_VERBOSE */
/*
* Allocated nodes are mutable until they have been inserted into the tree,
@@ -483,9 +482,6 @@ static inline bool mas_is_paused(struct ma_state *mas)
return mas->node == MAS_PAUSE;
}
-void mas_dup_tree(struct ma_state *oldmas, struct ma_state *mas);
-void mas_dup_store(struct ma_state *mas, void *entry);
-
/*
* This finds an empty area from the highest address to the lowest.
* AKA "Topdown" version,
@@ -517,7 +513,6 @@ static inline void mas_reset(struct ma_state *mas)
* entry.
*
* Note: may return the zero entry.
- *
*/
#define mas_for_each(__mas, __entry, __max) \
while (((__entry) = mas_find((__mas), (__max))) != NULL)
@@ -639,7 +634,6 @@ static inline void mt_set_in_rcu(struct maple_tree *mt)
}
static inline unsigned int mt_height(const struct maple_tree *mt)
-
{
return (mt->ma_flags & MT_FLAGS_HEIGHT_MASK) >> MT_FLAGS_HEIGHT_OFFSET;
}
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 85dc9b88ea37..e605fc885f08 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -466,34 +466,34 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
}
/*
- * page_memcg_check - get the memory cgroup associated with a page
- * @page: a pointer to the page struct
+ * folio_memcg_check - Get the memory cgroup associated with a folio.
+ * @folio: Pointer to the folio.
*
- * Returns a pointer to the memory cgroup associated with the page,
- * or NULL. This function unlike page_memcg() can take any page
- * as an argument. It has to be used in cases when it's not known if a page
+ * Returns a pointer to the memory cgroup associated with the folio,
+ * or NULL. This function unlike folio_memcg() can take any folio
+ * as an argument. It has to be used in cases when it's not known if a folio
* has an associated memory cgroup pointer or an object cgroups vector or
* an object cgroup.
*
- * For a non-kmem page any of the following ensures page and memcg binding
+ * For a non-kmem folio any of the following ensures folio and memcg binding
* stability:
*
- * - the page lock
+ * - the folio lock
* - LRU isolation
- * - lock_page_memcg()
+ * - lock_folio_memcg()
* - exclusive reference
* - mem_cgroup_trylock_pages()
*
- * For a kmem page a caller should hold an rcu read lock to protect memcg
- * associated with a kmem page from being released.
+ * For a kmem folio a caller should hold an rcu read lock to protect memcg
+ * associated with a kmem folio from being released.
*/
-static inline struct mem_cgroup *page_memcg_check(struct page *page)
+static inline struct mem_cgroup *folio_memcg_check(struct folio *folio)
{
/*
- * Because page->memcg_data might be changed asynchronously
- * for slab pages, READ_ONCE() should be used here.
+ * Because folio->memcg_data might be changed asynchronously
+ * for slabs, READ_ONCE() should be used here.
*/
- unsigned long memcg_data = READ_ONCE(page->memcg_data);
+ unsigned long memcg_data = READ_ONCE(folio->memcg_data);
if (memcg_data & MEMCG_DATA_OBJCGS)
return NULL;
@@ -508,6 +508,13 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page)
return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
}
+static inline struct mem_cgroup *page_memcg_check(struct page *page)
+{
+ if (PageTail(page))
+ return NULL;
+ return folio_memcg_check((struct folio *)page);
+}
+
static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
{
struct mem_cgroup *memcg;
@@ -794,6 +801,11 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg)
percpu_ref_put(&objcg->refcnt);
}
+static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
+{
+ return !memcg || css_tryget(&memcg->css);
+}
+
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
if (memcg)
@@ -1165,6 +1177,11 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
return NULL;
}
+static inline struct mem_cgroup *folio_memcg_check(struct folio *folio)
+{
+ return NULL;
+}
+
static inline struct mem_cgroup *page_memcg_check(struct page *page)
{
return NULL;
@@ -1301,6 +1318,11 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg)
{
}
+static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
+{
+ return true;
+}
+
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8f857163ac89..76c97cb8ee9a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -276,7 +276,12 @@ extern unsigned int kobjsize(const void *objp);
#define VM_MAYSHARE 0x00000080
#define VM_GROWSDOWN 0x00000100 /* general info on the segment */
+#ifdef CONFIG_MMU
#define VM_UFFD_MISSING 0x00000200 /* missing pages tracking */
+#else /* CONFIG_MMU */
+#define VM_MAYOVERLAY 0x00000200 /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */
+#define VM_UFFD_MISSING 0
+#endif /* CONFIG_MMU */
#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */
#define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */
@@ -1019,22 +1024,6 @@ static inline void set_compound_order(struct page *page, unsigned int order)
#endif
}
-/*
- * folio_set_compound_order is generally passed a non-zero order to
- * initialize a large folio. However, hugetlb code abuses this by
- * passing in zero when 'dissolving' a large folio.
- */
-static inline void folio_set_compound_order(struct folio *folio,
- unsigned int order)
-{
- VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
-
- folio->_folio_order = order;
-#ifdef CONFIG_64BIT
- folio->_folio_nr_pages = order ? 1U << order : 0;
-#endif
-}
-
/* Returns the number of pages in this potentially compound page. */
static inline unsigned long compound_nr(struct page *page)
{
@@ -1363,6 +1352,21 @@ static inline bool is_cow_mapping(vm_flags_t flags)
return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}
+#ifndef CONFIG_MMU
+static inline bool is_nommu_shared_mapping(vm_flags_t flags)
+{
+ /*
+ * NOMMU shared mappings are ordinary MAP_SHARED mappings and selected
+ * R/O MAP_PRIVATE file mappings that are an effective R/O overlay of
+ * a file mapping. R/O MAP_PRIVATE mappings might still modify
+ * underlying memory if ptrace is active, so this is only possible if
+ * ptrace does not apply. Note that there is no mprotect() to upgrade
+ * write permissions later.
+ */
+ return flags & (VM_MAYSHARE | VM_MAYOVERLAY);
+}
+#endif
+
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
#define SECTION_IN_PAGE_FLAGS
#endif
@@ -1923,6 +1927,21 @@ static inline bool page_is_pfmemalloc(const struct page *page)
}
/*
+ * Return true only if the folio has been allocated with
+ * ALLOC_NO_WATERMARKS and the low watermark was not
+ * met implying that the system is under some pressure.
+ */
+static inline bool folio_is_pfmemalloc(const struct folio *folio)
+{
+ /*
+ * lru.next has bit 1 set if the page is allocated from the
+ * pfmemalloc reserves. Callers may simply overwrite it if
+ * they do not need to preserve that information.
+ */
+ return (uintptr_t)folio->lru.next & BIT(1);
+}
+
+/*
* Only to be called by the page allocator on a freshly allocated
* page.
*/
@@ -1984,6 +2003,8 @@ static inline bool can_do_mlock(void) { return false; }
extern int user_shm_lock(size_t, struct ucounts *);
extern void user_shm_unlock(size_t, struct ucounts *);
+struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
+ pte_t pte);
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
pte_t pte);
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
@@ -1991,10 +2012,13 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
unsigned long size);
-void zap_page_range(struct vm_area_struct *vma, unsigned long address,
- unsigned long size);
void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
unsigned long size, struct zap_details *details);
+static inline void zap_vma_pages(struct vm_area_struct *vma)
+{
+ zap_page_range_single(vma, vma->vm_start,
+ vma->vm_end - vma->vm_start, NULL);
+}
void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
struct vm_area_struct *start_vma, unsigned long start,
unsigned long end);
@@ -2146,10 +2170,9 @@ static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma
}
bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
pte_t pte);
-extern unsigned long change_protection(struct mmu_gather *tlb,
+extern long change_protection(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long start,
- unsigned long end, pgprot_t newprot,
- unsigned long cp_flags);
+ unsigned long end, unsigned long cp_flags);
extern int mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma,
struct vm_area_struct **pprev, unsigned long start,
unsigned long end, unsigned long newflags);
@@ -3071,81 +3094,6 @@ static inline vm_fault_t vmf_error(int err)
struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
unsigned int foll_flags);
-#define FOLL_WRITE 0x01 /* check pte is writable */
-#define FOLL_TOUCH 0x02 /* mark page accessed */
-#define FOLL_GET 0x04 /* do get_page on page */
-#define FOLL_DUMP 0x08 /* give error on hole if it would be zero */
-#define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */
-#define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO
- * and return without waiting upon it */
-#define FOLL_NOFAULT 0x80 /* do not fault in pages */
-#define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */
-#define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */
-#define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */
-#define FOLL_ANON 0x8000 /* don't do file mappings */
-#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */
-#define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */
-#define FOLL_PIN 0x40000 /* pages must be released via unpin_user_page */
-#define FOLL_FAST_ONLY 0x80000 /* gup_fast: prevent fall-back to slow gup */
-#define FOLL_PCI_P2PDMA 0x100000 /* allow returning PCI P2PDMA pages */
-#define FOLL_INTERRUPTIBLE 0x200000 /* allow interrupts from generic signals */
-
-/*
- * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each
- * other. Here is what they mean, and how to use them:
- *
- * FOLL_LONGTERM indicates that the page will be held for an indefinite time
- * period _often_ under userspace control. This is in contrast to
- * iov_iter_get_pages(), whose usages are transient.
- *
- * FIXME: For pages which are part of a filesystem, mappings are subject to the
- * lifetime enforced by the filesystem and we need guarantees that longterm
- * users like RDMA and V4L2 only establish mappings which coordinate usage with
- * the filesystem. Ideas for this coordination include revoking the longterm
- * pin, delaying writeback, bounce buffer page writeback, etc. As FS DAX was
- * added after the problem with filesystems was found FS DAX VMAs are
- * specifically failed. Filesystem pages are still subject to bugs and use of
- * FOLL_LONGTERM should be avoided on those pages.
- *
- * FIXME: Also NOTE that FOLL_LONGTERM is not supported in every GUP call.
- * Currently only get_user_pages() and get_user_pages_fast() support this flag
- * and calls to get_user_pages_[un]locked are specifically not allowed. This
- * is due to an incompatibility with the FS DAX check and
- * FAULT_FLAG_ALLOW_RETRY.
- *
- * In the CMA case: long term pins in a CMA region would unnecessarily fragment
- * that region. And so, CMA attempts to migrate the page before pinning, when
- * FOLL_LONGTERM is specified.
- *
- * FOLL_PIN indicates that a special kind of tracking (not just page->_refcount,
- * but an additional pin counting system) will be invoked. This is intended for
- * anything that gets a page reference and then touches page data (for example,
- * Direct IO). This lets the filesystem know that some non-file-system entity is
- * potentially changing the pages' data. In contrast to FOLL_GET (whose pages
- * are released via put_page()), FOLL_PIN pages must be released, ultimately, by
- * a call to unpin_user_page().
- *
- * FOLL_PIN is similar to FOLL_GET: both of these pin pages. They use different
- * and separate refcounting mechanisms, however, and that means that each has
- * its own acquire and release mechanisms:
- *
- * FOLL_GET: get_user_pages*() to acquire, and put_page() to release.
- *
- * FOLL_PIN: pin_user_pages*() to acquire, and unpin_user_pages to release.
- *
- * FOLL_PIN and FOLL_GET are mutually exclusive for a given function call.
- * (The underlying pages may experience both FOLL_GET-based and FOLL_PIN-based
- * calls applied to them, and that's perfectly OK. This is a constraint on the
- * callers, not on the pages.)
- *
- * FOLL_PIN should be set internally by the pin_user_pages*() APIs, never
- * directly by the caller. That's in order to help avoid mismatches when
- * releasing pages: get_user_pages*() pages must be released via put_page(),
- * while pin_user_pages*() pages must be released via unpin_user_page().
- *
- * Please see Documentation/core-api/pin_user_pages.rst for more information.
- */
-
static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
{
if (vm_fault & VM_FAULT_OOM)
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index ff3f3f23f649..26dcbda07e92 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -122,6 +122,18 @@ static inline bool lru_gen_in_fault(void)
return current->in_lru_fault;
}
+#ifdef CONFIG_MEMCG
+static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
+{
+ return READ_ONCE(lruvec->lrugen.seg);
+}
+#else
+static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
+{
+ return 0;
+}
+#endif
+
static inline int lru_gen_from_seq(unsigned long seq)
{
return seq % MAX_NR_GENS;
@@ -178,7 +190,7 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli
int zone = folio_zonenum(folio);
int delta = folio_nr_pages(folio);
enum lru_list lru = type * LRU_INACTIVE_FILE;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS);
VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS);
@@ -224,7 +236,7 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio,
int gen = folio_lru_gen(folio);
int type = folio_is_file_lru(folio);
int zone = folio_zonenum(folio);
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
VM_WARN_ON_ONCE_FOLIO(gen != -1, folio);
@@ -256,9 +268,9 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio,
lru_gen_update_size(lruvec, folio, -1, gen);
/* for folio_rotate_reclaimable() */
if (reclaiming)
- list_add_tail(&folio->lru, &lrugen->lists[gen][type][zone]);
+ list_add_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
else
- list_add(&folio->lru, &lrugen->lists[gen][type][zone]);
+ list_add(&folio->lru, &lrugen->folios[gen][type][zone]);
return true;
}
@@ -297,6 +309,11 @@ static inline bool lru_gen_in_fault(void)
return false;
}
+static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
+{
+ return 0;
+}
+
static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
return false;
@@ -577,4 +594,15 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr,
#endif
}
+static inline bool vma_has_recency(struct vm_area_struct *vma)
+{
+ if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))
+ return false;
+
+ if (vma->vm_file && (vma->vm_file->f_mode & FMODE_NOREUSE))
+ return false;
+
+ return true;
+}
+
#endif
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 9757067c3053..10b6eb311ede 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -647,7 +647,7 @@ struct mm_struct {
atomic_t mm_count;
#ifdef CONFIG_MMU
- atomic_long_t pgtables_bytes; /* PTE page table pages */
+ atomic_long_t pgtables_bytes; /* size of all page tables */
#endif
int map_count; /* number of VMAs */
@@ -1085,4 +1085,79 @@ enum fault_flag {
typedef unsigned int __bitwise zap_flags_t;
+/*
+ * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each
+ * other. Here is what they mean, and how to use them:
+ *
+ * FOLL_LONGTERM indicates that the page will be held for an indefinite time
+ * period _often_ under userspace control. This is in contrast to
+ * iov_iter_get_pages(), whose usages are transient.
+ *
+ * FIXME: For pages which are part of a filesystem, mappings are subject to the
+ * lifetime enforced by the filesystem and we need guarantees that longterm
+ * users like RDMA and V4L2 only establish mappings which coordinate usage with
+ * the filesystem. Ideas for this coordination include revoking the longterm
+ * pin, delaying writeback, bounce buffer page writeback, etc. As FS DAX was
+ * added after the problem with filesystems was found FS DAX VMAs are
+ * specifically failed. Filesystem pages are still subject to bugs and use of
+ * FOLL_LONGTERM should be avoided on those pages.
+ *
+ * FIXME: Also NOTE that FOLL_LONGTERM is not supported in every GUP call.
+ * Currently only get_user_pages() and get_user_pages_fast() support this flag
+ * and calls to get_user_pages_[un]locked are specifically not allowed. This
+ * is due to an incompatibility with the FS DAX check and
+ * FAULT_FLAG_ALLOW_RETRY.
+ *
+ * In the CMA case: long term pins in a CMA region would unnecessarily fragment
+ * that region. And so, CMA attempts to migrate the page before pinning, when
+ * FOLL_LONGTERM is specified.
+ *
+ * FOLL_PIN indicates that a special kind of tracking (not just page->_refcount,
+ * but an additional pin counting system) will be invoked. This is intended for
+ * anything that gets a page reference and then touches page data (for example,
+ * Direct IO). This lets the filesystem know that some non-file-system entity is
+ * potentially changing the pages' data. In contrast to FOLL_GET (whose pages
+ * are released via put_page()), FOLL_PIN pages must be released, ultimately, by
+ * a call to unpin_user_page().
+ *
+ * FOLL_PIN is similar to FOLL_GET: both of these pin pages. They use different
+ * and separate refcounting mechanisms, however, and that means that each has
+ * its own acquire and release mechanisms:
+ *
+ * FOLL_GET: get_user_pages*() to acquire, and put_page() to release.
+ *
+ * FOLL_PIN: pin_user_pages*() to acquire, and unpin_user_pages to release.
+ *
+ * FOLL_PIN and FOLL_GET are mutually exclusive for a given function call.
+ * (The underlying pages may experience both FOLL_GET-based and FOLL_PIN-based
+ * calls applied to them, and that's perfectly OK. This is a constraint on the
+ * callers, not on the pages.)
+ *
+ * FOLL_PIN should be set internally by the pin_user_pages*() APIs, never
+ * directly by the caller. That's in order to help avoid mismatches when
+ * releasing pages: get_user_pages*() pages must be released via put_page(),
+ * while pin_user_pages*() pages must be released via unpin_user_page().
+ *
+ * Please see Documentation/core-api/pin_user_pages.rst for more information.
+ */
+
+#define FOLL_WRITE 0x01 /* check pte is writable */
+#define FOLL_TOUCH 0x02 /* mark page accessed */
+#define FOLL_GET 0x04 /* do get_page on page */
+#define FOLL_DUMP 0x08 /* give error on hole if it would be zero */
+#define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */
+#define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO
+ * and return without waiting upon it */
+#define FOLL_NOFAULT 0x80 /* do not fault in pages */
+#define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */
+#define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */
+#define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */
+#define FOLL_ANON 0x8000 /* don't do file mappings */
+#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */
+#define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */
+#define FOLL_PIN 0x40000 /* pages must be released via unpin_user_page */
+#define FOLL_FAST_ONLY 0x80000 /* gup_fast: prevent fall-back to slow gup */
+#define FOLL_PCI_P2PDMA 0x100000 /* allow returning PCI P2PDMA pages */
+#define FOLL_INTERRUPTIBLE 0x200000 /* allow interrupts from generic signals */
+
#endif /* _LINUX_MM_TYPES_H */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index cd28a100d9e4..815c7c2edf45 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -7,6 +7,7 @@
#include <linux/spinlock.h>
#include <linux/list.h>
+#include <linux/list_nulls.h>
#include <linux/wait.h>
#include <linux/bitops.h>
#include <linux/cache.h>
@@ -312,7 +313,7 @@ enum lruvec_flags {
* They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An
* offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the
* corresponding generation. The gen counter in folio->flags stores gen+1 while
- * a page is on one of lrugen->lists[]. Otherwise it stores 0.
+ * a page is on one of lrugen->folios[]. Otherwise it stores 0.
*
* A page is added to the youngest generation on faulting. The aging needs to
* check the accessed bit at least twice before handing this page over to the
@@ -324,8 +325,8 @@ enum lruvec_flags {
* rest of generations, if they exist, are considered inactive. See
* lru_gen_is_active().
*
- * PG_active is always cleared while a page is on one of lrugen->lists[] so that
- * the aging needs not to worry about it. And it's set again when a page
+ * PG_active is always cleared while a page is on one of lrugen->folios[] so
+ * that the aging needs not to worry about it. And it's set again when a page
* considered active is isolated for non-reclaiming purposes, e.g., migration.
* See lru_gen_add_folio() and lru_gen_del_folio().
*
@@ -367,6 +368,15 @@ struct page_vma_mapped_walk;
#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
+/* see the comment on MEMCG_NR_GENS */
+enum {
+ MEMCG_LRU_NOP,
+ MEMCG_LRU_HEAD,
+ MEMCG_LRU_TAIL,
+ MEMCG_LRU_OLD,
+ MEMCG_LRU_YOUNG,
+};
+
#ifdef CONFIG_LRU_GEN
enum {
@@ -404,7 +414,7 @@ enum {
* The number of pages in each generation is eventually consistent and therefore
* can be transiently negative when reset_batch_size() is pending.
*/
-struct lru_gen_struct {
+struct lru_gen_folio {
/* the aging increments the youngest generation number */
unsigned long max_seq;
/* the eviction increments the oldest generation numbers */
@@ -412,7 +422,7 @@ struct lru_gen_struct {
/* the birth time of each generation in jiffies */
unsigned long timestamps[MAX_NR_GENS];
/* the multi-gen LRU lists, lazily sorted on eviction */
- struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+ struct list_head folios[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
/* the multi-gen LRU sizes, eventually consistent */
long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
/* the exponential moving average of refaulted */
@@ -426,6 +436,14 @@ struct lru_gen_struct {
atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
/* whether the multi-gen LRU is enabled */
bool enabled;
+#ifdef CONFIG_MEMCG
+ /* the memcg generation this lru_gen_folio belongs to */
+ u8 gen;
+ /* the list segment this lru_gen_folio belongs to */
+ u8 seg;
+ /* per-node lru_gen_folio list for global reclaim */
+ struct hlist_nulls_node list;
+#endif
};
enum {
@@ -461,7 +479,7 @@ struct lru_gen_mm_state {
struct lru_gen_mm_walk {
/* the lruvec under reclaim */
struct lruvec *lruvec;
- /* unstable max_seq from lru_gen_struct */
+ /* unstable max_seq from lru_gen_folio */
unsigned long max_seq;
/* the next address within an mm to scan */
unsigned long next_addr;
@@ -479,12 +497,87 @@ void lru_gen_init_lruvec(struct lruvec *lruvec);
void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
#ifdef CONFIG_MEMCG
+
+/*
+ * For each node, memcgs are divided into two generations: the old and the
+ * young. For each generation, memcgs are randomly sharded into multiple bins
+ * to improve scalability. For each bin, the hlist_nulls is virtually divided
+ * into three segments: the head, the tail and the default.
+ *
+ * An onlining memcg is added to the tail of a random bin in the old generation.
+ * The eviction starts at the head of a random bin in the old generation. The
+ * per-node memcg generation counter, whose reminder (mod MEMCG_NR_GENS) indexes
+ * the old generation, is incremented when all its bins become empty.
+ *
+ * There are four operations:
+ * 1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in its
+ * current generation (old or young) and updates its "seg" to "head";
+ * 2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in its
+ * current generation (old or young) and updates its "seg" to "tail";
+ * 3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in the old
+ * generation, updates its "gen" to "old" and resets its "seg" to "default";
+ * 4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin in the
+ * young generation, updates its "gen" to "young" and resets its "seg" to
+ * "default".
+ *
+ * The events that trigger the above operations are:
+ * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD;
+ * 2. The first attempt to reclaim an memcg below low, which triggers
+ * MEMCG_LRU_TAIL;
+ * 3. The first attempt to reclaim an memcg below reclaimable size threshold,
+ * which triggers MEMCG_LRU_TAIL;
+ * 4. The second attempt to reclaim an memcg below reclaimable size threshold,
+ * which triggers MEMCG_LRU_YOUNG;
+ * 5. Attempting to reclaim an memcg below min, which triggers MEMCG_LRU_YOUNG;
+ * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG;
+ * 7. Offlining an memcg, which triggers MEMCG_LRU_OLD.
+ *
+ * Note that memcg LRU only applies to global reclaim, and the round-robin
+ * incrementing of their max_seq counters ensures the eventual fairness to all
+ * eligible memcgs. For memcg reclaim, it still relies on mem_cgroup_iter().
+ */
+#define MEMCG_NR_GENS 2
+#define MEMCG_NR_BINS 8
+
+struct lru_gen_memcg {
+ /* the per-node memcg generation counter */
+ unsigned long seq;
+ /* each memcg has one lru_gen_folio per node */
+ unsigned long nr_memcgs[MEMCG_NR_GENS];
+ /* per-node lru_gen_folio list for global reclaim */
+ struct hlist_nulls_head fifo[MEMCG_NR_GENS][MEMCG_NR_BINS];
+ /* protects the above */
+ spinlock_t lock;
+};
+
+void lru_gen_init_pgdat(struct pglist_data *pgdat);
+
void lru_gen_init_memcg(struct mem_cgroup *memcg);
void lru_gen_exit_memcg(struct mem_cgroup *memcg);
-#endif
+void lru_gen_online_memcg(struct mem_cgroup *memcg);
+void lru_gen_offline_memcg(struct mem_cgroup *memcg);
+void lru_gen_release_memcg(struct mem_cgroup *memcg);
+void lru_gen_rotate_memcg(struct lruvec *lruvec, int op);
+
+#else /* !CONFIG_MEMCG */
+
+#define MEMCG_NR_GENS 1
+
+struct lru_gen_memcg {
+};
+
+static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
+{
+}
+
+#endif /* CONFIG_MEMCG */
#else /* !CONFIG_LRU_GEN */
+static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
+{
+}
+
static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
{
}
@@ -494,6 +587,7 @@ static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
}
#ifdef CONFIG_MEMCG
+
static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
{
}
@@ -501,7 +595,24 @@ static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg)
{
}
-#endif
+
+static inline void lru_gen_online_memcg(struct mem_cgroup *memcg)
+{
+}
+
+static inline void lru_gen_offline_memcg(struct mem_cgroup *memcg)
+{
+}
+
+static inline void lru_gen_release_memcg(struct mem_cgroup *memcg)
+{
+}
+
+static inline void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
+{
+}
+
+#endif /* CONFIG_MEMCG */
#endif /* CONFIG_LRU_GEN */
@@ -524,7 +635,7 @@ struct lruvec {
unsigned long flags;
#ifdef CONFIG_LRU_GEN
/* evictable pages divided into generations */
- struct lru_gen_struct lrugen;
+ struct lru_gen_folio lrugen;
/* to concurrently iterate lru_gen_mm_list */
struct lru_gen_mm_state mm_state;
#endif
@@ -1243,6 +1354,8 @@ typedef struct pglist_data {
#ifdef CONFIG_LRU_GEN
/* kswap mm walk data */
struct lru_gen_mm_walk mm_walk;
+ /* lru_gen_folio list */
+ struct lru_gen_memcg memcg_lru;
#endif
CACHELINE_PADDING(_pad2_);
diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
index 959f52e5867d..27a6df448ee5 100644
--- a/include/linux/pagewalk.h
+++ b/include/linux/pagewalk.h
@@ -21,7 +21,16 @@ struct mm_walk;
* depth is -1 if not known, 0:PGD, 1:P4D, 2:PUD, 3:PMD.
* Any folded depths (where PTRS_PER_P?D is equal to 1)
* are skipped.
- * @hugetlb_entry: if set, called for each hugetlb entry
+ * @hugetlb_entry: if set, called for each hugetlb entry. This hook
+ * function is called with the vma lock held, in order to
+ * protect against a concurrent freeing of the pte_t* or
+ * the ptl. In some cases, the hook function needs to drop
+ * and retake the vma lock in order to avoid deadlocks
+ * while calling other functions. In such cases the hook
+ * function must either refrain from accessing the pte or
+ * ptl after dropping the vma lock, or else revalidate
+ * those items after re-acquiring the vma lock and before
+ * accessing them.
* @test_walk: caller specific callback function to determine whether
* we walk over the current vma or not. Returning 0 means
* "do page table walk over the current vma", returning
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 07481bb87d4e..c758809d5bcf 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -16,6 +16,21 @@
struct fs_pin;
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
+/*
+ * sysctl for vm.memfd_noexec
+ * 0: memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL
+ * acts like MFD_EXEC was set.
+ * 1: memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL
+ * acts like MFD_NOEXEC_SEAL was set.
+ * 2: memfd_create() without MFD_NOEXEC_SEAL will be
+ * rejected.
+ */
+#define MEMFD_NOEXEC_SCOPE_EXEC 0
+#define MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL 1
+#define MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED 2
+#endif
+
struct pid_namespace {
struct idr idr;
struct rcu_head rcu;
@@ -31,6 +46,10 @@ struct pid_namespace {
struct ucounts *ucounts;
int reboot; /* group exit code if this pidns was rebooted */
struct ns_common ns;
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
+ /* sysctl for vm.memfd_noexec */
+ int memfd_noexec_scope;
+#endif
} __randomize_layout;
extern struct pid_namespace init_pid_ns;
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index 5834bad8ad78..a61e7d55d0d3 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -72,7 +72,7 @@ struct kmem_cache {
int obj_offset;
#endif /* CONFIG_DEBUG_SLAB */
-#ifdef CONFIG_KASAN
+#ifdef CONFIG_KASAN_GENERIC
struct kasan_cache kasan_info;
#endif
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index aa0ee1678d29..f6df03f934e5 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -136,7 +136,7 @@ struct kmem_cache {
unsigned int *random_seq;
#endif
-#ifdef CONFIG_KASAN
+#ifdef CONFIG_KASAN_GENERIC
struct kasan_cache kasan_info;
#endif
diff --git a/include/linux/string.h b/include/linux/string.h
index db28802ab0a6..c062c581a98b 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -177,6 +177,7 @@ extern char *kstrdup(const char *s, gfp_t gfp) __malloc;
extern const char *kstrdup_const(const char *s, gfp_t gfp);
extern char *kstrndup(const char *s, size_t len, gfp_t gfp);
extern void *kmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2);
+extern void *kvmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2);
extern char *kmemdup_nul(const char *s, size_t len, gfp_t gfp);
extern char **argv_split(gfp_t gfp, const char *str, int *argcp);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 0ceed49516ad..209a425739a9 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -401,8 +401,8 @@ extern void lru_add_drain(void);
extern void lru_add_drain_cpu(int cpu);
extern void lru_add_drain_cpu_zone(struct zone *zone);
extern void lru_add_drain_all(void);
-extern void deactivate_page(struct page *page);
-extern void mark_page_lazyfree(struct page *page);
+void folio_deactivate(struct folio *folio);
+void folio_mark_lazyfree(struct folio *folio);
extern void swap_setup(void);
extern void lru_cache_add_inactive_or_unevictable(struct page *page,
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index b982dd614572..3a451b7afcb3 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -337,7 +337,8 @@ extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
unsigned long address);
#ifdef CONFIG_HUGETLB_PAGE
-extern void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl);
+extern void __migration_entry_wait_huge(struct vm_area_struct *vma,
+ pte_t *ptep, spinlock_t *ptl);
extern void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte);
#endif /* CONFIG_HUGETLB_PAGE */
#else /* CONFIG_MIGRATION */
@@ -366,7 +367,8 @@ static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
unsigned long address) { }
#ifdef CONFIG_HUGETLB_PAGE
-static inline void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl) { }
+static inline void __migration_entry_wait_huge(struct vm_area_struct *vma,
+ pte_t *ptep, spinlock_t *ptl) { }
static inline void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte) { }
#endif /* CONFIG_HUGETLB_PAGE */
static inline int is_writable_migration_entry(swp_entry_t entry)
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 9df0b9a762cc..3767f18114ef 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -73,7 +73,7 @@ extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start,
extern int mwriteprotect_range(struct mm_struct *dst_mm,
unsigned long start, unsigned long len,
bool enable_wp, atomic_t *mmap_changing);
-extern void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *vma,
+extern long uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *vma,
unsigned long start, unsigned long len, bool enable_wp);
/* mm helpers */
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 06f9291b6fd5..2554b71765e9 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -369,8 +369,6 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb);
typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
void *data);
-int generic_writepages(struct address_space *mapping,
- struct writeback_control *wbc);
void tag_pages_for_writeback(struct address_space *mapping,
pgoff_t start, pgoff_t end);
int write_cache_pages(struct address_space *mapping,
diff --git a/include/trace/events/cma.h b/include/trace/events/cma.h
index 3d708dae1542..ef75ea606ab2 100644
--- a/include/trace/events/cma.h
+++ b/include/trace/events/cma.h
@@ -91,12 +91,38 @@ TRACE_EVENT(cma_alloc_start,
__entry->align)
);
-DEFINE_EVENT(cma_alloc_class, cma_alloc_finish,
+TRACE_EVENT(cma_alloc_finish,
TP_PROTO(const char *name, unsigned long pfn, const struct page *page,
- unsigned long count, unsigned int align),
+ unsigned long count, unsigned int align, int errorno),
- TP_ARGS(name, pfn, page, count, align)
+ TP_ARGS(name, pfn, page, count, align, errorno),
+
+ TP_STRUCT__entry(
+ __string(name, name)
+ __field(unsigned long, pfn)
+ __field(const struct page *, page)
+ __field(unsigned long, count)
+ __field(unsigned int, align)
+ __field(int, errorno)
+ ),
+
+ TP_fast_assign(
+ __assign_str(name, name);
+ __entry->pfn = pfn;
+ __entry->page = page;
+ __entry->count = count;
+ __entry->align = align;
+ __entry->errorno = errorno;
+ ),
+
+ TP_printk("name=%s pfn=0x%lx page=%p count=%lu align=%u errorno=%d",
+ __get_str(name),
+ __entry->pfn,
+ __entry->page,
+ __entry->count,
+ __entry->align,
+ __entry->errorno)
);
DEFINE_EVENT(cma_alloc_class, cma_alloc_busy_retry,
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 2f86b2ad6d7e..e8c07da58c9f 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -43,6 +43,7 @@
#define F_SEAL_GROW 0x0004 /* prevent file from growing */
#define F_SEAL_WRITE 0x0008 /* prevent writes */
#define F_SEAL_FUTURE_WRITE 0x0010 /* prevent future writes while mapped */
+#define F_SEAL_EXEC 0x0020 /* prevent chmod modifying exec bits */
/* (1U << 31) is reserved for signed error codes */
/*
diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h
index 7a8a26751c23..273a4e15dfcf 100644
--- a/include/uapi/linux/memfd.h
+++ b/include/uapi/linux/memfd.h
@@ -8,6 +8,10 @@
#define MFD_CLOEXEC 0x0001U
#define MFD_ALLOW_SEALING 0x0002U
#define MFD_HUGETLB 0x0004U
+/* not executable and sealed to prevent changing to executable. */
+#define MFD_NOEXEC_SEAL 0x0008U
+/* executable */
+#define MFD_EXEC 0x0010U
/*
* Huge page size encoding when MFD_HUGETLB is specified, and a huge page
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 2ac1cd8d23ea..3a934f733136 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3206,7 +3206,7 @@ static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
- return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
+ return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -EINVAL;
}
static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index f4f8cb0435b4..8a98b1af9376 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -23,6 +23,7 @@
#include <linux/sched/task.h>
#include <linux/sched/signal.h>
#include <linux/idr.h>
+#include "pid_sysctl.h"
static DEFINE_MUTEX(pid_caches_mutex);
static struct kmem_cache *pid_ns_cachep;
@@ -110,6 +111,8 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
ns->ucounts = ucounts;
ns->pid_allocated = PIDNS_ADDING;
+ initialize_memfd_noexec_scope(ns);
+
return ns;
out_free_idr:
@@ -455,6 +458,8 @@ static __init int pid_namespaces_init(void)
#ifdef CONFIG_CHECKPOINT_RESTORE
register_sysctl_paths(kern_path, pid_ns_ctl_table);
#endif
+
+ register_pid_ns_sysctl_table_vm();
return 0;
}
diff --git a/kernel/pid_sysctl.h b/kernel/pid_sysctl.h
new file mode 100644
index 000000000000..e22d072e1e24
--- /dev/null
+++ b/kernel/pid_sysctl.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef LINUX_PID_SYSCTL_H
+#define LINUX_PID_SYSCTL_H
+
+#include <linux/pid_namespace.h>
+
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
+static inline void initialize_memfd_noexec_scope(struct pid_namespace *ns)
+{
+ ns->memfd_noexec_scope =
+ task_active_pid_ns(current)->memfd_noexec_scope;
+}
+
+static int pid_mfd_noexec_dointvec_minmax(struct ctl_table *table,
+ int write, void *buf, size_t *lenp, loff_t *ppos)
+{
+ struct pid_namespace *ns = task_active_pid_ns(current);
+ struct ctl_table table_copy;
+
+ if (write && !ns_capable(ns->user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ table_copy = *table;
+ if (ns != &init_pid_ns)
+ table_copy.data = &ns->memfd_noexec_scope;
+
+ /*
+ * set minimum to current value, the effect is only bigger
+ * value is accepted.
+ */
+ if (*(int *)table_copy.data > *(int *)table_copy.extra1)
+ table_copy.extra1 = table_copy.data;
+
+ return proc_dointvec_minmax(&table_copy, write, buf, lenp, ppos);
+}
+
+static struct ctl_table pid_ns_ctl_table_vm[] = {
+ {
+ .procname = "memfd_noexec",
+ .data = &init_pid_ns.memfd_noexec_scope,
+ .maxlen = sizeof(init_pid_ns.memfd_noexec_scope),
+ .mode = 0644,
+ .proc_handler = pid_mfd_noexec_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
+ },
+ { }
+};
+static struct ctl_path vm_path[] = { { .procname = "vm", }, { } };
+static inline void register_pid_ns_sysctl_table_vm(void)
+{
+ register_sysctl_paths(vm_path, pid_ns_ctl_table_vm);
+}
+#else
+static inline void initialize_memfd_noexec_scope(struct pid_namespace *ns) {}
+static inline void set_memfd_noexec_scope(struct pid_namespace *ns) {}
+static inline void register_pid_ns_sysctl_table_vm(void) {}
+#endif
+
+#endif /* LINUX_PID_SYSCTL_H */
diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index 5a976393c9ae..baff62a012e1 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -149,13 +149,12 @@ struct maple_subtree_state {
/* Functions */
static inline struct maple_node *mt_alloc_one(gfp_t gfp)
{
- return kmem_cache_alloc(maple_node_cache, gfp | __GFP_ZERO);
+ return kmem_cache_alloc(maple_node_cache, gfp);
}
static inline int mt_alloc_bulk(gfp_t gfp, size_t size, void **nodes)
{
- return kmem_cache_alloc_bulk(maple_node_cache, gfp | __GFP_ZERO, size,
- nodes);
+ return kmem_cache_alloc_bulk(maple_node_cache, gfp, size, nodes);
}
static inline void mt_free_bulk(size_t size, void __rcu **nodes)
@@ -183,7 +182,6 @@ static void ma_free_rcu(struct maple_node *node)
call_rcu(&node->rcu, mt_free_rcu);
}
-
static void mas_set_height(struct ma_state *mas)
{
unsigned int new_flags = mas->tree->ma_flags;
@@ -468,7 +466,7 @@ static inline
void mte_set_parent(struct maple_enode *enode, const struct maple_enode *parent,
unsigned char slot)
{
- unsigned long val = (unsigned long) parent;
+ unsigned long val = (unsigned long)parent;
unsigned long shift;
unsigned long type;
enum maple_type p_type = mte_node_type(parent);
@@ -502,10 +500,9 @@ void mte_set_parent(struct maple_enode *enode, const struct maple_enode *parent,
*/
static inline unsigned int mte_parent_slot(const struct maple_enode *enode)
{
- unsigned long val = (unsigned long) mte_to_node(enode)->parent;
+ unsigned long val = (unsigned long)mte_to_node(enode)->parent;
- /* Root. */
- if (val & 1)
+ if (val & MA_ROOT_PARENT)
return 0;
/*
@@ -1128,9 +1125,10 @@ static inline struct maple_node *mas_pop_node(struct ma_state *mas)
{
struct maple_alloc *ret, *node = mas->alloc;
unsigned long total = mas_allocated(mas);
+ unsigned int req = mas_alloc_req(mas);
/* nothing or a request pending. */
- if (unlikely(!total))
+ if (WARN_ON(!total))
return NULL;
if (total == 1) {
@@ -1140,27 +1138,25 @@ static inline struct maple_node *mas_pop_node(struct ma_state *mas)
goto single_node;
}
- if (!node->node_count) {
+ if (node->node_count == 1) {
/* Single allocation in this node. */
mas->alloc = node->slot[0];
- node->slot[0] = NULL;
mas->alloc->total = node->total - 1;
ret = node;
goto new_head;
}
-
node->total--;
- ret = node->slot[node->node_count];
- node->slot[node->node_count--] = NULL;
+ ret = node->slot[--node->node_count];
+ node->slot[node->node_count] = NULL;
single_node:
new_head:
- ret->total = 0;
- ret->node_count = 0;
- if (ret->request_count) {
- mas_set_alloc_req(mas, ret->request_count + 1);
- ret->request_count = 0;
+ if (req) {
+ req++;
+ mas_set_alloc_req(mas, req);
}
+
+ memset(ret, 0, sizeof(*ret));
return (struct maple_node *)ret;
}
@@ -1179,21 +1175,20 @@ static inline void mas_push_node(struct ma_state *mas, struct maple_node *used)
unsigned long count;
unsigned int requested = mas_alloc_req(mas);
- memset(reuse, 0, sizeof(*reuse));
count = mas_allocated(mas);
- if (count && (head->node_count < MAPLE_ALLOC_SLOTS - 1)) {
- if (head->slot[0])
- head->node_count++;
- head->slot[head->node_count] = reuse;
+ reuse->request_count = 0;
+ reuse->node_count = 0;
+ if (count && (head->node_count < MAPLE_ALLOC_SLOTS)) {
+ head->slot[head->node_count++] = reuse;
head->total++;
goto done;
}
reuse->total = 1;
if ((head) && !((unsigned long)head & 0x1)) {
- head->request_count = 0;
reuse->slot[0] = head;
+ reuse->node_count = 1;
reuse->total += head->total;
}
@@ -1212,7 +1207,6 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp)
{
struct maple_alloc *node;
unsigned long allocated = mas_allocated(mas);
- unsigned long success = allocated;
unsigned int requested = mas_alloc_req(mas);
unsigned int count;
void **slots = NULL;
@@ -1228,24 +1222,29 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp)
WARN_ON(!allocated);
}
- if (!allocated || mas->alloc->node_count == MAPLE_ALLOC_SLOTS - 1) {
+ if (!allocated || mas->alloc->node_count == MAPLE_ALLOC_SLOTS) {
node = (struct maple_alloc *)mt_alloc_one(gfp);
if (!node)
goto nomem_one;
- if (allocated)
+ if (allocated) {
node->slot[0] = mas->alloc;
+ node->node_count = 1;
+ } else {
+ node->node_count = 0;
+ }
- success++;
mas->alloc = node;
+ node->total = ++allocated;
requested--;
}
node = mas->alloc;
+ node->request_count = 0;
while (requested) {
max_req = MAPLE_ALLOC_SLOTS;
- if (node->slot[0]) {
- unsigned int offset = node->node_count + 1;
+ if (node->node_count) {
+ unsigned int offset = node->node_count;
slots = (void **)&node->slot[offset];
max_req -= offset;
@@ -1259,15 +1258,13 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp)
goto nomem_bulk;
node->node_count += count;
- /* zero indexed. */
- if (slots == (void **)&node->slot)
- node->node_count--;
-
- success += count;
+ allocated += count;
node = node->slot[0];
+ node->node_count = 0;
+ node->request_count = 0;
requested -= count;
}
- mas->alloc->total = success;
+ mas->alloc->total = allocated;
return;
nomem_bulk:
@@ -1276,10 +1273,8 @@ nomem_bulk:
nomem_one:
mas_set_alloc_req(mas, requested);
if (mas->alloc && !(((unsigned long)mas->alloc & 0x1)))
- mas->alloc->total = success;
+ mas->alloc->total = allocated;
mas_set_err(mas, -ENOMEM);
- return;
-
}
/*
@@ -1334,7 +1329,7 @@ static void mas_node_count(struct ma_state *mas, int count)
* mas_start() - Sets up maple state for operations.
* @mas: The maple state.
*
- * If mas->node == MAS_START, then set the min, max, depth, and offset to
+ * If mas->node == MAS_START, then set the min, max and depth to
* defaults.
*
* Return:
@@ -1348,22 +1343,22 @@ static inline struct maple_enode *mas_start(struct ma_state *mas)
if (likely(mas_is_start(mas))) {
struct maple_enode *root;
- mas->node = MAS_NONE;
mas->min = 0;
mas->max = ULONG_MAX;
mas->depth = 0;
- mas->offset = 0;
root = mas_root(mas);
/* Tree with nodes */
if (likely(xa_is_node(root))) {
mas->depth = 1;
mas->node = mte_safe_root(root);
+ mas->offset = 0;
return NULL;
}
/* empty tree */
if (unlikely(!root)) {
+ mas->node = MAS_NONE;
mas->offset = MAPLE_NODE_SLOTS;
return NULL;
}
@@ -1887,10 +1882,9 @@ static inline int mab_calc_split(struct ma_state *mas,
/* Avoid ending a node on a NULL entry */
split = mab_no_null_split(bn, split, slot_count);
- if (!(*mid_split))
- return split;
- *mid_split = mab_no_null_split(bn, *mid_split, slot_count);
+ if (unlikely(*mid_split))
+ *mid_split = mab_no_null_split(bn, *mid_split, slot_count);
return split;
}
@@ -2947,7 +2941,7 @@ next:
mas->min = prev_min;
mas->max = prev_max;
mas->node = last;
- return (void *) next;
+ return (void *)next;
dead_node:
mas_reset(mas);
@@ -3467,7 +3461,6 @@ static inline bool mas_push_data(struct ma_state *mas, int height,
*/
static int mas_split(struct ma_state *mas, struct maple_big_node *b_node)
{
-
struct maple_subtree_state mast;
int height = 0;
unsigned char mid_split, split = 0;
@@ -3893,7 +3886,7 @@ next:
goto dead_node;
} while (!ma_is_leaf(type));
- return (void *) next;
+ return (void *)next;
dead_node:
mas_reset(mas);
@@ -4711,15 +4704,11 @@ found:
static inline void mas_rewalk(struct ma_state *mas, unsigned long index)
{
-
retry:
mas_set(mas, index);
mas_state_walk(mas);
if (mas_is_start(mas))
goto retry;
-
- return;
-
}
/*
@@ -5620,7 +5609,6 @@ static void mas_wr_store_setup(struct ma_wr_state *wr_mas)
mas_reset(wr_mas->mas);
}
}
-
}
/* Interface */
@@ -5745,6 +5733,7 @@ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp)
void mas_destroy(struct ma_state *mas)
{
struct maple_alloc *node;
+ unsigned long total;
/*
* When using mas_for_each() to insert an expected number of elements,
@@ -5767,14 +5756,20 @@ void mas_destroy(struct ma_state *mas)
}
mas->mas_flags &= ~(MA_STATE_BULK|MA_STATE_PREALLOC);
- while (mas->alloc && !((unsigned long)mas->alloc & 0x1)) {
+ total = mas_allocated(mas);
+ while (total) {
node = mas->alloc;
mas->alloc = node->slot[0];
- if (node->node_count > 0)
- mt_free_bulk(node->node_count,
- (void __rcu **)&node->slot[1]);
+ if (node->node_count > 1) {
+ size_t count = node->node_count - 1;
+
+ mt_free_bulk(count, (void __rcu **)&node->slot[1]);
+ total -= count;
+ }
kmem_cache_free(maple_node_cache, node);
+ total--;
}
+
mas->alloc = NULL;
}
EXPORT_SYMBOL_GPL(mas_destroy);
@@ -6734,7 +6729,7 @@ static void mt_dump_range64(const struct maple_tree *mt, void *entry,
if (i < (MAPLE_RANGE64_SLOTS - 1))
last = node->pivot[i];
- else if (!node->slot[i] && max != mt_max[mte_node_type(entry)])
+ else if (!node->slot[i] && max != mt_node_max(entry))
break;
if (last == 0 && i > 0)
break;
@@ -6841,7 +6836,7 @@ void mt_dump(const struct maple_tree *mt)
if (!xa_is_node(entry))
mt_dump_entry(entry, 0, 0, 0);
else if (entry)
- mt_dump_node(mt, entry, 0, mt_max[mte_node_type(entry)], 0);
+ mt_dump_node(mt, entry, 0, mt_node_max(entry), 0);
}
EXPORT_SYMBOL_GPL(mt_dump);
diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c
index f90d2c27675b..de4ee0d50906 100644
--- a/lib/test_vmalloc.c
+++ b/lib/test_vmalloc.c
@@ -38,6 +38,9 @@ __param(int, test_loop_count, 1000000,
__param(int, nr_pages, 0,
"Set number of pages for fix_size_alloc_test(default: 1)");
+__param(bool, use_huge, false,
+ "Use vmalloc_huge in fix_size_alloc_test");
+
__param(int, run_test_mask, INT_MAX,
"Set tests specified in the mask.\n\n"
"\t\tid: 1, name: fix_size_alloc_test\n"
@@ -264,7 +267,10 @@ static int fix_size_alloc_test(void)
int i;
for (i = 0; i < test_loop_count; i++) {
- ptr = vmalloc((nr_pages > 0 ? nr_pages:1) * PAGE_SIZE);
+ if (use_huge)
+ ptr = vmalloc_huge((nr_pages > 0 ? nr_pages:1) * PAGE_SIZE, GFP_KERNEL);
+ else
+ ptr = vmalloc((nr_pages > 0 ? nr_pages:1) * PAGE_SIZE);
if (!ptr)
return -1;
diff --git a/mm/Kconfig b/mm/Kconfig
index ff7b209dec05..39df30dcabe3 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1073,7 +1073,7 @@ config GUP_TEST
pin_user_pages*(), or pinned via get_user_pages*(), as specified
by other command line arguments.
- See tools/testing/selftests/vm/gup_test.c
+ See tools/testing/selftests/mm/gup_test.c
comment "GUP_TEST needs to have DEBUG_FS enabled"
depends on !GUP_TEST && !DEBUG_FS
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index fca699ad1fb0..d62f48131952 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -90,7 +90,7 @@ config PAGE_OWNER
help to find bare alloc_page(s) leaks. Even if you include this
feature on your build, it is disabled in default. You should pass
"page_owner=on" to boot parameter in order to enable it. Eats
- a fair amount of memory if enabled. See tools/vm/page_owner_sort.c
+ a fair amount of memory if enabled. See tools/mm/page_owner_sort.c
for user-space helper.
If unsure, say N.
diff --git a/mm/cma.c b/mm/cma.c
index 4a978e09547a..a75b17b03b66 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -491,7 +491,7 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
start = bitmap_no + mask + 1;
}
- trace_cma_alloc_finish(cma->name, pfn, page, count, align);
+ trace_cma_alloc_finish(cma->name, pfn, page, count, align, ret);
/*
* CMA can allocate multiple page blocks, which results in different
diff --git a/mm/compaction.c b/mm/compaction.c
index ca1603524bbe..62a61de44658 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -122,7 +122,6 @@ bool PageMovable(struct page *page)
return false;
}
-EXPORT_SYMBOL(PageMovable);
void __SetPageMovable(struct page *page, const struct movable_operations *mops)
{
diff --git a/mm/damon/core.c b/mm/damon/core.c
index ceec75b88ef9..1bf0654ae189 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -263,6 +263,40 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
return 0;
}
+struct damos_filter *damos_new_filter(enum damos_filter_type type,
+ bool matching)
+{
+ struct damos_filter *filter;
+
+ filter = kmalloc(sizeof(*filter), GFP_KERNEL);
+ if (!filter)
+ return NULL;
+ filter->type = type;
+ filter->matching = matching;
+ return filter;
+}
+
+void damos_add_filter(struct damos *s, struct damos_filter *f)
+{
+ list_add_tail(&f->list, &s->filters);
+}
+
+static void damos_del_filter(struct damos_filter *f)
+{
+ list_del(&f->list);
+}
+
+static void damos_free_filter(struct damos_filter *f)
+{
+ kfree(f);
+}
+
+void damos_destroy_filter(struct damos_filter *f)
+{
+ damos_del_filter(f);
+ damos_free_filter(f);
+}
+
/* initialize private fields of damos_quota and return the pointer */
static struct damos_quota *damos_quota_init_priv(struct damos_quota *quota)
{
@@ -287,6 +321,7 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern,
return NULL;
scheme->pattern = *pattern;
scheme->action = action;
+ INIT_LIST_HEAD(&scheme->filters);
scheme->stat = (struct damos_stat){};
INIT_LIST_HEAD(&scheme->list);
@@ -315,6 +350,10 @@ static void damon_free_scheme(struct damos *s)
void damon_destroy_scheme(struct damos *s)
{
+ struct damos_filter *f, *next;
+
+ damos_for_each_filter_safe(f, next, s)
+ damos_destroy_filter(f);
damon_del_scheme(s);
damon_free_scheme(s);
}
diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c
index 75409601f934..cc63cf953636 100644
--- a/mm/damon/ops-common.c
+++ b/mm/damon/ops-common.c
@@ -16,29 +16,33 @@
* Get an online page for a pfn if it's in the LRU list. Otherwise, returns
* NULL.
*
- * The body of this function is stolen from the 'page_idle_get_page()'. We
+ * The body of this function is stolen from the 'page_idle_get_folio()'. We
* steal rather than reuse it because the code is quite simple.
*/
-struct page *damon_get_page(unsigned long pfn)
+struct folio *damon_get_folio(unsigned long pfn)
{
struct page *page = pfn_to_online_page(pfn);
+ struct folio *folio;
- if (!page || !PageLRU(page) || !get_page_unless_zero(page))
+ if (!page || PageTail(page))
return NULL;
- if (unlikely(!PageLRU(page))) {
- put_page(page);
- page = NULL;
+ folio = page_folio(page);
+ if (!folio_test_lru(folio) || !folio_try_get(folio))
+ return NULL;
+ if (unlikely(page_folio(page) != folio || !folio_test_lru(folio))) {
+ folio_put(folio);
+ folio = NULL;
}
- return page;
+ return folio;
}
void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr)
{
bool referenced = false;
- struct page *page = damon_get_page(pte_pfn(*pte));
+ struct folio *folio = damon_get_folio(pte_pfn(*pte));
- if (!page)
+ if (!folio)
return;
if (pte_young(*pte)) {
@@ -52,19 +56,19 @@ void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr)
#endif /* CONFIG_MMU_NOTIFIER */
if (referenced)
- set_page_young(page);
+ folio_set_young(folio);
- set_page_idle(page);
- put_page(page);
+ folio_set_idle(folio);
+ folio_put(folio);
}
void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
bool referenced = false;
- struct page *page = damon_get_page(pmd_pfn(*pmd));
+ struct folio *folio = damon_get_folio(pmd_pfn(*pmd));
- if (!page)
+ if (!folio)
return;
if (pmd_young(*pmd)) {
@@ -78,10 +82,10 @@ void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr)
#endif /* CONFIG_MMU_NOTIFIER */
if (referenced)
- set_page_young(page);
+ folio_set_young(folio);
- set_page_idle(page);
- put_page(page);
+ folio_set_idle(folio);
+ folio_put(folio);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
}
diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h
index 8d82d3722204..14f4bc69f29b 100644
--- a/mm/damon/ops-common.h
+++ b/mm/damon/ops-common.h
@@ -7,7 +7,7 @@
#include <linux/damon.h>
-struct page *damon_get_page(unsigned long pfn);
+struct folio *damon_get_folio(unsigned long pfn);
void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr);
void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr);
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index e1a4315c4be6..b4df9b9bcc0a 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -33,17 +33,15 @@ static bool __damon_pa_mkold(struct folio *folio, struct vm_area_struct *vma,
static void damon_pa_mkold(unsigned long paddr)
{
- struct folio *folio;
- struct page *page = damon_get_page(PHYS_PFN(paddr));
+ struct folio *folio = damon_get_folio(PHYS_PFN(paddr));
struct rmap_walk_control rwc = {
.rmap_one = __damon_pa_mkold,
.anon_lock = folio_lock_anon_vma_read,
};
bool need_lock;
- if (!page)
+ if (!folio)
return;
- folio = page_folio(page);
if (!folio_mapped(folio) || !folio_raw_mapping(folio)) {
folio_set_idle(folio);
@@ -81,69 +79,57 @@ static void damon_pa_prepare_access_checks(struct damon_ctx *ctx)
}
}
-struct damon_pa_access_chk_result {
- unsigned long page_sz;
- bool accessed;
-};
-
static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma,
unsigned long addr, void *arg)
{
- struct damon_pa_access_chk_result *result = arg;
+ bool *accessed = arg;
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0);
- result->accessed = false;
- result->page_sz = PAGE_SIZE;
+ *accessed = false;
while (page_vma_mapped_walk(&pvmw)) {
addr = pvmw.address;
if (pvmw.pte) {
- result->accessed = pte_young(*pvmw.pte) ||
+ *accessed = pte_young(*pvmw.pte) ||
!folio_test_idle(folio) ||
mmu_notifier_test_young(vma->vm_mm, addr);
} else {
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- result->accessed = pmd_young(*pvmw.pmd) ||
+ *accessed = pmd_young(*pvmw.pmd) ||
!folio_test_idle(folio) ||
mmu_notifier_test_young(vma->vm_mm, addr);
- result->page_sz = HPAGE_PMD_SIZE;
#else
WARN_ON_ONCE(1);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
}
- if (result->accessed) {
+ if (*accessed) {
page_vma_mapped_walk_done(&pvmw);
break;
}
}
/* If accessed, stop walking */
- return !result->accessed;
+ return *accessed == false;
}
-static bool damon_pa_young(unsigned long paddr, unsigned long *page_sz)
+static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz)
{
- struct folio *folio;
- struct page *page = damon_get_page(PHYS_PFN(paddr));
- struct damon_pa_access_chk_result result = {
- .page_sz = PAGE_SIZE,
- .accessed = false,
- };
+ struct folio *folio = damon_get_folio(PHYS_PFN(paddr));
+ bool accessed = false;
struct rmap_walk_control rwc = {
- .arg = &result,
+ .arg = &accessed,
.rmap_one = __damon_pa_young,
.anon_lock = folio_lock_anon_vma_read,
};
bool need_lock;
- if (!page)
+ if (!folio)
return false;
- folio = page_folio(page);
if (!folio_mapped(folio) || !folio_raw_mapping(folio)) {
if (folio_test_idle(folio))
- result.accessed = false;
+ accessed = false;
else
- result.accessed = true;
+ accessed = true;
folio_put(folio);
goto out;
}
@@ -161,25 +147,25 @@ static bool damon_pa_young(unsigned long paddr, unsigned long *page_sz)
folio_put(folio);
out:
- *page_sz = result.page_sz;
- return result.accessed;
+ *folio_sz = folio_size(folio);
+ return accessed;
}
static void __damon_pa_check_access(struct damon_region *r)
{
static unsigned long last_addr;
- static unsigned long last_page_sz = PAGE_SIZE;
+ static unsigned long last_folio_sz = PAGE_SIZE;
static bool last_accessed;
/* If the region is in the last checked page, reuse the result */
- if (ALIGN_DOWN(last_addr, last_page_sz) ==
- ALIGN_DOWN(r->sampling_addr, last_page_sz)) {
+ if (ALIGN_DOWN(last_addr, last_folio_sz) ==
+ ALIGN_DOWN(r->sampling_addr, last_folio_sz)) {
if (last_accessed)
r->nr_accesses++;
return;
}
- last_accessed = damon_pa_young(r->sampling_addr, &last_page_sz);
+ last_accessed = damon_pa_young(r->sampling_addr, &last_folio_sz);
if (last_accessed)
r->nr_accesses++;
@@ -202,63 +188,116 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx)
return max_nr_accesses;
}
-static unsigned long damon_pa_pageout(struct damon_region *r)
+static bool __damos_pa_filter_out(struct damos_filter *filter,
+ struct folio *folio)
+{
+ bool matched = false;
+ struct mem_cgroup *memcg;
+
+ switch (filter->type) {
+ case DAMOS_FILTER_TYPE_ANON:
+ matched = folio_test_anon(folio);
+ break;
+ case DAMOS_FILTER_TYPE_MEMCG:
+ rcu_read_lock();
+ memcg = folio_memcg_check(folio);
+ if (!memcg)
+ matched = false;
+ else
+ matched = filter->memcg_id == mem_cgroup_id(memcg);
+ rcu_read_unlock();
+ break;
+ default:
+ break;
+ }
+
+ return matched == filter->matching;
+}
+
+/*
+ * damos_pa_filter_out - Return true if the page should be filtered out.
+ */
+static bool damos_pa_filter_out(struct damos *scheme, struct folio *folio)
+{
+ struct damos_filter *filter;
+
+ damos_for_each_filter(filter, scheme) {
+ if (__damos_pa_filter_out(filter, folio))
+ return true;
+ }
+ return false;
+}
+
+static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s)
{
unsigned long addr, applied;
- LIST_HEAD(page_list);
+ LIST_HEAD(folio_list);
for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
- struct page *page = damon_get_page(PHYS_PFN(addr));
+ struct folio *folio = damon_get_folio(PHYS_PFN(addr));
+
+ if (!folio)
+ continue;
- if (!page)
+ if (damos_pa_filter_out(s, folio)) {
+ folio_put(folio);
continue;
+ }
- ClearPageReferenced(page);
- test_and_clear_page_young(page);
- if (isolate_lru_page(page)) {
- put_page(page);
+ folio_clear_referenced(folio);
+ folio_test_clear_young(folio);
+ if (folio_isolate_lru(folio)) {
+ folio_put(folio);
continue;
}
- if (PageUnevictable(page)) {
- putback_lru_page(page);
+ if (folio_test_unevictable(folio)) {
+ folio_putback_lru(folio);
} else {
- list_add(&page->lru, &page_list);
- put_page(page);
+ list_add(&folio->lru, &folio_list);
+ folio_put(folio);
}
}
- applied = reclaim_pages(&page_list);
+ applied = reclaim_pages(&folio_list);
cond_resched();
return applied * PAGE_SIZE;
}
static inline unsigned long damon_pa_mark_accessed_or_deactivate(
- struct damon_region *r, bool mark_accessed)
+ struct damon_region *r, struct damos *s, bool mark_accessed)
{
unsigned long addr, applied = 0;
for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
- struct page *page = damon_get_page(PHYS_PFN(addr));
+ struct folio *folio = damon_get_folio(PHYS_PFN(addr));
+
+ if (!folio)
+ continue;
- if (!page)
+ if (damos_pa_filter_out(s, folio)) {
+ folio_put(folio);
continue;
+ }
+
if (mark_accessed)
- mark_page_accessed(page);
+ folio_mark_accessed(folio);
else
- deactivate_page(page);
- put_page(page);
- applied++;
+ folio_deactivate(folio);
+ folio_put(folio);
+ applied += folio_nr_pages(folio);
}
return applied * PAGE_SIZE;
}
-static unsigned long damon_pa_mark_accessed(struct damon_region *r)
+static unsigned long damon_pa_mark_accessed(struct damon_region *r,
+ struct damos *s)
{
- return damon_pa_mark_accessed_or_deactivate(r, true);
+ return damon_pa_mark_accessed_or_deactivate(r, s, true);
}
-static unsigned long damon_pa_deactivate_pages(struct damon_region *r)
+static unsigned long damon_pa_deactivate_pages(struct damon_region *r,
+ struct damos *s)
{
- return damon_pa_mark_accessed_or_deactivate(r, false);
+ return damon_pa_mark_accessed_or_deactivate(r, s, false);
}
static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
@@ -267,11 +306,11 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
{
switch (scheme->action) {
case DAMOS_PAGEOUT:
- return damon_pa_pageout(r);
+ return damon_pa_pageout(r, scheme);
case DAMOS_LRU_PRIO:
- return damon_pa_mark_accessed(r);
+ return damon_pa_mark_accessed(r, scheme);
case DAMOS_LRU_DEPRIO:
- return damon_pa_deactivate_pages(r);
+ return damon_pa_deactivate_pages(r, scheme);
case DAMOS_STAT:
break;
default:
diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c
index e82631f39481..648d2a85523a 100644
--- a/mm/damon/reclaim.c
+++ b/mm/damon/reclaim.c
@@ -99,6 +99,15 @@ static unsigned long monitor_region_end __read_mostly;
module_param(monitor_region_end, ulong, 0600);
/*
+ * Skip anonymous pages reclamation.
+ *
+ * If this parameter is set as ``Y``, DAMON_RECLAIM does not reclaim anonymous
+ * pages. By default, ``N``.
+ */
+static bool skip_anon __read_mostly;
+module_param(skip_anon, bool, 0600);
+
+/*
* PID of the DAMON thread
*
* If DAMON_RECLAIM is enabled, this becomes the PID of the worker thread.
@@ -142,6 +151,7 @@ static struct damos *damon_reclaim_new_scheme(void)
static int damon_reclaim_apply_parameters(void)
{
struct damos *scheme;
+ struct damos_filter *filter;
int err = 0;
err = damon_set_attrs(ctx, &damon_reclaim_mon_attrs);
@@ -152,6 +162,15 @@ static int damon_reclaim_apply_parameters(void)
scheme = damon_reclaim_new_scheme();
if (!scheme)
return -ENOMEM;
+ if (skip_anon) {
+ filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true);
+ if (!filter) {
+ /* Will be freed by next 'damon_set_schemes()' below */
+ damon_destroy_scheme(scheme);
+ return -ENOMEM;
+ }
+ damos_add_filter(scheme, filter);
+ }
damon_set_schemes(ctx, &scheme, 1);
return damon_set_region_biggest_system_ram_default(target,
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index 81fc4d27f4e4..86edca66aab1 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -259,6 +259,257 @@ static struct kobj_type damon_sysfs_stats_ktype = {
};
/*
+ * filter directory
+ */
+
+struct damon_sysfs_scheme_filter {
+ struct kobject kobj;
+ enum damos_filter_type type;
+ bool matching;
+ char *memcg_path;
+};
+
+static struct damon_sysfs_scheme_filter *damon_sysfs_scheme_filter_alloc(void)
+{
+ return kzalloc(sizeof(struct damon_sysfs_scheme_filter), GFP_KERNEL);
+}
+
+/* Should match with enum damos_filter_type */
+static const char * const damon_sysfs_scheme_filter_type_strs[] = {
+ "anon",
+ "memcg",
+};
+
+static ssize_t type_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+
+ return sysfs_emit(buf, "%s\n",
+ damon_sysfs_scheme_filter_type_strs[filter->type]);
+}
+
+static ssize_t type_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+ enum damos_filter_type type;
+ ssize_t ret = -EINVAL;
+
+ for (type = 0; type < NR_DAMOS_FILTER_TYPES; type++) {
+ if (sysfs_streq(buf, damon_sysfs_scheme_filter_type_strs[
+ type])) {
+ filter->type = type;
+ ret = count;
+ break;
+ }
+ }
+ return ret;
+}
+
+static ssize_t matching_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+
+ return sysfs_emit(buf, "%c\n", filter->matching ? 'Y' : 'N');
+}
+
+static ssize_t matching_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+ bool matching;
+ int err = kstrtobool(buf, &matching);
+
+ if (err)
+ return err;
+
+ filter->matching = matching;
+ return count;
+}
+
+static ssize_t memcg_path_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+
+ return sysfs_emit(buf, "%s\n",
+ filter->memcg_path ? filter->memcg_path : "");
+}
+
+static ssize_t memcg_path_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+ char *path = kmalloc(sizeof(*path) * (count + 1), GFP_KERNEL);
+
+ if (!path)
+ return -ENOMEM;
+
+ strscpy(path, buf, count + 1);
+ filter->memcg_path = path;
+ return count;
+}
+
+static void damon_sysfs_scheme_filter_release(struct kobject *kobj)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+
+ kfree(filter->memcg_path);
+ kfree(filter);
+}
+
+static struct kobj_attribute damon_sysfs_scheme_filter_type_attr =
+ __ATTR_RW_MODE(type, 0600);
+
+static struct kobj_attribute damon_sysfs_scheme_filter_matching_attr =
+ __ATTR_RW_MODE(matching, 0600);
+
+static struct kobj_attribute damon_sysfs_scheme_filter_memcg_path_attr =
+ __ATTR_RW_MODE(memcg_path, 0600);
+
+static struct attribute *damon_sysfs_scheme_filter_attrs[] = {
+ &damon_sysfs_scheme_filter_type_attr.attr,
+ &damon_sysfs_scheme_filter_matching_attr.attr,
+ &damon_sysfs_scheme_filter_memcg_path_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_scheme_filter);
+
+static struct kobj_type damon_sysfs_scheme_filter_ktype = {
+ .release = damon_sysfs_scheme_filter_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_scheme_filter_groups,
+};
+
+/*
+ * filters directory
+ */
+
+struct damon_sysfs_scheme_filters {
+ struct kobject kobj;
+ struct damon_sysfs_scheme_filter **filters_arr;
+ int nr;
+};
+
+static struct damon_sysfs_scheme_filters *
+damon_sysfs_scheme_filters_alloc(void)
+{
+ return kzalloc(sizeof(struct damon_sysfs_scheme_filters), GFP_KERNEL);
+}
+
+static void damon_sysfs_scheme_filters_rm_dirs(
+ struct damon_sysfs_scheme_filters *filters)
+{
+ struct damon_sysfs_scheme_filter **filters_arr = filters->filters_arr;
+ int i;
+
+ for (i = 0; i < filters->nr; i++)
+ kobject_put(&filters_arr[i]->kobj);
+ filters->nr = 0;
+ kfree(filters_arr);
+ filters->filters_arr = NULL;
+}
+
+static int damon_sysfs_scheme_filters_add_dirs(
+ struct damon_sysfs_scheme_filters *filters, int nr_filters)
+{
+ struct damon_sysfs_scheme_filter **filters_arr, *filter;
+ int err, i;
+
+ damon_sysfs_scheme_filters_rm_dirs(filters);
+ if (!nr_filters)
+ return 0;
+
+ filters_arr = kmalloc_array(nr_filters, sizeof(*filters_arr),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!filters_arr)
+ return -ENOMEM;
+ filters->filters_arr = filters_arr;
+
+ for (i = 0; i < nr_filters; i++) {
+ filter = damon_sysfs_scheme_filter_alloc();
+ if (!filter) {
+ damon_sysfs_scheme_filters_rm_dirs(filters);
+ return -ENOMEM;
+ }
+
+ err = kobject_init_and_add(&filter->kobj,
+ &damon_sysfs_scheme_filter_ktype,
+ &filters->kobj, "%d", i);
+ if (err) {
+ kobject_put(&filter->kobj);
+ damon_sysfs_scheme_filters_rm_dirs(filters);
+ return err;
+ }
+
+ filters_arr[i] = filter;
+ filters->nr++;
+ }
+ return 0;
+}
+
+static ssize_t nr_filters_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_scheme_filters *filters = container_of(kobj,
+ struct damon_sysfs_scheme_filters, kobj);
+
+ return sysfs_emit(buf, "%d\n", filters->nr);
+}
+
+static ssize_t nr_filters_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_scheme_filters *filters;
+ int nr, err = kstrtoint(buf, 0, &nr);
+
+ if (err)
+ return err;
+ if (nr < 0)
+ return -EINVAL;
+
+ filters = container_of(kobj, struct damon_sysfs_scheme_filters, kobj);
+
+ if (!mutex_trylock(&damon_sysfs_lock))
+ return -EBUSY;
+ err = damon_sysfs_scheme_filters_add_dirs(filters, nr);
+ mutex_unlock(&damon_sysfs_lock);
+ if (err)
+ return err;
+
+ return count;
+}
+
+static void damon_sysfs_scheme_filters_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_scheme_filters, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_scheme_filters_nr_attr =
+ __ATTR_RW_MODE(nr_filters, 0600);
+
+static struct attribute *damon_sysfs_scheme_filters_attrs[] = {
+ &damon_sysfs_scheme_filters_nr_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_scheme_filters);
+
+static struct kobj_type damon_sysfs_scheme_filters_ktype = {
+ .release = damon_sysfs_scheme_filters_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_scheme_filters_groups,
+};
+
+/*
* watermarks directory
*/
@@ -784,6 +1035,7 @@ struct damon_sysfs_scheme {
struct damon_sysfs_access_pattern *access_pattern;
struct damon_sysfs_quotas *quotas;
struct damon_sysfs_watermarks *watermarks;
+ struct damon_sysfs_scheme_filters *filters;
struct damon_sysfs_stats *stats;
struct damon_sysfs_scheme_regions *tried_regions;
};
@@ -878,6 +1130,24 @@ static int damon_sysfs_scheme_set_watermarks(struct damon_sysfs_scheme *scheme)
return err;
}
+static int damon_sysfs_scheme_set_filters(struct damon_sysfs_scheme *scheme)
+{
+ struct damon_sysfs_scheme_filters *filters =
+ damon_sysfs_scheme_filters_alloc();
+ int err;
+
+ if (!filters)
+ return -ENOMEM;
+ err = kobject_init_and_add(&filters->kobj,
+ &damon_sysfs_scheme_filters_ktype, &scheme->kobj,
+ "filters");
+ if (err)
+ kobject_put(&filters->kobj);
+ else
+ scheme->filters = filters;
+ return err;
+}
+
static int damon_sysfs_scheme_set_stats(struct damon_sysfs_scheme *scheme)
{
struct damon_sysfs_stats *stats = damon_sysfs_stats_alloc();
@@ -926,9 +1196,12 @@ static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme)
err = damon_sysfs_scheme_set_watermarks(scheme);
if (err)
goto put_quotas_access_pattern_out;
- err = damon_sysfs_scheme_set_stats(scheme);
+ err = damon_sysfs_scheme_set_filters(scheme);
if (err)
goto put_watermarks_quotas_access_pattern_out;
+ err = damon_sysfs_scheme_set_stats(scheme);
+ if (err)
+ goto put_filters_watermarks_quotas_access_pattern_out;
err = damon_sysfs_scheme_set_tried_regions(scheme);
if (err)
goto put_tried_regions_out;
@@ -937,6 +1210,9 @@ static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme)
put_tried_regions_out:
kobject_put(&scheme->tried_regions->kobj);
scheme->tried_regions = NULL;
+put_filters_watermarks_quotas_access_pattern_out:
+ kobject_put(&scheme->filters->kobj);
+ scheme->filters = NULL;
put_watermarks_quotas_access_pattern_out:
kobject_put(&scheme->watermarks->kobj);
scheme->watermarks = NULL;
@@ -956,6 +1232,8 @@ static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme)
damon_sysfs_quotas_rm_dirs(scheme->quotas);
kobject_put(&scheme->quotas->kobj);
kobject_put(&scheme->watermarks->kobj);
+ damon_sysfs_scheme_filters_rm_dirs(scheme->filters);
+ kobject_put(&scheme->filters->kobj);
kobject_put(&scheme->stats->kobj);
damon_sysfs_scheme_regions_rm_dirs(scheme->tried_regions);
kobject_put(&scheme->tried_regions->kobj);
@@ -1124,6 +1402,79 @@ struct kobj_type damon_sysfs_schemes_ktype = {
.default_groups = damon_sysfs_schemes_groups,
};
+static bool damon_sysfs_memcg_path_eq(struct mem_cgroup *memcg,
+ char *memcg_path_buf, char *path)
+{
+#ifdef CONFIG_MEMCG
+ cgroup_path(memcg->css.cgroup, memcg_path_buf, PATH_MAX);
+ if (sysfs_streq(memcg_path_buf, path))
+ return true;
+#endif /* CONFIG_MEMCG */
+ return false;
+}
+
+static int damon_sysfs_memcg_path_to_id(char *memcg_path, unsigned short *id)
+{
+ struct mem_cgroup *memcg;
+ char *path;
+ bool found = false;
+
+ if (!memcg_path)
+ return -EINVAL;
+
+ path = kmalloc(sizeof(*path) * PATH_MAX, GFP_KERNEL);
+ if (!path)
+ return -ENOMEM;
+
+ for (memcg = mem_cgroup_iter(NULL, NULL, NULL); memcg;
+ memcg = mem_cgroup_iter(NULL, memcg, NULL)) {
+ /* skip removed memcg */
+ if (!mem_cgroup_id(memcg))
+ continue;
+ if (damon_sysfs_memcg_path_eq(memcg, path, memcg_path)) {
+ *id = mem_cgroup_id(memcg);
+ found = true;
+ break;
+ }
+ }
+
+ kfree(path);
+ return found ? 0 : -EINVAL;
+}
+
+static int damon_sysfs_set_scheme_filters(struct damos *scheme,
+ struct damon_sysfs_scheme_filters *sysfs_filters)
+{
+ int i;
+ struct damos_filter *filter, *next;
+
+ damos_for_each_filter_safe(filter, next, scheme)
+ damos_destroy_filter(filter);
+
+ for (i = 0; i < sysfs_filters->nr; i++) {
+ struct damon_sysfs_scheme_filter *sysfs_filter =
+ sysfs_filters->filters_arr[i];
+ struct damos_filter *filter =
+ damos_new_filter(sysfs_filter->type,
+ sysfs_filter->matching);
+ int err;
+
+ if (!filter)
+ return -ENOMEM;
+ if (filter->type == DAMOS_FILTER_TYPE_MEMCG) {
+ err = damon_sysfs_memcg_path_to_id(
+ sysfs_filter->memcg_path,
+ &filter->memcg_id);
+ if (err) {
+ damos_destroy_filter(filter);
+ return err;
+ }
+ }
+ damos_add_filter(scheme, filter);
+ }
+ return 0;
+}
+
static struct damos *damon_sysfs_mk_scheme(
struct damon_sysfs_scheme *sysfs_scheme)
{
@@ -1132,6 +1483,10 @@ static struct damos *damon_sysfs_mk_scheme(
struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas;
struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights;
struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks;
+ struct damon_sysfs_scheme_filters *sysfs_filters =
+ sysfs_scheme->filters;
+ struct damos *scheme;
+ int err;
struct damos_access_pattern pattern = {
.min_sz_region = access_pattern->sz->min,
@@ -1157,8 +1512,17 @@ static struct damos *damon_sysfs_mk_scheme(
.low = sysfs_wmarks->low,
};
- return damon_new_scheme(&pattern, sysfs_scheme->action, &quota,
+ scheme = damon_new_scheme(&pattern, sysfs_scheme->action, &quota,
&wmarks);
+ if (!scheme)
+ return NULL;
+
+ err = damon_sysfs_set_scheme_filters(scheme, sysfs_filters);
+ if (err) {
+ damon_destroy_scheme(scheme);
+ return NULL;
+ }
+ return scheme;
}
static void damon_sysfs_update_scheme(struct damos *scheme,
@@ -1169,6 +1533,7 @@ static void damon_sysfs_update_scheme(struct damos *scheme,
struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas;
struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights;
struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks;
+ int err;
scheme->pattern.min_sz_region = access_pattern->sz->min;
scheme->pattern.max_sz_region = access_pattern->sz->max;
@@ -1191,6 +1556,10 @@ static void damon_sysfs_update_scheme(struct damos *scheme,
scheme->wmarks.high = sysfs_wmarks->high;
scheme->wmarks.mid = sysfs_wmarks->mid;
scheme->wmarks.low = sysfs_wmarks->low;
+
+ err = damon_sysfs_set_scheme_filters(scheme, sysfs_scheme->filters);
+ if (err)
+ damon_destroy_scheme(scheme);
}
int damon_sysfs_set_schemes(struct damon_ctx *ctx,
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index 15f03df66db6..1fec16d7263e 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -335,9 +335,9 @@ static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm,
{
bool referenced = false;
pte_t entry = huge_ptep_get(pte);
- struct page *page = pte_page(entry);
+ struct folio *folio = pfn_folio(pte_pfn(entry));
- get_page(page);
+ folio_get(folio);
if (pte_young(entry)) {
referenced = true;
@@ -352,10 +352,10 @@ static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm,
#endif /* CONFIG_MMU_NOTIFIER */
if (referenced)
- set_page_young(page);
+ folio_set_young(folio);
- set_page_idle(page);
- put_page(page);
+ folio_set_idle(folio);
+ folio_put(folio);
}
static int damon_mkold_hugetlb_entry(pte_t *pte, unsigned long hmask,
@@ -422,7 +422,8 @@ static void damon_va_prepare_access_checks(struct damon_ctx *ctx)
}
struct damon_young_walk_private {
- unsigned long *page_sz;
+ /* size of the folio for the access checked virtual memory address */
+ unsigned long *folio_sz;
bool young;
};
@@ -431,7 +432,7 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr,
{
pte_t *pte;
spinlock_t *ptl;
- struct page *page;
+ struct folio *folio;
struct damon_young_walk_private *priv = walk->private;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -446,16 +447,15 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr,
spin_unlock(ptl);
goto regular_page;
}
- page = damon_get_page(pmd_pfn(*pmd));
- if (!page)
+ folio = damon_get_folio(pmd_pfn(*pmd));
+ if (!folio)
goto huge_out;
- if (pmd_young(*pmd) || !page_is_idle(page) ||
+ if (pmd_young(*pmd) || !folio_test_idle(folio) ||
mmu_notifier_test_young(walk->mm,
- addr)) {
- *priv->page_sz = HPAGE_PMD_SIZE;
+ addr))
priv->young = true;
- }
- put_page(page);
+ *priv->folio_sz = HPAGE_PMD_SIZE;
+ folio_put(folio);
huge_out:
spin_unlock(ptl);
return 0;
@@ -469,15 +469,14 @@ regular_page:
pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
if (!pte_present(*pte))
goto out;
- page = damon_get_page(pte_pfn(*pte));
- if (!page)
+ folio = damon_get_folio(pte_pfn(*pte));
+ if (!folio)
goto out;
- if (pte_young(*pte) || !page_is_idle(page) ||
- mmu_notifier_test_young(walk->mm, addr)) {
- *priv->page_sz = PAGE_SIZE;
+ if (pte_young(*pte) || !folio_test_idle(folio) ||
+ mmu_notifier_test_young(walk->mm, addr))
priv->young = true;
- }
- put_page(page);
+ *priv->folio_sz = folio_size(folio);
+ folio_put(folio);
out:
pte_unmap_unlock(pte, ptl);
return 0;
@@ -490,7 +489,7 @@ static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask,
{
struct damon_young_walk_private *priv = walk->private;
struct hstate *h = hstate_vma(walk->vma);
- struct page *page;
+ struct folio *folio;
spinlock_t *ptl;
pte_t entry;
@@ -499,16 +498,15 @@ static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask,
if (!pte_present(entry))
goto out;
- page = pte_page(entry);
- get_page(page);
+ folio = pfn_folio(pte_pfn(entry));
+ folio_get(folio);
- if (pte_young(entry) || !page_is_idle(page) ||
- mmu_notifier_test_young(walk->mm, addr)) {
- *priv->page_sz = huge_page_size(h);
+ if (pte_young(entry) || !folio_test_idle(folio) ||
+ mmu_notifier_test_young(walk->mm, addr))
priv->young = true;
- }
+ *priv->folio_sz = huge_page_size(h);
- put_page(page);
+ folio_put(folio);
out:
spin_unlock(ptl);
@@ -524,10 +522,10 @@ static const struct mm_walk_ops damon_young_ops = {
};
static bool damon_va_young(struct mm_struct *mm, unsigned long addr,
- unsigned long *page_sz)
+ unsigned long *folio_sz)
{
struct damon_young_walk_private arg = {
- .page_sz = page_sz,
+ .folio_sz = folio_sz,
.young = false,
};
@@ -547,18 +545,18 @@ static void __damon_va_check_access(struct mm_struct *mm,
struct damon_region *r, bool same_target)
{
static unsigned long last_addr;
- static unsigned long last_page_sz = PAGE_SIZE;
+ static unsigned long last_folio_sz = PAGE_SIZE;
static bool last_accessed;
/* If the region is in the last checked page, reuse the result */
- if (same_target && (ALIGN_DOWN(last_addr, last_page_sz) ==
- ALIGN_DOWN(r->sampling_addr, last_page_sz))) {
+ if (same_target && (ALIGN_DOWN(last_addr, last_folio_sz) ==
+ ALIGN_DOWN(r->sampling_addr, last_folio_sz))) {
if (last_accessed)
r->nr_accesses++;
return;
}
- last_accessed = damon_va_young(mm, r->sampling_addr, &last_page_sz);
+ last_accessed = damon_va_young(mm, r->sampling_addr, &last_folio_sz);
if (last_accessed)
r->nr_accesses++;
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index c631ade3f1d2..bb3328f46126 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -15,6 +15,7 @@
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/kconfig.h>
+#include <linux/memblock.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/mm_types.h>
@@ -80,6 +81,7 @@ struct pgtable_debug_args {
unsigned long pmd_pfn;
unsigned long pte_pfn;
+ unsigned long fixed_alignment;
unsigned long fixed_pgd_pfn;
unsigned long fixed_p4d_pfn;
unsigned long fixed_pud_pfn;
@@ -430,7 +432,8 @@ static void __init pmd_huge_tests(struct pgtable_debug_args *args)
{
pmd_t pmd;
- if (!arch_vmap_pmd_supported(args->page_prot))
+ if (!arch_vmap_pmd_supported(args->page_prot) ||
+ args->fixed_alignment < PMD_SIZE)
return;
pr_debug("Validating PMD huge\n");
@@ -449,7 +452,8 @@ static void __init pud_huge_tests(struct pgtable_debug_args *args)
{
pud_t pud;
- if (!arch_vmap_pud_supported(args->page_prot))
+ if (!arch_vmap_pud_supported(args->page_prot) ||
+ args->fixed_alignment < PUD_SIZE)
return;
pr_debug("Validating PUD huge\n");
@@ -1077,10 +1081,85 @@ debug_vm_pgtable_alloc_huge_page(struct pgtable_debug_args *args, int order)
return page;
}
+/*
+ * Check if a physical memory range described by <pstart, pend> contains
+ * an area that is of size psize, and aligned to psize.
+ *
+ * Don't use address 0, an all-zeroes physical address might mask bugs, and
+ * it's not used on x86.
+ */
+static void __init phys_align_check(phys_addr_t pstart,
+ phys_addr_t pend, unsigned long psize,
+ phys_addr_t *physp, unsigned long *alignp)
+{
+ phys_addr_t aligned_start, aligned_end;
+
+ if (pstart == 0)
+ pstart = PAGE_SIZE;
+
+ aligned_start = ALIGN(pstart, psize);
+ aligned_end = aligned_start + psize;
+
+ if (aligned_end > aligned_start && aligned_end <= pend) {
+ *alignp = psize;
+ *physp = aligned_start;
+ }
+}
+
+static void __init init_fixed_pfns(struct pgtable_debug_args *args)
+{
+ u64 idx;
+ phys_addr_t phys, pstart, pend;
+
+ /*
+ * Initialize the fixed pfns. To do this, try to find a
+ * valid physical range, preferably aligned to PUD_SIZE,
+ * but settling for aligned to PMD_SIZE as a fallback. If
+ * neither of those is found, use the physical address of
+ * the start_kernel symbol.
+ *
+ * The memory doesn't need to be allocated, it just needs to exist
+ * as usable memory. It won't be touched.
+ *
+ * The alignment is recorded, and can be checked to see if we
+ * can run the tests that require an actual valid physical
+ * address range on some architectures ({pmd,pud}_huge_test
+ * on x86).
+ */
+
+ phys = __pa_symbol(&start_kernel);
+ args->fixed_alignment = PAGE_SIZE;
+
+ for_each_mem_range(idx, &pstart, &pend) {
+ /* First check for a PUD-aligned area */
+ phys_align_check(pstart, pend, PUD_SIZE, &phys,
+ &args->fixed_alignment);
+
+ /* If a PUD-aligned area is found, we're done */
+ if (args->fixed_alignment == PUD_SIZE)
+ break;
+
+ /*
+ * If no PMD-aligned area found yet, check for one,
+ * but continue the loop to look for a PUD-aligned area.
+ */
+ if (args->fixed_alignment < PMD_SIZE)
+ phys_align_check(pstart, pend, PMD_SIZE, &phys,
+ &args->fixed_alignment);
+ }
+
+ args->fixed_pgd_pfn = __phys_to_pfn(phys & PGDIR_MASK);
+ args->fixed_p4d_pfn = __phys_to_pfn(phys & P4D_MASK);
+ args->fixed_pud_pfn = __phys_to_pfn(phys & PUD_MASK);
+ args->fixed_pmd_pfn = __phys_to_pfn(phys & PMD_MASK);
+ args->fixed_pte_pfn = __phys_to_pfn(phys & PAGE_MASK);
+ WARN_ON(!pfn_valid(args->fixed_pte_pfn));
+}
+
+
static int __init init_args(struct pgtable_debug_args *args)
{
struct page *page = NULL;
- phys_addr_t phys;
int ret = 0;
/*
@@ -1160,22 +1239,7 @@ static int __init init_args(struct pgtable_debug_args *args)
args->start_ptep = pmd_pgtable(READ_ONCE(*args->pmdp));
WARN_ON(!args->start_ptep);
- /*
- * PFN for mapping at PTE level is determined from a standard kernel
- * text symbol. But pfns for higher page table levels are derived by
- * masking lower bits of this real pfn. These derived pfns might not
- * exist on the platform but that does not really matter as pfn_pxx()
- * helpers will still create appropriate entries for the test. This
- * helps avoid large memory block allocations to be used for mapping
- * at higher page table levels in some of the tests.
- */
- phys = __pa_symbol(&start_kernel);
- args->fixed_pgd_pfn = __phys_to_pfn(phys & PGDIR_MASK);
- args->fixed_p4d_pfn = __phys_to_pfn(phys & P4D_MASK);
- args->fixed_pud_pfn = __phys_to_pfn(phys & PUD_MASK);
- args->fixed_pmd_pfn = __phys_to_pfn(phys & PMD_MASK);
- args->fixed_pte_pfn = __phys_to_pfn(phys & PAGE_MASK);
- WARN_ON(!pfn_valid(args->fixed_pte_pfn));
+ init_fixed_pfns(args);
/*
* Allocate (huge) pages because some of the tests need to access
diff --git a/mm/fadvise.c b/mm/fadvise.c
index bf04fec87f35..fb7c5f43fd2a 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -80,7 +80,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
case POSIX_FADV_NORMAL:
file->f_ra.ra_pages = bdi->ra_pages;
spin_lock(&file->f_lock);
- file->f_mode &= ~FMODE_RANDOM;
+ file->f_mode &= ~(FMODE_RANDOM | FMODE_NOREUSE);
spin_unlock(&file->f_lock);
break;
case POSIX_FADV_RANDOM:
@@ -107,6 +107,9 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
force_page_cache_readahead(mapping, file, start_index, nrpages);
break;
case POSIX_FADV_NOREUSE:
+ spin_lock(&file->f_lock);
+ file->f_mode |= FMODE_NOREUSE;
+ spin_unlock(&file->f_lock);
break;
case POSIX_FADV_DONTNEED:
__filemap_fdatawrite_range(mapping, offset, endbyte,
diff --git a/mm/hmm.c b/mm/hmm.c
index 601a99ce3c84..6a151c09de5e 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -492,8 +492,21 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
required_fault =
hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags);
if (required_fault) {
+ int ret;
+
spin_unlock(ptl);
- return hmm_vma_fault(addr, end, required_fault, walk);
+ hugetlb_vma_unlock_read(vma);
+ /*
+ * Avoid deadlock: drop the vma lock before calling
+ * hmm_vma_fault(), which will itself potentially take and
+ * drop the vma lock. This is also correct from a
+ * protection point of view, because there is no further
+ * use here of either pte or ptl after dropping the vma
+ * lock.
+ */
+ ret = hmm_vma_fault(addr, end, required_fault, walk);
+ hugetlb_vma_lock_read(vma);
+ return ret;
}
pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index abe6cfd92ffa..7e68a36b4f7d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1603,7 +1603,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
{
spinlock_t *ptl;
pmd_t orig_pmd;
- struct page *page;
+ struct folio *folio;
struct mm_struct *mm = tlb->mm;
bool ret = false;
@@ -1623,15 +1623,15 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
goto out;
}
- page = pmd_page(orig_pmd);
+ folio = pfn_folio(pmd_pfn(orig_pmd));
/*
- * If other processes are mapping this page, we couldn't discard
- * the page unless they all do MADV_FREE so let's skip the page.
+ * If other processes are mapping this folio, we couldn't discard
+ * the folio unless they all do MADV_FREE so let's skip the folio.
*/
- if (total_mapcount(page) != 1)
+ if (folio_mapcount(folio) != 1)
goto out;
- if (!trylock_page(page))
+ if (!folio_trylock(folio))
goto out;
/*
@@ -1639,17 +1639,17 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
* will deactivate only them.
*/
if (next - addr != HPAGE_PMD_SIZE) {
- get_page(page);
+ folio_get(folio);
spin_unlock(ptl);
- split_huge_page(page);
- unlock_page(page);
- put_page(page);
+ split_folio(folio);
+ folio_unlock(folio);
+ folio_put(folio);
goto out_unlocked;
}
- if (PageDirty(page))
- ClearPageDirty(page);
- unlock_page(page);
+ if (folio_test_dirty(folio))
+ folio_clear_dirty(folio);
+ folio_unlock(folio);
if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
pmdp_invalidate(vma, addr, pmd);
@@ -1660,7 +1660,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
}
- mark_page_lazyfree(page);
+ folio_mark_lazyfree(folio);
ret = true;
out:
spin_unlock(ptl);
@@ -1920,17 +1920,15 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
oldpmd = pmdp_invalidate_ad(vma, addr, pmd);
entry = pmd_modify(oldpmd, newprot);
- if (uffd_wp) {
- entry = pmd_wrprotect(entry);
+ if (uffd_wp)
entry = pmd_mkuffd_wp(entry);
- } else if (uffd_wp_resolve) {
+ else if (uffd_wp_resolve)
/*
* Leave the write bit to be handled by PF interrupt
* handler, then things like COW could be properly
* handled.
*/
entry = pmd_clear_uffd_wp(entry);
- }
/* See change_pte_range(). */
if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) &&
@@ -2837,6 +2835,9 @@ void deferred_split_huge_page(struct page *page)
if (PageSwapCache(page))
return;
+ if (!list_empty(page_deferred_list(page)))
+ return;
+
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
if (list_empty(page_deferred_list(page))) {
count_vm_event(THP_DEFERRED_SPLIT_PAGE);
@@ -2934,6 +2935,7 @@ static void split_huge_pages_all(void)
{
struct zone *zone;
struct page *page;
+ struct folio *folio;
unsigned long pfn, max_zone_pfn;
unsigned long total = 0, split = 0;
@@ -2946,24 +2948,32 @@ static void split_huge_pages_all(void)
int nr_pages;
page = pfn_to_online_page(pfn);
- if (!page || !get_page_unless_zero(page))
+ if (!page || PageTail(page))
+ continue;
+ folio = page_folio(page);
+ if (!folio_try_get(folio))
continue;
- if (zone != page_zone(page))
+ if (unlikely(page_folio(page) != folio))
goto next;
- if (!PageHead(page) || PageHuge(page) || !PageLRU(page))
+ if (zone != folio_zone(folio))
+ goto next;
+
+ if (!folio_test_large(folio)
+ || folio_test_hugetlb(folio)
+ || !folio_test_lru(folio))
goto next;
total++;
- lock_page(page);
- nr_pages = thp_nr_pages(page);
- if (!split_huge_page(page))
+ folio_lock(folio);
+ nr_pages = folio_nr_pages(folio);
+ if (!split_folio(folio))
split++;
pfn += nr_pages - 1;
- unlock_page(page);
+ folio_unlock(folio);
next:
- put_page(page);
+ folio_put(folio);
cond_resched();
}
}
@@ -3275,7 +3285,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
if (is_writable_migration_entry(entry))
pmde = maybe_pmd_mkwrite(pmde, vma);
if (pmd_swp_uffd_wp(*pvmw->pmd))
- pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde));
+ pmde = pmd_mkuffd_wp(pmde);
if (!is_migration_entry_young(entry))
pmde = pmd_mkold(pmde);
/* NOTE: this may contain setting soft-dirty on some archs */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bdbfeb6fb393..6fe65f14d33b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -260,12 +260,6 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
/*
* hugetlb vma_lock helper routines
*/
-static bool __vma_shareable_lock(struct vm_area_struct *vma)
-{
- return vma->vm_flags & (VM_MAYSHARE | VM_SHARED) &&
- vma->vm_private_data;
-}
-
void hugetlb_vma_lock_read(struct vm_area_struct *vma)
{
if (__vma_shareable_lock(vma)) {
@@ -1492,7 +1486,7 @@ static void __destroy_compound_gigantic_folio(struct folio *folio,
set_page_refcounted(p);
}
- folio_set_compound_order(folio, 0);
+ folio_set_order(folio, 0);
__folio_clear_head(folio);
}
@@ -1956,7 +1950,7 @@ static bool __prep_compound_gigantic_folio(struct folio *folio,
__folio_clear_reserved(folio);
__folio_set_head(folio);
/* we rely on prep_new_hugetlb_folio to set the destructor */
- folio_set_compound_order(folio, order);
+ folio_set_order(folio, order);
for (i = 0; i < nr_pages; i++) {
p = folio_page(folio, i);
@@ -2020,7 +2014,7 @@ out_error:
p = folio_page(folio, j);
__ClearPageReserved(p);
}
- folio_set_compound_order(folio, 0);
+ folio_set_order(folio, 0);
__folio_clear_head(folio);
return false;
}
@@ -4981,7 +4975,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
} else {
/*
* For shared mappings the vma lock must be held before
- * calling huge_pte_offset in the src vma. Otherwise, the
+ * calling hugetlb_walk() in the src vma. Otherwise, the
* returned ptep could go away if part of a shared pmd and
* another thread calls huge_pmd_unshare.
*/
@@ -4991,7 +4985,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
last_addr_mask = hugetlb_mask_last_page(h);
for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
spinlock_t *src_ptl, *dst_ptl;
- src_pte = huge_pte_offset(src, addr, sz);
+ src_pte = hugetlb_walk(src_vma, addr, sz);
if (!src_pte) {
addr |= last_addr_mask;
continue;
@@ -5198,7 +5192,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
hugetlb_vma_lock_write(vma);
i_mmap_lock_write(mapping);
for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
- src_pte = huge_pte_offset(mm, old_addr, sz);
+ src_pte = hugetlb_walk(vma, old_addr, sz);
if (!src_pte) {
old_addr |= last_addr_mask;
new_addr |= last_addr_mask;
@@ -5261,7 +5255,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
last_addr_mask = hugetlb_mask_last_page(h);
address = start;
for (; address < end; address += sz) {
- ptep = huge_pte_offset(mm, address, sz);
+ ptep = hugetlb_walk(vma, address, sz);
if (!ptep) {
address |= last_addr_mask;
continue;
@@ -5574,7 +5568,7 @@ retry_avoidcopy:
mutex_lock(&hugetlb_fault_mutex_table[hash]);
hugetlb_vma_lock_read(vma);
spin_lock(ptl);
- ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
+ ptep = hugetlb_walk(vma, haddr, huge_page_size(h));
if (likely(ptep &&
pte_same(huge_ptep_get(ptep), pte)))
goto retry_avoidcopy;
@@ -5612,7 +5606,7 @@ retry_avoidcopy:
* before the page tables are altered
*/
spin_lock(ptl);
- ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
+ ptep = hugetlb_walk(vma, haddr, huge_page_size(h));
if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
/* Break COW or unshare */
huge_ptep_clear_flush(vma, haddr, ptep);
@@ -5919,7 +5913,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
* if populated.
*/
if (unlikely(pte_marker_uffd_wp(old_pte)))
- new_pte = huge_pte_wrprotect(huge_pte_mkuffd_wp(new_pte));
+ new_pte = huge_pte_mkuffd_wp(new_pte);
set_huge_pte_at(mm, haddr, ptep, new_pte);
hugetlb_count_add(pages_per_huge_page(h), mm);
@@ -5994,22 +5988,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
int need_wait_lock = 0;
unsigned long haddr = address & huge_page_mask(h);
- ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
- if (ptep) {
- /*
- * Since we hold no locks, ptep could be stale. That is
- * OK as we are only making decisions based on content and
- * not actually modifying content here.
- */
- entry = huge_ptep_get(ptep);
- if (unlikely(is_hugetlb_entry_migration(entry))) {
- migration_entry_wait_huge(vma, ptep);
- return 0;
- } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
- return VM_FAULT_HWPOISON_LARGE |
- VM_FAULT_SET_HINDEX(hstate_index(h));
- }
-
/*
* Serialize hugepage allocation and instantiation, so that we don't
* get spurious allocation failures if two CPUs race to instantiate
@@ -6024,10 +6002,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* Acquire vma lock before calling huge_pte_alloc and hold
* until finished with ptep. This prevents huge_pmd_unshare from
* being called elsewhere and making the ptep no longer valid.
- *
- * ptep could have already be assigned via huge_pte_offset. That
- * is OK, as huge_pte_alloc will return the same value unless
- * something has changed.
*/
hugetlb_vma_lock_read(vma);
ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
@@ -6056,8 +6030,23 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* fault, and is_hugetlb_entry_(migration|hwpoisoned) check will
* properly handle it.
*/
- if (!pte_present(entry))
+ if (!pte_present(entry)) {
+ if (unlikely(is_hugetlb_entry_migration(entry))) {
+ /*
+ * Release the hugetlb fault lock now, but retain
+ * the vma lock, because it is needed to guard the
+ * huge_pte_lockptr() later in
+ * migration_entry_wait_huge(). The vma lock will
+ * be released there.
+ */
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ migration_entry_wait_huge(vma, ptep);
+ return 0;
+ } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+ ret = VM_FAULT_HWPOISON_LARGE |
+ VM_FAULT_SET_HINDEX(hstate_index(h));
goto out_mutex;
+ }
/*
* If we are going to COW/unshare the mapping later, we examine the
@@ -6402,10 +6391,10 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
if (WARN_ON_ONCE(flags & FOLL_PIN))
return NULL;
-retry:
- pte = huge_pte_offset(mm, haddr, huge_page_size(h));
+ hugetlb_vma_lock_read(vma);
+ pte = hugetlb_walk(vma, haddr, huge_page_size(h));
if (!pte)
- return NULL;
+ goto out_unlock;
ptl = huge_pte_lock(h, mm, pte);
entry = huge_ptep_get(pte);
@@ -6425,19 +6414,11 @@ retry:
page = NULL;
goto out;
}
- } else {
- if (is_hugetlb_entry_migration(entry)) {
- spin_unlock(ptl);
- __migration_entry_wait_huge(pte, ptl);
- goto retry;
- }
- /*
- * hwpoisoned entry is treated as no_page_table in
- * follow_page_mask().
- */
}
out:
spin_unlock(ptl);
+out_unlock:
+ hugetlb_vma_unlock_read(vma);
return page;
}
@@ -6468,6 +6449,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
break;
}
+ hugetlb_vma_lock_read(vma);
/*
* Some archs (sparc64, sh*) have multiple pte_ts to
* each hugepage. We have to make sure we get the
@@ -6475,8 +6457,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
*
* Note that page table lock is not held when pte is null.
*/
- pte = huge_pte_offset(mm, vaddr & huge_page_mask(h),
- huge_page_size(h));
+ pte = hugetlb_walk(vma, vaddr & huge_page_mask(h),
+ huge_page_size(h));
if (pte)
ptl = huge_pte_lock(h, mm, pte);
absent = !pte || huge_pte_none(huge_ptep_get(pte));
@@ -6492,6 +6474,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
!hugetlbfs_pagecache_present(h, vma, vaddr)) {
if (pte)
spin_unlock(ptl);
+ hugetlb_vma_unlock_read(vma);
remainder = 0;
break;
}
@@ -6513,6 +6496,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (pte)
spin_unlock(ptl);
+ hugetlb_vma_unlock_read(vma);
+
if (flags & FOLL_WRITE)
fault_flags |= FAULT_FLAG_WRITE;
else if (unshare)
@@ -6575,6 +6560,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
remainder -= pages_per_huge_page(h);
i += pages_per_huge_page(h);
spin_unlock(ptl);
+ hugetlb_vma_unlock_read(vma);
continue;
}
@@ -6604,6 +6590,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (WARN_ON_ONCE(!try_grab_folio(pages[i], refs,
flags))) {
spin_unlock(ptl);
+ hugetlb_vma_unlock_read(vma);
remainder = 0;
err = -ENOMEM;
break;
@@ -6615,6 +6602,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
i += refs;
spin_unlock(ptl);
+ hugetlb_vma_unlock_read(vma);
}
*nr_pages = remainder;
/*
@@ -6627,7 +6615,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
return i ? i : err;
}
-unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
+long hugetlb_change_protection(struct vm_area_struct *vma,
unsigned long address, unsigned long end,
pgprot_t newprot, unsigned long cp_flags)
{
@@ -6636,7 +6624,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
pte_t *ptep;
pte_t pte;
struct hstate *h = hstate_vma(vma);
- unsigned long pages = 0, psize = huge_page_size(h);
+ long pages = 0, psize = huge_page_size(h);
bool shared_pmd = false;
struct mmu_notifier_range range;
unsigned long last_addr_mask;
@@ -6661,7 +6649,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
last_addr_mask = hugetlb_mask_last_page(h);
for (; address < end; address += psize) {
spinlock_t *ptl;
- ptep = huge_pte_offset(mm, address, psize);
+ ptep = hugetlb_walk(vma, address, psize);
if (!ptep) {
if (!uffd_wp) {
address |= last_addr_mask;
@@ -6672,8 +6660,10 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
* pre-allocations to install pte markers.
*/
ptep = huge_pte_alloc(mm, vma, address, psize);
- if (!ptep)
+ if (!ptep) {
+ pages = -ENOMEM;
break;
+ }
}
ptl = huge_pte_lock(h, mm, ptep);
if (huge_pmd_unshare(mm, vma, address, ptep)) {
@@ -6728,7 +6718,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
pte = huge_pte_modify(old_pte, newprot);
pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
if (uffd_wp)
- pte = huge_pte_mkuffd_wp(huge_pte_wrprotect(pte));
+ pte = huge_pte_mkuffd_wp(pte);
else if (uffd_wp_resolve)
pte = huge_pte_clear_uffd_wp(pte);
huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
@@ -6763,7 +6753,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
hugetlb_vma_unlock_write(vma);
mmu_notifier_invalidate_range_end(&range);
- return pages << h->order;
+ return pages > 0 ? (pages << h->order) : pages;
}
/* Return true if reservation was successful, false otherwise. */
@@ -6772,7 +6762,7 @@ bool hugetlb_reserve_pages(struct inode *inode,
struct vm_area_struct *vma,
vm_flags_t vm_flags)
{
- long chg, add = -1;
+ long chg = -1, add = -1;
struct hstate *h = hstate_inode(inode);
struct hugepage_subpool *spool = subpool_inode(inode);
struct resv_map *resv_map;
@@ -7071,8 +7061,8 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
saddr = page_table_shareable(svma, vma, addr, idx);
if (saddr) {
- spte = huge_pte_offset(svma->vm_mm, saddr,
- vma_mmu_pagesize(svma));
+ spte = hugetlb_walk(svma, saddr,
+ vma_mmu_pagesize(svma));
if (spte) {
get_page(virt_to_page(spte));
break;
@@ -7384,7 +7374,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
hugetlb_vma_lock_write(vma);
i_mmap_lock_write(vma->vm_file->f_mapping);
for (address = start; address < end; address += PUD_SIZE) {
- ptep = huge_pte_offset(mm, address, sz);
+ ptep = hugetlb_walk(vma, address, sz);
if (!ptep)
continue;
ptl = huge_pte_lock(h, mm, ptep);
diff --git a/mm/internal.h b/mm/internal.h
index bcf75a8b032d..1d6f4e168510 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -378,6 +378,25 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
int split_free_page(struct page *free_page,
unsigned int order, unsigned long split_pfn_offset);
+/*
+ * This will have no effect, other than possibly generating a warning, if the
+ * caller passes in a non-large folio.
+ */
+static inline void folio_set_order(struct folio *folio, unsigned int order)
+{
+ if (WARN_ON_ONCE(!folio_test_large(folio)))
+ return;
+
+ folio->_folio_order = order;
+#ifdef CONFIG_64BIT
+ /*
+ * When hugetlb dissolves a folio, we need to clear the tail
+ * page, rather than setting nr_pages to 1.
+ */
+ folio->_folio_nr_pages = order ? 1U << order : 0;
+#endif
+}
+
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
/*
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 833bf2cfd2a3..6b8e9c848573 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -95,19 +95,24 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark)
}
#endif /* CONFIG_KASAN_STACK */
-void __kasan_unpoison_pages(struct page *page, unsigned int order, bool init)
+bool __kasan_unpoison_pages(struct page *page, unsigned int order, bool init)
{
u8 tag;
unsigned long i;
if (unlikely(PageHighMem(page)))
- return;
+ return false;
+
+ if (!kasan_sample_page_alloc(order))
+ return false;
tag = kasan_random_tag();
kasan_unpoison(set_tag(page_address(page), tag),
PAGE_SIZE << order, init);
for (i = 0; i < (1 << order); i++)
page_kasan_tag_set(page + i, tag);
+
+ return true;
}
void __kasan_poison_pages(struct page *page, unsigned int order, bool init)
@@ -117,11 +122,6 @@ void __kasan_poison_pages(struct page *page, unsigned int order, bool init)
KASAN_PAGE_FREE, init);
}
-void __kasan_cache_create_kmalloc(struct kmem_cache *cache)
-{
- cache->kasan_info.is_kmalloc = true;
-}
-
void __kasan_poison_slab(struct slab *slab)
{
struct page *page = slab_page(slab);
@@ -321,7 +321,7 @@ void * __must_check __kasan_slab_alloc(struct kmem_cache *cache,
kasan_unpoison(tagged_object, cache->object_size, init);
/* Save alloc info (if possible) for non-kmalloc() allocations. */
- if (kasan_stack_collection_enabled() && !cache->kasan_info.is_kmalloc)
+ if (kasan_stack_collection_enabled() && !is_kmalloc_cache(cache))
kasan_save_alloc_info(cache, tagged_object, flags);
return tagged_object;
@@ -367,7 +367,7 @@ static inline void *____kasan_kmalloc(struct kmem_cache *cache,
* Save alloc info (if possible) for kmalloc() allocations.
* This also rewrites the alloc info when called from kasan_krealloc().
*/
- if (kasan_stack_collection_enabled() && cache->kasan_info.is_kmalloc)
+ if (kasan_stack_collection_enabled() && is_kmalloc_cache(cache))
kasan_save_alloc_info(cache, (void *)object, flags);
/* Keep the tag that was set by kasan_slab_alloc(). */
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c
index b22c4f461cb0..d1bcb0205327 100644
--- a/mm/kasan/hw_tags.c
+++ b/mm/kasan/hw_tags.c
@@ -59,6 +59,24 @@ EXPORT_SYMBOL_GPL(kasan_mode);
/* Whether to enable vmalloc tagging. */
DEFINE_STATIC_KEY_TRUE(kasan_flag_vmalloc);
+#define PAGE_ALLOC_SAMPLE_DEFAULT 1
+#define PAGE_ALLOC_SAMPLE_ORDER_DEFAULT 3
+
+/*
+ * Sampling interval of page_alloc allocation (un)poisoning.
+ * Defaults to no sampling.
+ */
+unsigned long kasan_page_alloc_sample = PAGE_ALLOC_SAMPLE_DEFAULT;
+
+/*
+ * Minimum order of page_alloc allocations to be affected by sampling.
+ * The default value is chosen to match both
+ * PAGE_ALLOC_COSTLY_ORDER and SKB_FRAG_PAGE_ORDER.
+ */
+unsigned int kasan_page_alloc_sample_order = PAGE_ALLOC_SAMPLE_ORDER_DEFAULT;
+
+DEFINE_PER_CPU(long, kasan_page_alloc_skip);
+
/* kasan=off/on */
static int __init early_kasan_flag(char *arg)
{
@@ -122,6 +140,48 @@ static inline const char *kasan_mode_info(void)
return "sync";
}
+/* kasan.page_alloc.sample=<sampling interval> */
+static int __init early_kasan_flag_page_alloc_sample(char *arg)
+{
+ int rv;
+
+ if (!arg)
+ return -EINVAL;
+
+ rv = kstrtoul(arg, 0, &kasan_page_alloc_sample);
+ if (rv)
+ return rv;
+
+ if (!kasan_page_alloc_sample || kasan_page_alloc_sample > LONG_MAX) {
+ kasan_page_alloc_sample = PAGE_ALLOC_SAMPLE_DEFAULT;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+early_param("kasan.page_alloc.sample", early_kasan_flag_page_alloc_sample);
+
+/* kasan.page_alloc.sample.order=<minimum page order> */
+static int __init early_kasan_flag_page_alloc_sample_order(char *arg)
+{
+ int rv;
+
+ if (!arg)
+ return -EINVAL;
+
+ rv = kstrtouint(arg, 0, &kasan_page_alloc_sample_order);
+ if (rv)
+ return rv;
+
+ if (kasan_page_alloc_sample_order > INT_MAX) {
+ kasan_page_alloc_sample_order = PAGE_ALLOC_SAMPLE_ORDER_DEFAULT;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+early_param("kasan.page_alloc.sample.order", early_kasan_flag_page_alloc_sample_order);
+
/*
* kasan_init_hw_tags_cpu() is called for each CPU.
* Not marked as __init as a CPU can be hot-plugged after boot.
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index ea8cf1310b1e..32413f22aa82 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -42,6 +42,10 @@ enum kasan_mode {
extern enum kasan_mode kasan_mode __ro_after_init;
+extern unsigned long kasan_page_alloc_sample;
+extern unsigned int kasan_page_alloc_sample_order;
+DECLARE_PER_CPU(long, kasan_page_alloc_skip);
+
static inline bool kasan_vmalloc_enabled(void)
{
return static_branch_likely(&kasan_flag_vmalloc);
@@ -57,6 +61,24 @@ static inline bool kasan_sync_fault_possible(void)
return kasan_mode == KASAN_MODE_SYNC || kasan_mode == KASAN_MODE_ASYMM;
}
+static inline bool kasan_sample_page_alloc(unsigned int order)
+{
+ /* Fast-path for when sampling is disabled. */
+ if (kasan_page_alloc_sample == 1)
+ return true;
+
+ if (order < kasan_page_alloc_sample_order)
+ return true;
+
+ if (this_cpu_dec_return(kasan_page_alloc_skip) < 0) {
+ this_cpu_write(kasan_page_alloc_skip,
+ kasan_page_alloc_sample - 1);
+ return true;
+ }
+
+ return false;
+}
+
#else /* CONFIG_KASAN_HW_TAGS */
static inline bool kasan_async_fault_possible(void)
@@ -69,6 +91,11 @@ static inline bool kasan_sync_fault_possible(void)
return true;
}
+static inline bool kasan_sample_page_alloc(unsigned int order)
+{
+ return true;
+}
+
#endif /* CONFIG_KASAN_HW_TAGS */
#ifdef CONFIG_KASAN_GENERIC
diff --git a/mm/madvise.c b/mm/madvise.c
index b6ea204d4e23..e407d335e614 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -345,8 +345,8 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
struct vm_area_struct *vma = walk->vma;
pte_t *orig_pte, *pte, ptent;
spinlock_t *ptl;
- struct page *page = NULL;
- LIST_HEAD(page_list);
+ struct folio *folio = NULL;
+ LIST_HEAD(folio_list);
bool pageout_anon_only_filter;
if (fatal_signal_pending(current))
@@ -375,26 +375,26 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
goto huge_unlock;
}
- page = pmd_page(orig_pmd);
+ folio = pfn_folio(pmd_pfn(orig_pmd));
- /* Do not interfere with other mappings of this page */
- if (page_mapcount(page) != 1)
+ /* Do not interfere with other mappings of this folio */
+ if (folio_mapcount(folio) != 1)
goto huge_unlock;
- if (pageout_anon_only_filter && !PageAnon(page))
+ if (pageout_anon_only_filter && !folio_test_anon(folio))
goto huge_unlock;
if (next - addr != HPAGE_PMD_SIZE) {
int err;
- get_page(page);
+ folio_get(folio);
spin_unlock(ptl);
- lock_page(page);
- err = split_huge_page(page);
- unlock_page(page);
- put_page(page);
+ folio_lock(folio);
+ err = split_folio(folio);
+ folio_unlock(folio);
+ folio_put(folio);
if (!err)
- goto regular_page;
+ goto regular_folio;
return 0;
}
@@ -406,25 +406,25 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
}
- ClearPageReferenced(page);
- test_and_clear_page_young(page);
+ folio_clear_referenced(folio);
+ folio_test_clear_young(folio);
if (pageout) {
- if (!isolate_lru_page(page)) {
- if (PageUnevictable(page))
- putback_lru_page(page);
+ if (!folio_isolate_lru(folio)) {
+ if (folio_test_unevictable(folio))
+ folio_putback_lru(folio);
else
- list_add(&page->lru, &page_list);
+ list_add(&folio->lru, &folio_list);
}
} else
- deactivate_page(page);
+ folio_deactivate(folio);
huge_unlock:
spin_unlock(ptl);
if (pageout)
- reclaim_pages(&page_list);
+ reclaim_pages(&folio_list);
return 0;
}
-regular_page:
+regular_folio:
if (pmd_trans_unstable(pmd))
return 0;
#endif
@@ -441,33 +441,33 @@ regular_page:
if (!pte_present(ptent))
continue;
- page = vm_normal_page(vma, addr, ptent);
- if (!page || is_zone_device_page(page))
+ folio = vm_normal_folio(vma, addr, ptent);
+ if (!folio || folio_is_zone_device(folio))
continue;
/*
* Creating a THP page is expensive so split it only if we
* are sure it's worth. Split it if we are only owner.
*/
- if (PageTransCompound(page)) {
- if (page_mapcount(page) != 1)
+ if (folio_test_large(folio)) {
+ if (folio_mapcount(folio) != 1)
break;
- if (pageout_anon_only_filter && !PageAnon(page))
+ if (pageout_anon_only_filter && !folio_test_anon(folio))
break;
- get_page(page);
- if (!trylock_page(page)) {
- put_page(page);
+ folio_get(folio);
+ if (!folio_trylock(folio)) {
+ folio_put(folio);
break;
}
pte_unmap_unlock(orig_pte, ptl);
- if (split_huge_page(page)) {
- unlock_page(page);
- put_page(page);
+ if (split_folio(folio)) {
+ folio_unlock(folio);
+ folio_put(folio);
orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
break;
}
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
pte--;
addr -= PAGE_SIZE;
@@ -475,16 +475,16 @@ regular_page:
}
/*
- * Do not interfere with other mappings of this page and
- * non-LRU page.
+ * Do not interfere with other mappings of this folio and
+ * non-LRU folio.
*/
- if (!PageLRU(page) || page_mapcount(page) != 1)
+ if (!folio_test_lru(folio) || folio_mapcount(folio) != 1)
continue;
- if (pageout_anon_only_filter && !PageAnon(page))
+ if (pageout_anon_only_filter && !folio_test_anon(folio))
continue;
- VM_BUG_ON_PAGE(PageTransCompound(page), page);
+ VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
if (pte_young(ptent)) {
ptent = ptep_get_and_clear_full(mm, addr, pte,
@@ -495,28 +495,28 @@ regular_page:
}
/*
- * We are deactivating a page for accelerating reclaiming.
- * VM couldn't reclaim the page unless we clear PG_young.
+ * We are deactivating a folio for accelerating reclaiming.
+ * VM couldn't reclaim the folio unless we clear PG_young.
* As a side effect, it makes confuse idle-page tracking
* because they will miss recent referenced history.
*/
- ClearPageReferenced(page);
- test_and_clear_page_young(page);
+ folio_clear_referenced(folio);
+ folio_test_clear_young(folio);
if (pageout) {
- if (!isolate_lru_page(page)) {
- if (PageUnevictable(page))
- putback_lru_page(page);
+ if (!folio_isolate_lru(folio)) {
+ if (folio_test_unevictable(folio))
+ folio_putback_lru(folio);
else
- list_add(&page->lru, &page_list);
+ list_add(&folio->lru, &folio_list);
}
} else
- deactivate_page(page);
+ folio_deactivate(folio);
}
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(orig_pte, ptl);
if (pageout)
- reclaim_pages(&page_list);
+ reclaim_pages(&folio_list);
cond_resched();
return 0;
@@ -728,7 +728,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
set_pte_at(mm, addr, pte, ptent);
tlb_remove_tlb_entry(tlb, pte, addr);
}
- mark_page_lazyfree(&folio->page);
+ folio_mark_lazyfree(folio);
}
out:
if (nr_swap) {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 73afff8062f9..faeea84964aa 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -477,6 +477,16 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)
struct mem_cgroup_per_node *mz;
struct mem_cgroup_tree_per_node *mctz;
+ if (lru_gen_enabled()) {
+ struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
+
+ /* see the comment on MEMCG_NR_GENS */
+ if (soft_limit_excess(memcg) && lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD)
+ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
+
+ return;
+ }
+
mctz = soft_limit_tree.rb_tree_per_node[nid];
if (!mctz)
return;
@@ -2939,13 +2949,13 @@ struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p)
}
/*
- * page_memcg_check() is used here, because in theory we can encounter
+ * folio_memcg_check() is used here, because in theory we can encounter
* a folio where the slab flag has been cleared already, but
* slab->memcg_data has not been freed yet
- * page_memcg_check(page) will guarantee that a proper memory
+ * folio_memcg_check() will guarantee that a proper memory
* cgroup pointer or NULL will be returned.
*/
- return page_memcg_check(folio_page(folio, 0));
+ return folio_memcg_check(folio);
}
/*
@@ -3526,6 +3536,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
struct mem_cgroup_tree_per_node *mctz;
unsigned long excess;
+ if (lru_gen_enabled())
+ return 0;
+
if (order > 0)
return 0;
@@ -3914,6 +3927,10 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. "
+ "Please report your usecase to linux-mm@kvack.org if you "
+ "depend on this functionality.\n");
+
if (val & ~MOVE_MASK)
return -EINVAL;
@@ -5382,6 +5399,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
if (unlikely(mem_cgroup_is_root(memcg)))
queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
2UL*HZ);
+ lru_gen_online_memcg(memcg);
return 0;
offline_kmem:
memcg_offline_kmem(memcg);
@@ -5413,6 +5431,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
memcg_offline_kmem(memcg);
reparent_shrinker_deferred(memcg);
wb_memcg_offline(memcg);
+ lru_gen_offline_memcg(memcg);
drain_all_stock(memcg);
@@ -5424,6 +5443,7 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
invalidate_reclaim_iterators(memcg);
+ lru_gen_release_memcg(memcg);
}
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
@@ -5687,7 +5707,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
* @from: mem_cgroup which the page is moved from.
* @to: mem_cgroup which the page is moved to. @from != @to.
*
- * The caller must make sure the page is not on LRU (isolate_page() is useful.)
+ * The page must be locked and not on the LRU.
*
* This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
* from old cgroup.
@@ -5704,20 +5724,13 @@ static int mem_cgroup_move_account(struct page *page,
int nid, ret;
VM_BUG_ON(from == to);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
VM_BUG_ON(compound && !folio_test_large(folio));
- /*
- * Prevent mem_cgroup_migrate() from looking at
- * page's memory cgroup of its source page while we change it.
- */
- ret = -EBUSY;
- if (!folio_trylock(folio))
- goto out;
-
ret = -EINVAL;
if (folio_memcg(folio) != from)
- goto out_unlock;
+ goto out;
pgdat = folio_pgdat(folio);
from_vec = mem_cgroup_lruvec(from, pgdat);
@@ -5804,8 +5817,6 @@ static int mem_cgroup_move_account(struct page *page,
mem_cgroup_charge_statistics(from, -nr_pages);
memcg_check_events(from, nid);
local_irq_enable();
-out_unlock:
- folio_unlock(folio);
out:
return ret;
}
@@ -5854,6 +5865,29 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
else if (is_swap_pte(ptent))
page = mc_handle_swap_pte(vma, ptent, &ent);
+ if (target && page) {
+ if (!trylock_page(page)) {
+ put_page(page);
+ return ret;
+ }
+ /*
+ * page_mapped() must be stable during the move. This
+ * pte is locked, so if it's present, the page cannot
+ * become unmapped. If it isn't, we have only partial
+ * control over the mapped state: the page lock will
+ * prevent new faults against pagecache and swapcache,
+ * so an unmapped page cannot become mapped. However,
+ * if the page is already mapped elsewhere, it can
+ * unmap, and there is nothing we can do about it.
+ * Alas, skip moving the page in this case.
+ */
+ if (!pte_present(ptent) && page_mapped(page)) {
+ unlock_page(page);
+ put_page(page);
+ return ret;
+ }
+ }
+
if (!page && !ent.val)
return ret;
if (page) {
@@ -5870,8 +5904,11 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
if (target)
target->page = page;
}
- if (!ret || !target)
+ if (!ret || !target) {
+ if (target)
+ unlock_page(page);
put_page(page);
+ }
}
/*
* There is a swap entry and a page doesn't exist or isn't charged.
@@ -5911,6 +5948,10 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
ret = MC_TARGET_PAGE;
if (target) {
get_page(page);
+ if (!trylock_page(page)) {
+ put_page(page);
+ return MC_TARGET_NONE;
+ }
target->page = page;
}
}
@@ -6149,6 +6190,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
}
putback_lru_page(page);
}
+ unlock_page(page);
put_page(page);
} else if (target_type == MC_TARGET_DEVICE) {
page = target.page;
@@ -6157,6 +6199,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
mc.precharge -= HPAGE_PMD_NR;
mc.moved_charge += HPAGE_PMD_NR;
}
+ unlock_page(page);
put_page(page);
}
spin_unlock(ptl);
@@ -6199,7 +6242,8 @@ retry:
}
if (!device)
putback_lru_page(page);
-put: /* get_mctgt_type() gets the page */
+put: /* get_mctgt_type() gets & locks the page */
+ unlock_page(page);
put_page(page);
break;
case MC_TARGET_SWAP:
diff --git a/mm/memfd.c b/mm/memfd.c
index 08f5f8304746..a0a7a37e8177 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -18,6 +18,7 @@
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <linux/memfd.h>
+#include <linux/pid_namespace.h>
#include <uapi/linux/memfd.h>
/*
@@ -147,6 +148,7 @@ static unsigned int *memfd_file_seals_ptr(struct file *file)
}
#define F_ALL_SEALS (F_SEAL_SEAL | \
+ F_SEAL_EXEC | \
F_SEAL_SHRINK | \
F_SEAL_GROW | \
F_SEAL_WRITE | \
@@ -175,6 +177,7 @@ static int memfd_add_seals(struct file *file, unsigned int seals)
* SEAL_SHRINK: Prevent the file from shrinking
* SEAL_GROW: Prevent the file from growing
* SEAL_WRITE: Prevent write access to the file
+ * SEAL_EXEC: Prevent modification of the exec bits in the file mode
*
* As we don't require any trust relationship between two parties, we
* must prevent seals from being removed. Therefore, sealing a file
@@ -219,6 +222,12 @@ static int memfd_add_seals(struct file *file, unsigned int seals)
}
}
+ /*
+ * SEAL_EXEC implys SEAL_WRITE, making W^X from the start.
+ */
+ if (seals & F_SEAL_EXEC && inode->i_mode & 0111)
+ seals |= F_SEAL_SHRINK|F_SEAL_GROW|F_SEAL_WRITE|F_SEAL_FUTURE_WRITE;
+
*file_seals |= seals;
error = 0;
@@ -261,12 +270,13 @@ long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
-#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB)
+#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_NOEXEC_SEAL | MFD_EXEC)
SYSCALL_DEFINE2(memfd_create,
const char __user *, uname,
unsigned int, flags)
{
+ char comm[TASK_COMM_LEN];
unsigned int *file_seals;
struct file *file;
int fd, error;
@@ -283,6 +293,40 @@ SYSCALL_DEFINE2(memfd_create,
return -EINVAL;
}
+ /* Invalid if both EXEC and NOEXEC_SEAL are set.*/
+ if ((flags & MFD_EXEC) && (flags & MFD_NOEXEC_SEAL))
+ return -EINVAL;
+
+ if (!(flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) {
+#ifdef CONFIG_SYSCTL
+ int sysctl = MEMFD_NOEXEC_SCOPE_EXEC;
+ struct pid_namespace *ns;
+
+ ns = task_active_pid_ns(current);
+ if (ns)
+ sysctl = ns->memfd_noexec_scope;
+
+ switch (sysctl) {
+ case MEMFD_NOEXEC_SCOPE_EXEC:
+ flags |= MFD_EXEC;
+ break;
+ case MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL:
+ flags |= MFD_NOEXEC_SEAL;
+ break;
+ default:
+ pr_warn_once(
+ "memfd_create(): MFD_NOEXEC_SEAL is enforced, pid=%d '%s'\n",
+ task_pid_nr(current), get_task_comm(comm, current));
+ return -EINVAL;
+ }
+#else
+ flags |= MFD_EXEC;
+#endif
+ pr_warn_once(
+ "memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL, pid=%d '%s'\n",
+ task_pid_nr(current), get_task_comm(comm, current));
+ }
+
/* length includes terminating zero */
len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
if (len <= 0)
@@ -326,7 +370,15 @@ SYSCALL_DEFINE2(memfd_create,
file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
file->f_flags |= O_LARGEFILE;
- if (flags & MFD_ALLOW_SEALING) {
+ if (flags & MFD_NOEXEC_SEAL) {
+ struct inode *inode = file_inode(file);
+
+ inode->i_mode &= ~0111;
+ file_seals = memfd_file_seals_ptr(file);
+ *file_seals &= ~F_SEAL_SEAL;
+ *file_seals |= F_SEAL_EXEC;
+ } else if (flags & MFD_ALLOW_SEALING) {
+ /* MFD_EXEC and MFD_ALLOW_SEALING are set */
file_seals = memfd_file_seals_ptr(file);
*file_seals &= ~F_SEAL_SEAL;
}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index c77a9e37e27e..6bf07345ea2c 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -24,7 +24,7 @@
* - You have a test that can be added to mce-test
* https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/
* - The case actually shows up as a frequent (top 10) page state in
- * tools/vm/page-types when running a real workload.
+ * tools/mm/page-types when running a real workload.
*
* There are several operations here with exponential complexity because
* of unsuitable VM data structures. For example the operation to map back
diff --git a/mm/memory.c b/mm/memory.c
index 3e836fecd035..90f8f72777c7 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -625,6 +625,16 @@ out:
return pfn_to_page(pfn);
}
+struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
+ pte_t pte)
+{
+ struct page *page = vm_normal_page(vma, addr, pte);
+
+ if (page)
+ return page_folio(page);
+ return NULL;
+}
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t pmd)
@@ -878,7 +888,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma);
if (userfaultfd_pte_wp(dst_vma, *src_pte))
/* Uffd-wp needs to be delivered to dest pte as well */
- pte = pte_wrprotect(pte_mkuffd_wp(pte));
+ pte = pte_mkuffd_wp(pte);
set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
return 0;
}
@@ -1392,8 +1402,7 @@ again:
force_flush = 1;
}
}
- if (pte_young(ptent) &&
- likely(!(vma->vm_flags & VM_SEQ_READ)))
+ if (pte_young(ptent) && likely(vma_has_recency(vma)))
mark_page_accessed(page);
}
rss[mm_counter(page)]--;
@@ -1684,36 +1693,6 @@ void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
}
/**
- * zap_page_range - remove user pages in a given range
- * @vma: vm_area_struct holding the applicable pages
- * @start: starting address of pages to zap
- * @size: number of bytes to zap
- *
- * Caller must protect the VMA list
- */
-void zap_page_range(struct vm_area_struct *vma, unsigned long start,
- unsigned long size)
-{
- struct maple_tree *mt = &vma->vm_mm->mm_mt;
- unsigned long end = start + size;
- struct mmu_notifier_range range;
- struct mmu_gather tlb;
- MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
-
- lru_add_drain();
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
- start, start + size);
- tlb_gather_mmu(&tlb, vma->vm_mm);
- update_hiwater_rss(vma->vm_mm);
- mmu_notifier_invalidate_range_start(&range);
- do {
- unmap_single_vma(&tlb, vma, start, range.end, NULL);
- } while ((vma = mas_find(&mas, end - 1)) != NULL);
- mmu_notifier_invalidate_range_end(&range);
- tlb_finish_mmu(&tlb);
-}
-
-/**
* zap_page_range_single - remove user pages in a given range
* @vma: vm_area_struct holding the applicable pages
* @address: starting address of pages to zap
@@ -3950,10 +3929,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
flush_icache_page(vma, page);
if (pte_swp_soft_dirty(vmf->orig_pte))
pte = pte_mksoft_dirty(pte);
- if (pte_swp_uffd_wp(vmf->orig_pte)) {
+ if (pte_swp_uffd_wp(vmf->orig_pte))
pte = pte_mkuffd_wp(pte);
- pte = pte_wrprotect(pte);
- }
vmf->orig_pte = pte;
/* ksm created a completely new copy */
@@ -4296,7 +4273,7 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
if (write)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
if (unlikely(uffd_wp))
- entry = pte_mkuffd_wp(pte_wrprotect(entry));
+ entry = pte_mkuffd_wp(entry);
/* copy-on-write page */
if (write && !(vma->vm_flags & VM_SHARED)) {
inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
@@ -5137,8 +5114,8 @@ static inline void mm_account_fault(struct pt_regs *regs,
#ifdef CONFIG_LRU_GEN
static void lru_gen_enter_fault(struct vm_area_struct *vma)
{
- /* the LRU algorithm doesn't apply to sequential or random reads */
- current->in_lru_fault = !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ));
+ /* the LRU algorithm only applies to accesses with recency */
+ current->in_lru_fault = vma_has_recency(vma);
}
static void lru_gen_exit_fault(void)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f940395667c8..72142fbe7652 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -632,13 +632,12 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
struct mmu_gather tlb;
- int nr_updated;
+ long nr_updated;
tlb_gather_mmu(&tlb, vma->vm_mm);
- nr_updated = change_protection(&tlb, vma, addr, end, PAGE_NONE,
- MM_CP_PROT_NUMA);
- if (nr_updated)
+ nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
+ if (nr_updated > 0)
count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
tlb_finish_mmu(&tlb);
@@ -1490,7 +1489,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
- struct mempolicy *new;
+ struct mempolicy *new, *old;
unsigned long vmstart;
unsigned long vmend;
unsigned long end;
@@ -1522,31 +1521,27 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le
return 0;
mmap_write_lock(mm);
for_each_vma_range(vmi, vma, end) {
- vmstart = max(start, vma->vm_start);
- vmend = min(end, vma->vm_end);
- new = mpol_dup(vma_policy(vma));
- if (IS_ERR(new)) {
- err = PTR_ERR(new);
- break;
- }
- /*
- * Only update home node if there is an existing vma policy
- */
- if (!new)
- continue;
-
/*
* If any vma in the range got policy other than MPOL_BIND
* or MPOL_PREFERRED_MANY we return error. We don't reset
* the home node for vmas we already updated before.
*/
- if (new->mode != MPOL_BIND && new->mode != MPOL_PREFERRED_MANY) {
- mpol_put(new);
+ old = vma_policy(vma);
+ if (!old)
+ continue;
+ if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
err = -EOPNOTSUPP;
break;
}
+ new = mpol_dup(old);
+ if (IS_ERR(new)) {
+ err = PTR_ERR(new);
+ break;
+ }
new->home_node = home_node;
+ vmstart = max(start, vma->vm_start);
+ vmend = min(end, vma->vm_end);
err = mbind_range(mm, vmstart, vmend, new);
mpol_put(new);
if (err)
diff --git a/mm/migrate.c b/mm/migrate.c
index a4d3fc65085f..98de7ce2b576 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -329,24 +329,41 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
}
#ifdef CONFIG_HUGETLB_PAGE
-void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl)
+/*
+ * The vma read lock must be held upon entry. Holding that lock prevents either
+ * the pte or the ptl from being freed.
+ *
+ * This function will release the vma lock before returning.
+ */
+void __migration_entry_wait_huge(struct vm_area_struct *vma,
+ pte_t *ptep, spinlock_t *ptl)
{
pte_t pte;
+ hugetlb_vma_assert_locked(vma);
spin_lock(ptl);
pte = huge_ptep_get(ptep);
- if (unlikely(!is_hugetlb_entry_migration(pte)))
+ if (unlikely(!is_hugetlb_entry_migration(pte))) {
spin_unlock(ptl);
- else
+ hugetlb_vma_unlock_read(vma);
+ } else {
+ /*
+ * If migration entry existed, safe to release vma lock
+ * here because the pgtable page won't be freed without the
+ * pgtable lock released. See comment right above pgtable
+ * lock release in migration_entry_wait_on_locked().
+ */
+ hugetlb_vma_unlock_read(vma);
migration_entry_wait_on_locked(pte_to_swp_entry(pte), NULL, ptl);
+ }
}
void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte)
{
spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, pte);
- __migration_entry_wait_huge(pte, ptl);
+ __migration_entry_wait_huge(vma, pte, ptl);
}
#endif
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 61cf60015a8b..92fc6f3fa512 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -80,13 +80,13 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
return pte_dirty(pte);
}
-static unsigned long change_pte_range(struct mmu_gather *tlb,
+static long change_pte_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
pte_t *pte, oldpte;
spinlock_t *ptl;
- unsigned long pages = 0;
+ long pages = 0;
int target_node = NUMA_NO_NODE;
bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
@@ -177,12 +177,10 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
oldpte = ptep_modify_prot_start(vma, addr, pte);
ptent = pte_modify(oldpte, newprot);
- if (uffd_wp) {
- ptent = pte_wrprotect(ptent);
+ if (uffd_wp)
ptent = pte_mkuffd_wp(ptent);
- } else if (uffd_wp_resolve) {
+ else if (uffd_wp_resolve)
ptent = pte_clear_uffd_wp(ptent);
- }
/*
* In some writable, shared mappings, we might want
@@ -332,36 +330,42 @@ uffd_wp_protect_file(struct vm_area_struct *vma, unsigned long cp_flags)
/*
* If wr-protecting the range for file-backed, populate pgtable for the case
* when pgtable is empty but page cache exists. When {pte|pmd|...}_alloc()
- * failed it means no memory, we don't have a better option but stop.
+ * failed we treat it the same way as pgtable allocation failures during
+ * page faults by kicking OOM and returning error.
*/
#define change_pmd_prepare(vma, pmd, cp_flags) \
- do { \
+ ({ \
+ long err = 0; \
if (unlikely(uffd_wp_protect_file(vma, cp_flags))) { \
- if (WARN_ON_ONCE(pte_alloc(vma->vm_mm, pmd))) \
- break; \
+ if (pte_alloc(vma->vm_mm, pmd)) \
+ err = -ENOMEM; \
} \
- } while (0)
+ err; \
+ })
+
/*
* This is the general pud/p4d/pgd version of change_pmd_prepare(). We need to
* have separate change_pmd_prepare() because pte_alloc() returns 0 on success,
* while {pmd|pud|p4d}_alloc() returns the valid pointer on success.
*/
#define change_prepare(vma, high, low, addr, cp_flags) \
- do { \
+ ({ \
+ long err = 0; \
if (unlikely(uffd_wp_protect_file(vma, cp_flags))) { \
low##_t *p = low##_alloc(vma->vm_mm, high, addr); \
- if (WARN_ON_ONCE(p == NULL)) \
- break; \
+ if (p == NULL) \
+ err = -ENOMEM; \
} \
- } while (0)
+ err; \
+ })
-static inline unsigned long change_pmd_range(struct mmu_gather *tlb,
+static inline long change_pmd_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pud_t *pud, unsigned long addr,
unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
pmd_t *pmd;
unsigned long next;
- unsigned long pages = 0;
+ long pages = 0;
unsigned long nr_huge_updates = 0;
struct mmu_notifier_range range;
@@ -369,11 +373,15 @@ static inline unsigned long change_pmd_range(struct mmu_gather *tlb,
pmd = pmd_offset(pud, addr);
do {
- unsigned long this_pages;
+ long ret;
next = pmd_addr_end(addr, end);
- change_pmd_prepare(vma, pmd, cp_flags);
+ ret = change_pmd_prepare(vma, pmd, cp_flags);
+ if (ret) {
+ pages = ret;
+ break;
+ }
/*
* Automatic NUMA balancing walks the tables with mmap_lock
* held for read. It's possible a parallel update to occur
@@ -403,7 +411,11 @@ static inline unsigned long change_pmd_range(struct mmu_gather *tlb,
* cleared; make sure pmd populated if
* necessary, then fall-through to pte level.
*/
- change_pmd_prepare(vma, pmd, cp_flags);
+ ret = change_pmd_prepare(vma, pmd, cp_flags);
+ if (ret) {
+ pages = ret;
+ break;
+ }
} else {
/*
* change_huge_pmd() does not defer TLB flushes,
@@ -424,9 +436,8 @@ static inline unsigned long change_pmd_range(struct mmu_gather *tlb,
}
/* fall through, the trans huge pmd just split */
}
- this_pages = change_pte_range(tlb, vma, pmd, addr, next,
- newprot, cp_flags);
- pages += this_pages;
+ pages += change_pte_range(tlb, vma, pmd, addr, next,
+ newprot, cp_flags);
next:
cond_resched();
} while (pmd++, addr = next, addr != end);
@@ -439,18 +450,20 @@ next:
return pages;
}
-static inline unsigned long change_pud_range(struct mmu_gather *tlb,
+static inline long change_pud_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr,
unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
pud_t *pud;
unsigned long next;
- unsigned long pages = 0;
+ long pages = 0, ret;
pud = pud_offset(p4d, addr);
do {
next = pud_addr_end(addr, end);
- change_prepare(vma, pud, pmd, addr, cp_flags);
+ ret = change_prepare(vma, pud, pmd, addr, cp_flags);
+ if (ret)
+ return ret;
if (pud_none_or_clear_bad(pud))
continue;
pages += change_pmd_range(tlb, vma, pud, addr, next, newprot,
@@ -460,18 +473,20 @@ static inline unsigned long change_pud_range(struct mmu_gather *tlb,
return pages;
}
-static inline unsigned long change_p4d_range(struct mmu_gather *tlb,
+static inline long change_p4d_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr,
unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
p4d_t *p4d;
unsigned long next;
- unsigned long pages = 0;
+ long pages = 0, ret;
p4d = p4d_offset(pgd, addr);
do {
next = p4d_addr_end(addr, end);
- change_prepare(vma, p4d, pud, addr, cp_flags);
+ ret = change_prepare(vma, p4d, pud, addr, cp_flags);
+ if (ret)
+ return ret;
if (p4d_none_or_clear_bad(p4d))
continue;
pages += change_pud_range(tlb, vma, p4d, addr, next, newprot,
@@ -481,21 +496,25 @@ static inline unsigned long change_p4d_range(struct mmu_gather *tlb,
return pages;
}
-static unsigned long change_protection_range(struct mmu_gather *tlb,
+static long change_protection_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long addr,
unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
unsigned long next;
- unsigned long pages = 0;
+ long pages = 0, ret;
BUG_ON(addr >= end);
pgd = pgd_offset(mm, addr);
tlb_start_vma(tlb, vma);
do {
next = pgd_addr_end(addr, end);
- change_prepare(vma, pgd, p4d, addr, cp_flags);
+ ret = change_prepare(vma, pgd, p4d, addr, cp_flags);
+ if (ret) {
+ pages = ret;
+ break;
+ }
if (pgd_none_or_clear_bad(pgd))
continue;
pages += change_p4d_range(tlb, vma, pgd, addr, next, newprot,
@@ -507,15 +526,27 @@ static unsigned long change_protection_range(struct mmu_gather *tlb,
return pages;
}
-unsigned long change_protection(struct mmu_gather *tlb,
+long change_protection(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long start,
- unsigned long end, pgprot_t newprot,
- unsigned long cp_flags)
+ unsigned long end, unsigned long cp_flags)
{
- unsigned long pages;
+ pgprot_t newprot = vma->vm_page_prot;
+ long pages;
BUG_ON((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL);
+#ifdef CONFIG_NUMA_BALANCING
+ /*
+ * Ordinary protection updates (mprotect, uffd-wp, softdirty tracking)
+ * are expected to reflect their requirements via VMA flags such that
+ * vma_set_page_prot() will adjust vma->vm_page_prot accordingly.
+ */
+ if (cp_flags & MM_CP_PROT_NUMA)
+ newprot = PAGE_NONE;
+#else
+ WARN_ON_ONCE(cp_flags & MM_CP_PROT_NUMA);
+#endif
+
if (is_vm_hugetlb_page(vma))
pages = hugetlb_change_protection(vma, start, end, newprot,
cp_flags);
@@ -644,7 +675,7 @@ success:
mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
vma_set_page_prot(vma);
- change_protection(tlb, vma, start, end, vma->vm_page_prot, mm_cp_flags);
+ change_protection(tlb, vma, start, end, mm_cp_flags);
/*
* Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
diff --git a/mm/nommu.c b/mm/nommu.c
index 5b83938ecb67..df1711acdf5b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -892,29 +892,36 @@ static unsigned long determine_vm_flags(struct file *file,
unsigned long vm_flags;
vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(flags);
- /* vm_flags |= mm->def_flags; */
- if (!(capabilities & NOMMU_MAP_DIRECT)) {
- /* attempt to share read-only copies of mapped file chunks */
+ if (!file) {
+ /*
+ * MAP_ANONYMOUS. MAP_SHARED is mapped to MAP_PRIVATE, because
+ * there is no fork().
+ */
vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
- if (file && !(prot & PROT_WRITE))
- vm_flags |= VM_MAYSHARE;
+ } else if (flags & MAP_PRIVATE) {
+ /* MAP_PRIVATE file mapping */
+ if (capabilities & NOMMU_MAP_DIRECT)
+ vm_flags |= (capabilities & NOMMU_VMFLAGS);
+ else
+ vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
+
+ if (!(prot & PROT_WRITE) && !current->ptrace)
+ /*
+ * R/O private file mapping which cannot be used to
+ * modify memory, especially also not via active ptrace
+ * (e.g., set breakpoints) or later by upgrading
+ * permissions (no mprotect()). We can try overlaying
+ * the file mapping, which will work e.g., on chardevs,
+ * ramfs/tmpfs/shmfs and romfs/cramf.
+ */
+ vm_flags |= VM_MAYOVERLAY;
} else {
- /* overlay a shareable mapping on the backing device or inode
- * if possible - used for chardevs, ramfs/tmpfs/shmfs and
- * romfs/cramfs */
- vm_flags |= VM_MAYSHARE | (capabilities & NOMMU_VMFLAGS);
- if (flags & MAP_SHARED)
- vm_flags |= VM_SHARED;
+ /* MAP_SHARED file mapping: NOMMU_MAP_DIRECT is set. */
+ vm_flags |= VM_SHARED | VM_MAYSHARE |
+ (capabilities & NOMMU_VMFLAGS);
}
- /* refuse to let anyone share private mappings with this process if
- * it's being traced - otherwise breakpoints set in it may interfere
- * with another untraced process
- */
- if ((flags & MAP_PRIVATE) && current->ptrace)
- vm_flags &= ~VM_MAYSHARE;
-
return vm_flags;
}
@@ -952,15 +959,18 @@ static int do_mmap_private(struct vm_area_struct *vma,
void *base;
int ret, order;
- /* invoke the file's mapping function so that it can keep track of
- * shared mappings on devices or memory
- * - VM_MAYSHARE will be set if it may attempt to share
+ /*
+ * Invoke the file's mapping function so that it can keep track of
+ * shared mappings on devices or memory. VM_MAYOVERLAY will be set if
+ * it may attempt to share, which will make is_nommu_shared_mapping()
+ * happy.
*/
if (capabilities & NOMMU_MAP_DIRECT) {
ret = call_mmap(vma->vm_file, vma);
+ /* shouldn't return success if we're not sharing */
+ if (WARN_ON_ONCE(!is_nommu_shared_mapping(vma->vm_flags)))
+ ret = -ENOSYS;
if (ret == 0) {
- /* shouldn't return success if we're not sharing */
- BUG_ON(!(vma->vm_flags & VM_MAYSHARE));
vma->vm_region->vm_top = vma->vm_region->vm_end;
return 0;
}
@@ -1106,7 +1116,7 @@ unsigned long do_mmap(struct file *file,
* these cases, sharing is handled in the driver or filesystem rather
* than here
*/
- if (vm_flags & VM_MAYSHARE) {
+ if (is_nommu_shared_mapping(vm_flags)) {
struct vm_region *pregion;
unsigned long pglen, rpglen, pgend, rpgend, start;
@@ -1116,7 +1126,7 @@ unsigned long do_mmap(struct file *file,
for (rb = rb_first(&nommu_region_tree); rb; rb = rb_next(rb)) {
pregion = rb_entry(rb, struct vm_region, vm_rb);
- if (!(pregion->vm_flags & VM_MAYSHARE))
+ if (!is_nommu_shared_mapping(pregion->vm_flags))
continue;
/* search for overlapping mappings on the same file */
@@ -1600,7 +1610,7 @@ static unsigned long do_mremap(unsigned long addr,
if (vma->vm_end != vma->vm_start + old_len)
return (unsigned long) -EFAULT;
- if (vma->vm_flags & VM_MAYSHARE)
+ if (is_nommu_shared_mapping(vma->vm_flags))
return (unsigned long) -EPERM;
if (new_len > vma->vm_region->vm_end - vma->vm_region->vm_start)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index ad608ef2a243..e91f94b3438b 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2526,12 +2526,8 @@ continue_unlock:
}
EXPORT_SYMBOL(write_cache_pages);
-/*
- * Function used by generic_writepages to call the real writepage
- * function and set the mapping flags on error
- */
-static int __writepage(struct page *page, struct writeback_control *wbc,
- void *data)
+static int writepage_cb(struct page *page, struct writeback_control *wbc,
+ void *data)
{
struct address_space *mapping = data;
int ret = mapping->a_ops->writepage(page, wbc);
@@ -2539,34 +2535,6 @@ static int __writepage(struct page *page, struct writeback_control *wbc,
return ret;
}
-/**
- * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them.
- * @mapping: address space structure to write
- * @wbc: subtract the number of written pages from *@wbc->nr_to_write
- *
- * This is a library function, which implements the writepages()
- * address_space_operation.
- *
- * Return: %0 on success, negative error code otherwise
- */
-int generic_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
-{
- struct blk_plug plug;
- int ret;
-
- /* deal with chardevs and other special file */
- if (!mapping->a_ops->writepage)
- return 0;
-
- blk_start_plug(&plug);
- ret = write_cache_pages(mapping, wbc, __writepage, mapping);
- blk_finish_plug(&plug);
- return ret;
-}
-
-EXPORT_SYMBOL(generic_writepages);
-
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
int ret;
@@ -2577,11 +2545,20 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
wb = inode_to_wb_wbc(mapping->host, wbc);
wb_bandwidth_estimate_start(wb);
while (1) {
- if (mapping->a_ops->writepages)
+ if (mapping->a_ops->writepages) {
ret = mapping->a_ops->writepages(mapping, wbc);
- else
- ret = generic_writepages(mapping, wbc);
- if ((ret != -ENOMEM) || (wbc->sync_mode != WB_SYNC_ALL))
+ } else if (mapping->a_ops->writepage) {
+ struct blk_plug plug;
+
+ blk_start_plug(&plug);
+ ret = write_cache_pages(mapping, wbc, writepage_cb,
+ mapping);
+ blk_finish_plug(&plug);
+ } else {
+ /* deal with chardevs and other special files */
+ ret = 0;
+ }
+ if (ret != -ENOMEM || wbc->sync_mode != WB_SYNC_ALL)
break;
/*
@@ -2713,7 +2690,7 @@ void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb)
*
* The caller must hold lock_page_memcg(). Most callers have the folio
* locked. A few have the folio blocked from truncation through other
- * means (eg zap_page_range() has it mapped and is holding the page table
+ * means (eg zap_vma_pages() has it mapped and is holding the page table
* lock). This can also be called from mark_buffer_dirty(), which I
* cannot prove is always protected against truncate.
*/
@@ -2846,11 +2823,11 @@ bool folio_mark_dirty(struct folio *folio)
if (likely(mapping)) {
/*
- * readahead/lru_deactivate_page could remain
+ * readahead/folio_deactivate could remain
* PG_readahead/PG_reclaim due to race with folio_end_writeback
* About readahead, if the folio is written, the flags would be
* reset. So no problem.
- * About lru_deactivate_page, if the folio is redirtied,
+ * About folio_deactivate, if the folio is redirtied,
* the flag will be reset. So no problem. but if the
* folio is used by readahead it will confuse readahead
* and make it restart the size rampup process. But it's
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0745aedebb37..5514d84cc712 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -443,15 +443,15 @@ static inline bool deferred_pages_enabled(void)
return static_branch_unlikely(&deferred_pages);
}
-/* Returns true if the struct page for the pfn is uninitialised */
-static inline bool __meminit early_page_uninitialised(unsigned long pfn)
+/* Returns true if the struct page for the pfn is initialised */
+static inline bool __meminit early_page_initialised(unsigned long pfn)
{
int nid = early_pfn_to_nid(pfn);
if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
- return true;
+ return false;
- return false;
+ return true;
}
/*
@@ -498,9 +498,9 @@ static inline bool deferred_pages_enabled(void)
return false;
}
-static inline bool early_page_uninitialised(unsigned long pfn)
+static inline bool early_page_initialised(unsigned long pfn)
{
- return false;
+ return true;
}
static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
@@ -1356,6 +1356,8 @@ out:
* see the comment next to it.
* 3. Skipping poisoning is requested via __GFP_SKIP_KASAN_POISON,
* see the comment next to it.
+ * 4. The allocation is excluded from being checked due to sampling,
+ * see the call to kasan_unpoison_pages.
*
* Poisoning pages during deferred memory init will greatly lengthen the
* process and cause problem in large memory systems as the deferred pages
@@ -1641,7 +1643,7 @@ static void __meminit init_reserved_page(unsigned long pfn)
pg_data_t *pgdat;
int nid, zid;
- if (!early_page_uninitialised(pfn))
+ if (early_page_initialised(pfn))
return;
nid = early_pfn_to_nid(pfn);
@@ -1804,7 +1806,7 @@ int __meminit early_pfn_to_nid(unsigned long pfn)
void __init memblock_free_pages(struct page *page, unsigned long pfn,
unsigned int order)
{
- if (early_page_uninitialised(pfn))
+ if (!early_page_initialised(pfn))
return;
if (!kmsan_memblock_free_pages(page, order)) {
/* KMSAN will take care of these pages. */
@@ -2468,7 +2470,8 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
{
bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) &&
!should_skip_init(gfp_flags);
- bool init_tags = init && (gfp_flags & __GFP_ZEROTAGS);
+ bool zero_tags = init && (gfp_flags & __GFP_ZEROTAGS);
+ bool reset_tags = !zero_tags;
int i;
set_page_private(page, 0);
@@ -2491,30 +2494,42 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
*/
/*
- * If memory tags should be zeroed (which happens only when memory
- * should be initialized as well).
+ * If memory tags should be zeroed
+ * (which happens only when memory should be initialized as well).
*/
- if (init_tags) {
+ if (zero_tags) {
/* Initialize both memory and tags. */
for (i = 0; i != 1 << order; ++i)
tag_clear_highpage(page + i);
- /* Note that memory is already initialized by the loop above. */
+ /* Take note that memory was initialized by the loop above. */
init = false;
}
if (!should_skip_kasan_unpoison(gfp_flags)) {
- /* Unpoison shadow memory or set memory tags. */
- kasan_unpoison_pages(page, order, init);
-
- /* Note that memory is already initialized by KASAN. */
- if (kasan_has_integrated_init())
- init = false;
- } else {
- /* Ensure page_address() dereferencing does not fault. */
+ /* Try unpoisoning (or setting tags) and initializing memory. */
+ if (kasan_unpoison_pages(page, order, init)) {
+ /* Take note that memory was initialized by KASAN. */
+ if (kasan_has_integrated_init())
+ init = false;
+ /* Take note that memory tags were set by KASAN. */
+ reset_tags = false;
+ } else {
+ /*
+ * KASAN decided to exclude this allocation from being
+ * poisoned due to sampling. Skip poisoning as well.
+ */
+ SetPageSkipKASanPoison(page);
+ }
+ }
+ /*
+ * If memory tags have not been set, reset the page tags to ensure
+ * page_address() dereferencing does not fault.
+ */
+ if (reset_tags) {
for (i = 0; i != 1 << order; ++i)
page_kasan_tag_reset(page + i);
}
- /* If memory is still not initialized, do it now. */
+ /* If memory is still not initialized, initialize it now. */
if (init)
kernel_init_pages(page, 1 << order);
/* Propagate __GFP_SKIP_KASAN_POISON to page flags. */
@@ -7926,6 +7941,7 @@ static void __init free_area_init_node(int nid)
pgdat_set_deferred_range(pgdat);
free_area_init_core(pgdat);
+ lru_gen_init_pgdat(pgdat);
}
static void __init free_area_init_memoryless_node(int nid)
diff --git a/mm/page_idle.c b/mm/page_idle.c
index bc08332a609c..41ea77f22011 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -31,19 +31,22 @@
*
* This function tries to get a user memory page by pfn as described above.
*/
-static struct page *page_idle_get_page(unsigned long pfn)
+static struct folio *page_idle_get_folio(unsigned long pfn)
{
struct page *page = pfn_to_online_page(pfn);
+ struct folio *folio;
- if (!page || !PageLRU(page) ||
- !get_page_unless_zero(page))
+ if (!page || PageTail(page))
return NULL;
- if (unlikely(!PageLRU(page))) {
- put_page(page);
- page = NULL;
+ folio = page_folio(page);
+ if (!folio_test_lru(folio) || !folio_try_get(folio))
+ return NULL;
+ if (unlikely(page_folio(page) != folio || !folio_test_lru(folio))) {
+ folio_put(folio);
+ folio = NULL;
}
- return page;
+ return folio;
}
static bool page_idle_clear_pte_refs_one(struct folio *folio,
@@ -83,10 +86,8 @@ static bool page_idle_clear_pte_refs_one(struct folio *folio,
return true;
}
-static void page_idle_clear_pte_refs(struct page *page)
+static void page_idle_clear_pte_refs(struct folio *folio)
{
- struct folio *folio = page_folio(page);
-
/*
* Since rwc.try_lock is unused, rwc is effectively immutable, so we
* can make it static to save some cycles and stack.
@@ -115,7 +116,7 @@ static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
loff_t pos, size_t count)
{
u64 *out = (u64 *)buf;
- struct page *page;
+ struct folio *folio;
unsigned long pfn, end_pfn;
int bit;
@@ -134,19 +135,19 @@ static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
bit = pfn % BITMAP_CHUNK_BITS;
if (!bit)
*out = 0ULL;
- page = page_idle_get_page(pfn);
- if (page) {
- if (page_is_idle(page)) {
+ folio = page_idle_get_folio(pfn);
+ if (folio) {
+ if (folio_test_idle(folio)) {
/*
* The page might have been referenced via a
* pte, in which case it is not idle. Clear
* refs and recheck.
*/
- page_idle_clear_pte_refs(page);
- if (page_is_idle(page))
+ page_idle_clear_pte_refs(folio);
+ if (folio_test_idle(folio))
*out |= 1ULL << bit;
}
- put_page(page);
+ folio_put(folio);
}
if (bit == BITMAP_CHUNK_BITS - 1)
out++;
@@ -160,7 +161,7 @@ static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
loff_t pos, size_t count)
{
const u64 *in = (u64 *)buf;
- struct page *page;
+ struct folio *folio;
unsigned long pfn, end_pfn;
int bit;
@@ -178,11 +179,11 @@ static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
for (; pfn < end_pfn; pfn++) {
bit = pfn % BITMAP_CHUNK_BITS;
if ((*in >> bit) & 1) {
- page = page_idle_get_page(pfn);
- if (page) {
- page_idle_clear_pte_refs(page);
- set_page_idle(page);
- put_page(page);
+ folio = page_idle_get_folio(pfn);
+ if (folio) {
+ page_idle_clear_pte_refs(folio);
+ folio_set_idle(folio);
+ folio_put(folio);
}
}
if (bit == BITMAP_CHUNK_BITS - 1)
diff --git a/mm/page_io.c b/mm/page_io.c
index 3a5f921b932e..905d9fcc0c96 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -18,7 +18,6 @@
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/swapops.h>
-#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/frontswap.h>
#include <linux/blkdev.h>
diff --git a/mm/page_reporting.c b/mm/page_reporting.c
index 79a8554f024c..c65813a9dc78 100644
--- a/mm/page_reporting.c
+++ b/mm/page_reporting.c
@@ -356,7 +356,8 @@ int page_reporting_register(struct page_reporting_dev_info *prdev)
mutex_lock(&page_reporting_mutex);
/* nothing to do if already in use */
- if (rcu_access_pointer(pr_dev_info)) {
+ if (rcu_dereference_protected(pr_dev_info,
+ lockdep_is_held(&page_reporting_mutex))) {
err = -EBUSY;
goto err_out;
}
@@ -401,7 +402,8 @@ void page_reporting_unregister(struct page_reporting_dev_info *prdev)
{
mutex_lock(&page_reporting_mutex);
- if (rcu_access_pointer(pr_dev_info) == prdev) {
+ if (prdev == rcu_dereference_protected(pr_dev_info,
+ lockdep_is_held(&page_reporting_mutex))) {
/* Disable page reporting notification */
RCU_INIT_POINTER(pr_dev_info, NULL);
synchronize_rcu();
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 93e13fc17d3c..4e448cfbc6ef 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -168,9 +168,12 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
/* The only possible mapping was handled on last iteration */
if (pvmw->pte)
return not_found(pvmw);
-
- /* when pud is not present, pte will be NULL */
- pvmw->pte = huge_pte_offset(mm, pvmw->address, size);
+ /*
+ * All callers that get here will already hold the
+ * i_mmap_rwsem. Therefore, no additional locks need to be
+ * taken before calling hugetlb_walk().
+ */
+ pvmw->pte = hugetlb_walk(vma, pvmw->address, size);
if (!pvmw->pte)
return false;
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 7f1c9b274906..cb23f8a15c13 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -302,18 +302,18 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end,
const struct mm_walk_ops *ops = walk->ops;
int err = 0;
+ hugetlb_vma_lock_read(vma);
do {
next = hugetlb_entry_end(h, addr, end);
- pte = huge_pte_offset(walk->mm, addr & hmask, sz);
-
+ pte = hugetlb_walk(vma, addr & hmask, sz);
if (pte)
err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
else if (ops->pte_hole)
err = ops->pte_hole(addr, next, -1, walk);
-
if (err)
break;
} while (addr = next, addr != end);
+ hugetlb_vma_unlock_read(vma);
return err;
}
diff --git a/mm/rmap.c b/mm/rmap.c
index b616870a09be..ab74e0547a52 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -823,25 +823,14 @@ static bool folio_referenced_one(struct folio *folio,
}
if (pvmw.pte) {
- if (lru_gen_enabled() && pte_young(*pvmw.pte) &&
- !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) {
+ if (lru_gen_enabled() && pte_young(*pvmw.pte)) {
lru_gen_look_around(&pvmw);
referenced++;
}
if (ptep_clear_flush_young_notify(vma, address,
- pvmw.pte)) {
- /*
- * Don't treat a reference through
- * a sequentially read mapping as such.
- * If the folio has been used in another mapping,
- * we will catch it; if this other mapping is
- * already gone, the unmap path will have set
- * the referenced flag or activated the folio.
- */
- if (likely(!(vma->vm_flags & VM_SEQ_READ)))
- referenced++;
- }
+ pvmw.pte))
+ referenced++;
} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
if (pmdp_clear_flush_young_notify(vma, address,
pvmw.pmd))
@@ -875,7 +864,20 @@ static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg)
struct folio_referenced_arg *pra = arg;
struct mem_cgroup *memcg = pra->memcg;
- if (!mm_match_cgroup(vma->vm_mm, memcg))
+ /*
+ * Ignore references from this mapping if it has no recency. If the
+ * folio has been used in another mapping, we will catch it; if this
+ * other mapping is already gone, the unmap path will have set the
+ * referenced flag or activated the folio in zap_pte_range().
+ */
+ if (!vma_has_recency(vma))
+ return true;
+
+ /*
+ * If we are reclaiming on behalf of a cgroup, skip counting on behalf
+ * of references from different cgroups.
+ */
+ if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
return true;
return false;
@@ -906,6 +908,7 @@ int folio_referenced(struct folio *folio, int is_locked,
.arg = (void *)&pra,
.anon_lock = folio_lock_anon_vma_read,
.try_lock = true,
+ .invalid_vma = invalid_folio_referenced_vma,
};
*vm_flags = 0;
@@ -921,15 +924,6 @@ int folio_referenced(struct folio *folio, int is_locked,
return 1;
}
- /*
- * If we are reclaiming on behalf of a cgroup, skip
- * counting on behalf of references from different
- * cgroups
- */
- if (memcg) {
- rwc.invalid_vma = invalid_folio_referenced_vma;
- }
-
rmap_walk(folio, &rwc);
*vm_flags = pra.vm_flags;
@@ -1222,9 +1216,6 @@ void page_add_anon_rmap(struct page *page,
bool compound = flags & RMAP_COMPOUND;
bool first = true;
- if (unlikely(PageKsm(page)))
- lock_page_memcg(page);
-
/* Is page being mapped by PTE? Is this its first map to be added? */
if (likely(!compound)) {
first = atomic_inc_and_test(&page->_mapcount);
@@ -1262,15 +1253,14 @@ void page_add_anon_rmap(struct page *page,
if (nr)
__mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
- if (unlikely(PageKsm(page)))
- unlock_page_memcg(page);
-
- /* address might be in next vma when migration races vma_adjust */
- else if (first)
- __page_set_anon_rmap(page, vma, address,
- !!(flags & RMAP_EXCLUSIVE));
- else
- __page_check_anon_rmap(page, vma, address);
+ if (likely(!PageKsm(page))) {
+ /* address might be in next vma when migration races vma_adjust */
+ if (first)
+ __page_set_anon_rmap(page, vma, address,
+ !!(flags & RMAP_EXCLUSIVE));
+ else
+ __page_check_anon_rmap(page, vma, address);
+ }
mlock_vma_page(page, vma, compound);
}
@@ -1329,7 +1319,6 @@ void page_add_file_rmap(struct page *page,
bool first;
VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
- lock_page_memcg(page);
/* Is page being mapped by PTE? Is this its first map to be added? */
if (likely(!compound)) {
@@ -1365,7 +1354,6 @@ void page_add_file_rmap(struct page *page,
NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED, nr_pmdmapped);
if (nr)
__mod_lruvec_page_state(page, NR_FILE_MAPPED, nr);
- unlock_page_memcg(page);
mlock_vma_page(page, vma, compound);
}
@@ -1394,8 +1382,6 @@ void page_remove_rmap(struct page *page,
return;
}
- lock_page_memcg(page);
-
/* Is page being unmapped by PTE? Is this its last map to be removed? */
if (likely(!compound)) {
last = atomic_add_negative(-1, &page->_mapcount);
@@ -1451,8 +1437,6 @@ void page_remove_rmap(struct page *page,
* and remember that it's only reliable while mapped.
*/
- unlock_page_memcg(page);
-
munlock_vma_page(page, vma, compound);
}
diff --git a/mm/shmem.c b/mm/shmem.c
index 0005ab2c29af..bc5c156ef470 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1093,6 +1093,12 @@ static int shmem_setattr(struct user_namespace *mnt_userns,
if (error)
return error;
+ if ((info->seals & F_SEAL_EXEC) && (attr->ia_valid & ATTR_MODE)) {
+ if ((inode->i_mode ^ attr->ia_mode) & 0111) {
+ return -EPERM;
+ }
+ }
+
if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
loff_t oldsize = inode->i_size;
loff_t newsize = attr->ia_size;
@@ -1733,6 +1739,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info = SHMEM_I(inode);
struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL;
+ struct swap_info_struct *si;
struct folio *folio = NULL;
swp_entry_t swap;
int error;
@@ -1744,6 +1751,14 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
if (is_swapin_error_entry(swap))
return -EIO;
+ si = get_swap_device(swap);
+ if (!si) {
+ if (!shmem_confirm_swap(mapping, index, swap))
+ return -EEXIST;
+ else
+ return -EINVAL;
+ }
+
/* Look it up and read it in.. */
folio = swap_cache_get_folio(swap, NULL, 0);
if (!folio) {
@@ -1804,6 +1819,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
delete_from_swap_cache(folio);
folio_mark_dirty(folio);
swap_free(swap);
+ put_swap_device(si);
*foliop = folio;
return 0;
@@ -1817,6 +1833,7 @@ unlock:
folio_unlock(folio);
folio_put(folio);
}
+ put_swap_device(si);
return error;
}
diff --git a/mm/slab.c b/mm/slab.c
index 7a269db050ee..b77be9c6d6b1 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1373,7 +1373,7 @@ static struct slab *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
/* Make the flag visible before any changes to folio->mapping */
smp_wmb();
/* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
- if (sk_memalloc_socks() && page_is_pfmemalloc(folio_page(folio, 0)))
+ if (sk_memalloc_socks() && folio_is_pfmemalloc(folio))
slab_set_pfmemalloc(slab);
return slab;
diff --git a/mm/slab.h b/mm/slab.h
index 7cc432969945..63fb4c00d529 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -323,6 +323,14 @@ static inline slab_flags_t kmem_cache_flags(unsigned int object_size,
}
#endif
+static inline bool is_kmalloc_cache(struct kmem_cache *s)
+{
+#ifndef CONFIG_SLOB
+ return (s->flags & SLAB_KMALLOC);
+#else
+ return false;
+#endif
+}
/* Legal flag mask for kmem_cache_create(), for various configurations */
#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 1cba98acc486..bf4e777cfe90 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -670,7 +670,6 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name,
create_boot_cache(s, name, size, flags | SLAB_KMALLOC, useroffset,
usersize);
- kasan_cache_create_kmalloc(s);
list_add(&s->list, &slab_caches);
s->refcount = 1;
return s;
diff --git a/mm/slub.c b/mm/slub.c
index 13459c69095a..67020074ecb4 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1859,7 +1859,7 @@ static inline struct slab *alloc_slab_page(gfp_t flags, int node,
__folio_set_slab(folio);
/* Make the flag visible before any changes to folio->mapping */
smp_wmb();
- if (page_is_pfmemalloc(folio_page(folio, 0)))
+ if (folio_is_pfmemalloc(folio))
slab_set_pfmemalloc(slab);
return slab;
diff --git a/mm/swap.c b/mm/swap.c
index 70e2063ef43a..e54e2a252e27 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -733,17 +733,15 @@ void deactivate_file_folio(struct folio *folio)
}
/*
- * deactivate_page - deactivate a page
- * @page: page to deactivate
+ * folio_deactivate - deactivate a folio
+ * @folio: folio to deactivate
*
- * deactivate_page() moves @page to the inactive list if @page was on the active
- * list and was not an unevictable page. This is done to accelerate the reclaim
- * of @page.
+ * folio_deactivate() moves @folio to the inactive list if @folio was on the
+ * active list and was not unevictable. This is done to accelerate the
+ * reclaim of @folio.
*/
-void deactivate_page(struct page *page)
+void folio_deactivate(struct folio *folio)
{
- struct folio *folio = page_folio(page);
-
if (folio_test_lru(folio) && !folio_test_unevictable(folio) &&
(folio_test_active(folio) || lru_gen_enabled())) {
struct folio_batch *fbatch;
@@ -757,16 +755,14 @@ void deactivate_page(struct page *page)
}
/**
- * mark_page_lazyfree - make an anon page lazyfree
- * @page: page to deactivate
+ * folio_mark_lazyfree - make an anon folio lazyfree
+ * @folio: folio to deactivate
*
- * mark_page_lazyfree() moves @page to the inactive file list.
- * This is done to accelerate the reclaim of @page.
+ * folio_mark_lazyfree() moves @folio to the inactive file list.
+ * This is done to accelerate the reclaim of @folio.
*/
-void mark_page_lazyfree(struct page *page)
+void folio_mark_lazyfree(struct folio *folio)
{
- struct folio *folio = page_folio(page);
-
if (folio_test_lru(folio) && folio_test_anon(folio) &&
folio_test_swapbacked(folio) && !folio_test_swapcache(folio) &&
!folio_test_unevictable(folio)) {
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 2927507b43d8..cb9aaa00951d 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -321,19 +321,15 @@ static inline bool swap_use_vma_readahead(void)
* unlocked and with its refcount incremented - we rely on the kernel
* lock getting page table operations atomic even if we drop the folio
* lock before returning.
+ *
+ * Caller must lock the swap device or hold a reference to keep it valid.
*/
struct folio *swap_cache_get_folio(swp_entry_t entry,
struct vm_area_struct *vma, unsigned long addr)
{
struct folio *folio;
- struct swap_info_struct *si;
- si = get_swap_device(entry);
- if (!si)
- return NULL;
folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry));
- put_swap_device(si);
-
if (folio) {
bool vma_ra = swap_use_vma_readahead();
bool readahead;
@@ -693,28 +689,15 @@ void exit_swap_address_space(unsigned int type)
swapper_spaces[type] = NULL;
}
-static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma,
- unsigned long faddr,
- unsigned long lpfn,
- unsigned long rpfn,
- unsigned long *start,
- unsigned long *end)
-{
- *start = max3(lpfn, PFN_DOWN(vma->vm_start),
- PFN_DOWN(faddr & PMD_MASK));
- *end = min3(rpfn, PFN_DOWN(vma->vm_end),
- PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));
-}
-
static void swap_ra_info(struct vm_fault *vmf,
- struct vma_swap_readahead *ra_info)
+ struct vma_swap_readahead *ra_info)
{
struct vm_area_struct *vma = vmf->vma;
unsigned long ra_val;
- unsigned long faddr, pfn, fpfn;
+ unsigned long faddr, pfn, fpfn, lpfn, rpfn;
unsigned long start, end;
pte_t *pte, *orig_pte;
- unsigned int max_win, hits, prev_win, win, left;
+ unsigned int max_win, hits, prev_win, win;
#ifndef CONFIG_64BIT
pte_t *tpte;
#endif
@@ -727,8 +710,6 @@ static void swap_ra_info(struct vm_fault *vmf,
}
faddr = vmf->address;
- orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
-
fpfn = PFN_DOWN(faddr);
ra_val = GET_SWAP_RA_VAL(vma);
pfn = PFN_DOWN(SWAP_RA_ADDR(ra_val));
@@ -739,22 +720,28 @@ static void swap_ra_info(struct vm_fault *vmf,
atomic_long_set(&vma->swap_readahead_info,
SWAP_RA_VAL(faddr, win, 0));
- if (win == 1) {
- pte_unmap(orig_pte);
+ if (win == 1)
return;
- }
/* Copy the PTEs because the page table may be unmapped */
- if (fpfn == pfn + 1)
- swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win, &start, &end);
- else if (pfn == fpfn + 1)
- swap_ra_clamp_pfn(vma, faddr, fpfn - win + 1, fpfn + 1,
- &start, &end);
- else {
- left = (win - 1) / 2;
- swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left,
- &start, &end);
+ orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
+ if (fpfn == pfn + 1) {
+ lpfn = fpfn;
+ rpfn = fpfn + win;
+ } else if (pfn == fpfn + 1) {
+ lpfn = fpfn - win + 1;
+ rpfn = fpfn + 1;
+ } else {
+ unsigned int left = (win - 1) / 2;
+
+ lpfn = fpfn - left;
+ rpfn = fpfn + win - left;
}
+ start = max3(lpfn, PFN_DOWN(vma->vm_start),
+ PFN_DOWN(faddr & PMD_MASK));
+ end = min3(rpfn, PFN_DOWN(vma->vm_end),
+ PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));
+
ra_info->nr_pte = end - start;
ra_info->offset = fpfn - start;
pte -= ra_info->offset;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 4fa440e87cd6..af151679d13a 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1836,13 +1836,13 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
pte_t *pte;
struct swap_info_struct *si;
int ret = 0;
- volatile unsigned char *swap_map;
si = swap_info[type];
pte = pte_offset_map(pmd, addr);
do {
struct folio *folio;
unsigned long offset;
+ unsigned char swp_count;
if (!is_swap_pte(*pte))
continue;
@@ -1853,7 +1853,6 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
offset = swp_offset(entry);
pte_unmap(pte);
- swap_map = &si->swap_map[offset];
folio = swap_cache_get_folio(entry, vma, addr);
if (!folio) {
struct page *page;
@@ -1870,8 +1869,10 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
folio = page_folio(page);
}
if (!folio) {
- if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD)
+ swp_count = READ_ONCE(si->swap_map[offset]);
+ if (swp_count == 0 || swp_count == SWAP_MAP_BAD)
goto try_next;
+
return -ENOMEM;
}
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 0499907b6f1a..53c3d916ff66 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -74,24 +74,10 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
_dst_pte = pte_mkdirty(_dst_pte);
if (page_in_cache && !vm_shared)
writable = false;
-
- /*
- * Always mark a PTE as write-protected when needed, regardless of
- * VM_WRITE, which the user might change.
- */
- if (wp_copy) {
- _dst_pte = pte_mkuffd_wp(_dst_pte);
- writable = false;
- }
-
if (writable)
_dst_pte = pte_mkwrite(_dst_pte);
- else
- /*
- * We need this to make sure write bit removed; as mk_pte()
- * could return a pte with write bit set.
- */
- _dst_pte = pte_wrprotect(_dst_pte);
+ if (wp_copy)
+ _dst_pte = pte_mkuffd_wp(_dst_pte);
dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
@@ -724,21 +710,31 @@ ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start,
mmap_changing, 0);
}
-void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma,
+long uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma,
unsigned long start, unsigned long len, bool enable_wp)
{
+ unsigned int mm_cp_flags;
struct mmu_gather tlb;
- pgprot_t newprot;
+ long ret;
if (enable_wp)
- newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE));
+ mm_cp_flags = MM_CP_UFFD_WP;
else
- newprot = vm_get_page_prot(dst_vma->vm_flags);
+ mm_cp_flags = MM_CP_UFFD_WP_RESOLVE;
+ /*
+ * vma->vm_page_prot already reflects that uffd-wp is enabled for this
+ * VMA (see userfaultfd_set_vm_flags()) and that all PTEs are supposed
+ * to be write-protected as default whenever protection changes.
+ * Try upgrading write permissions manually.
+ */
+ if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma))
+ mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
tlb_gather_mmu(&tlb, dst_mm);
- change_protection(&tlb, dst_vma, start, start + len, newprot,
- enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE);
+ ret = change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags);
tlb_finish_mmu(&tlb);
+
+ return ret;
}
int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
@@ -747,7 +743,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
{
struct vm_area_struct *dst_vma;
unsigned long page_mask;
- int err;
+ long err;
/*
* Sanitize the command parameters:
@@ -786,9 +782,12 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
goto out_unlock;
}
- uffd_wp_range(dst_mm, dst_vma, start, len, enable_wp);
+ err = uffd_wp_range(dst_mm, dst_vma, start, len, enable_wp);
+
+ /* Return 0 on success, <0 on failures */
+ if (err > 0)
+ err = 0;
- err = 0;
out_unlock:
mmap_read_unlock(dst_mm);
return err;
diff --git a/mm/util.c b/mm/util.c
index b56c92fb910f..cec9327b27b4 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -120,7 +120,8 @@ EXPORT_SYMBOL(kstrndup);
* @len: memory region length
* @gfp: GFP mask to use
*
- * Return: newly allocated copy of @src or %NULL in case of error
+ * Return: newly allocated copy of @src or %NULL in case of error,
+ * result is physically contiguous. Use kfree() to free.
*/
void *kmemdup(const void *src, size_t len, gfp_t gfp)
{
@@ -134,6 +135,27 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp)
EXPORT_SYMBOL(kmemdup);
/**
+ * kvmemdup - duplicate region of memory
+ *
+ * @src: memory region to duplicate
+ * @len: memory region length
+ * @gfp: GFP mask to use
+ *
+ * Return: newly allocated copy of @src or %NULL in case of error,
+ * result may be not physically contiguous. Use kvfree() to free.
+ */
+void *kvmemdup(const void *src, size_t len, gfp_t gfp)
+{
+ void *p;
+
+ p = kvmalloc(len, gfp);
+ if (p)
+ memcpy(p, src, len);
+ return p;
+}
+EXPORT_SYMBOL(kvmemdup);
+
+/**
* kmemdup_nul - Create a NUL-terminated string from unterminated data
* @s: The data to stringify
* @len: The size of the data
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ca71de7c9d77..428e0bee5c9c 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1815,9 +1815,9 @@ static void drain_vmap_area_work(struct work_struct *work)
}
/*
- * Free a vmap area, caller ensuring that the area has been unmapped
- * and flush_cache_vunmap had been called for the correct range
- * previously.
+ * Free a vmap area, caller ensuring that the area has been unmapped,
+ * unlinked and flush_cache_vunmap had been called for the correct
+ * range previously.
*/
static void free_vmap_area_noflush(struct vmap_area *va)
{
@@ -1825,9 +1825,8 @@ static void free_vmap_area_noflush(struct vmap_area *va)
unsigned long va_start = va->va_start;
unsigned long nr_lazy;
- spin_lock(&vmap_area_lock);
- unlink_va(va, &vmap_area_root);
- spin_unlock(&vmap_area_lock);
+ if (WARN_ON_ONCE(!list_empty(&va->list)))
+ return;
nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
PAGE_SHIFT, &vmap_lazy_nr);
@@ -1871,6 +1870,19 @@ struct vmap_area *find_vmap_area(unsigned long addr)
return va;
}
+static struct vmap_area *find_unlink_vmap_area(unsigned long addr)
+{
+ struct vmap_area *va;
+
+ spin_lock(&vmap_area_lock);
+ va = __find_vmap_area(addr, &vmap_area_root);
+ if (va)
+ unlink_va(va, &vmap_area_root);
+ spin_unlock(&vmap_area_lock);
+
+ return va;
+}
+
/*** Per cpu kva allocator ***/
/*
@@ -2015,6 +2027,10 @@ static void free_vmap_block(struct vmap_block *vb)
tmp = xa_erase(&vmap_blocks, addr_to_vb_idx(vb->va->va_start));
BUG_ON(tmp != vb);
+ spin_lock(&vmap_area_lock);
+ unlink_va(vb->va, &vmap_area_root);
+ spin_unlock(&vmap_area_lock);
+
free_vmap_area_noflush(vb->va);
kfree_rcu(vb, rcu_head);
}
@@ -2236,8 +2252,10 @@ void vm_unmap_ram(const void *mem, unsigned int count)
return;
}
- va = find_vmap_area(addr);
- BUG_ON(!va);
+ va = find_unlink_vmap_area(addr);
+ if (WARN_ON_ONCE(!va))
+ return;
+
debug_check_no_locks_freed((void *)va->va_start,
(va->va_end - va->va_start));
free_unmap_vmap_area(va);
@@ -2591,6 +2609,20 @@ struct vm_struct *find_vm_area(const void *addr)
return va->vm;
}
+static struct vm_struct *__remove_vm_area(struct vmap_area *va)
+{
+ struct vm_struct *vm;
+
+ if (!va || !va->vm)
+ return NULL;
+
+ vm = va->vm;
+ kasan_free_module_shadow(vm);
+ free_unmap_vmap_area(va);
+
+ return vm;
+}
+
/**
* remove_vm_area - find and remove a continuous kernel virtual area
* @addr: base address
@@ -2603,26 +2635,10 @@ struct vm_struct *find_vm_area(const void *addr)
*/
struct vm_struct *remove_vm_area(const void *addr)
{
- struct vmap_area *va;
-
might_sleep();
- spin_lock(&vmap_area_lock);
- va = __find_vmap_area((unsigned long)addr, &vmap_area_root);
- if (va && va->vm) {
- struct vm_struct *vm = va->vm;
-
- va->vm = NULL;
- spin_unlock(&vmap_area_lock);
-
- kasan_free_module_shadow(vm);
- free_unmap_vmap_area(va);
-
- return vm;
- }
-
- spin_unlock(&vmap_area_lock);
- return NULL;
+ return __remove_vm_area(
+ find_unlink_vmap_area((unsigned long) addr));
}
static inline void set_area_direct_map(const struct vm_struct *area,
@@ -2636,16 +2652,17 @@ static inline void set_area_direct_map(const struct vm_struct *area,
set_direct_map(area->pages[i]);
}
-/* Handle removing and resetting vm mappings related to the vm_struct. */
-static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
+/* Handle removing and resetting vm mappings related to the VA's vm_struct. */
+static void va_remove_mappings(struct vmap_area *va, int deallocate_pages)
{
+ struct vm_struct *area = va->vm;
unsigned long start = ULONG_MAX, end = 0;
unsigned int page_order = vm_area_page_order(area);
int flush_reset = area->flags & VM_FLUSH_RESET_PERMS;
int flush_dmap = 0;
int i;
- remove_vm_area(area->addr);
+ __remove_vm_area(va);
/* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */
if (!flush_reset)
@@ -2690,6 +2707,7 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
static void __vunmap(const void *addr, int deallocate_pages)
{
struct vm_struct *area;
+ struct vmap_area *va;
if (!addr)
return;
@@ -2698,19 +2716,20 @@ static void __vunmap(const void *addr, int deallocate_pages)
addr))
return;
- area = find_vm_area(addr);
- if (unlikely(!area)) {
+ va = find_unlink_vmap_area((unsigned long)addr);
+ if (unlikely(!va)) {
WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
addr);
return;
}
+ area = va->vm;
debug_check_no_locks_freed(area->addr, get_vm_area_size(area));
debug_check_no_obj_freed(area->addr, get_vm_area_size(area));
kasan_poison_vmalloc(area->addr, get_vm_area_size(area));
- vm_remove_mappings(area, deallocate_pages);
+ va_remove_mappings(va, deallocate_pages);
if (deallocate_pages) {
int i;
@@ -3031,7 +3050,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
int ret;
array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
- gfp_mask |= __GFP_NOWARN;
+
if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
gfp_mask |= __GFP_HIGHMEM;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bf3eedf0209c..394ff4962cbc 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -55,6 +55,8 @@
#include <linux/ctype.h>
#include <linux/debugfs.h>
#include <linux/khugepaged.h>
+#include <linux/rculist_nulls.h>
+#include <linux/random.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -135,12 +137,6 @@ struct scan_control {
/* Always discard instead of demoting to lower tier memory */
unsigned int no_demotion:1;
-#ifdef CONFIG_LRU_GEN
- /* help kswapd make better choices among multiple memcgs */
- unsigned int memcgs_need_aging:1;
- unsigned long last_reclaimed;
-#endif
-
/* Allocation order */
s8 order;
@@ -449,6 +445,11 @@ static bool cgroup_reclaim(struct scan_control *sc)
return sc->target_mem_cgroup;
}
+static bool global_reclaim(struct scan_control *sc)
+{
+ return !sc->target_mem_cgroup || mem_cgroup_is_root(sc->target_mem_cgroup);
+}
+
/**
* writeback_throttling_sane - is the usual dirty throttling mechanism available?
* @sc: scan_control in question
@@ -499,6 +500,11 @@ static bool cgroup_reclaim(struct scan_control *sc)
return false;
}
+static bool global_reclaim(struct scan_control *sc)
+{
+ return true;
+}
+
static bool writeback_throttling_sane(struct scan_control *sc)
{
return true;
@@ -1920,7 +1926,7 @@ retry:
!test_bit(PGDAT_DIRTY, &pgdat->flags))) {
/*
* Immediately reclaim when written back.
- * Similar in principle to deactivate_page()
+ * Similar in principle to folio_deactivate()
* except we already have the folio isolated
* and know it's dirty
*/
@@ -3176,6 +3182,9 @@ DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS);
for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
+#define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS)
+#define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS)
+
static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
{
struct pglist_data *pgdat = NODE_DATA(nid);
@@ -3201,6 +3210,9 @@ static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+ if (!sc->may_swap)
+ return 0;
+
if (!can_demote(pgdat->node_id, sc) &&
mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
return 0;
@@ -3215,7 +3227,7 @@ static int get_nr_gens(struct lruvec *lruvec, int type)
static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
{
- /* see the comment on lru_gen_struct */
+ /* see the comment on lru_gen_folio */
return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS &&
get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) &&
get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
@@ -3615,7 +3627,7 @@ struct ctrl_pos {
static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
struct ctrl_pos *pos)
{
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
int hist = lru_hist_from_seq(lrugen->min_seq[type]);
pos->refaulted = lrugen->avg_refaulted[type][tier] +
@@ -3630,7 +3642,7 @@ static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
{
int hist, tier;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1;
unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1;
@@ -3707,7 +3719,7 @@ static int folio_update_gen(struct folio *folio, int gen)
static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
int type = folio_is_file_lru(folio);
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
@@ -3752,7 +3764,7 @@ static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio,
static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk)
{
int gen, type, zone;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
walk->batched = 0;
@@ -3785,7 +3797,10 @@ static int should_skip_vma(unsigned long start, unsigned long end, struct mm_wal
if (is_vm_hugetlb_page(vma))
return true;
- if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ | VM_RAND_READ))
+ if (!vma_has_recency(vma))
+ return true;
+
+ if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL))
return true;
if (vma == get_gate_vma(vma->vm_mm))
@@ -4230,7 +4245,7 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_
} while (err == -EAGAIN);
}
-static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat)
+static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat, bool force_alloc)
{
struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
@@ -4238,7 +4253,7 @@ static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat)
VM_WARN_ON_ONCE(walk);
walk = &pgdat->mm_walk;
- } else if (!pgdat && !walk) {
+ } else if (!walk && force_alloc) {
VM_WARN_ON_ONCE(current_is_kswapd());
walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
@@ -4266,7 +4281,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
{
int zone;
int remaining = MAX_LRU_BATCH;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
if (type == LRU_GEN_ANON && !can_swap)
@@ -4274,7 +4289,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
/* prevent cold/hot inversion if force_scan is true */
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
- struct list_head *head = &lrugen->lists[old_gen][type][zone];
+ struct list_head *head = &lrugen->folios[old_gen][type][zone];
while (!list_empty(head)) {
struct folio *folio = lru_to_folio(head);
@@ -4285,7 +4300,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
new_gen = folio_inc_gen(lruvec, folio, false);
- list_move_tail(&folio->lru, &lrugen->lists[new_gen][type][zone]);
+ list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]);
if (!--remaining)
return false;
@@ -4302,7 +4317,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
{
int gen, type, zone;
bool success = false;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
DEFINE_MIN_SEQ(lruvec);
VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
@@ -4313,7 +4328,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
gen = lru_gen_from_seq(min_seq[type]);
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
- if (!list_empty(&lrugen->lists[gen][type][zone]))
+ if (!list_empty(&lrugen->folios[gen][type][zone]))
goto next;
}
@@ -4323,7 +4338,7 @@ next:
;
}
- /* see the comment on lru_gen_struct */
+ /* see the comment on lru_gen_folio */
if (can_swap) {
min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]);
min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]);
@@ -4345,7 +4360,7 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)
{
int prev, next;
int type, zone;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
spin_lock_irq(&lruvec->lru_lock);
@@ -4403,7 +4418,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
bool success;
struct lru_gen_mm_walk *walk;
struct mm_struct *mm = NULL;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq));
@@ -4419,12 +4434,12 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
* handful of PTEs. Spreading the work out over a period of time usually
* is less efficient, but it avoids bursty page faults.
*/
- if (!force_scan && !(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) {
+ if (!arch_has_hw_pte_young() || !get_cap(LRU_GEN_MM_WALK)) {
success = iterate_mm_list_nowalk(lruvec, max_seq);
goto done;
}
- walk = set_mm_walk(NULL);
+ walk = set_mm_walk(NULL, true);
if (!walk) {
success = iterate_mm_list_nowalk(lruvec, max_seq);
goto done;
@@ -4447,8 +4462,7 @@ done:
if (sc->priority <= DEF_PRIORITY - 2)
wait_event_killable(lruvec->mm_state.wait,
max_seq < READ_ONCE(lrugen->max_seq));
-
- return max_seq < READ_ONCE(lrugen->max_seq);
+ return false;
}
VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
@@ -4461,97 +4475,52 @@ done:
return true;
}
-static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq,
- struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
+static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
{
int gen, type, zone;
- unsigned long old = 0;
- unsigned long young = 0;
unsigned long total = 0;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ bool can_swap = get_swappiness(lruvec, sc);
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ DEFINE_MAX_SEQ(lruvec);
+ DEFINE_MIN_SEQ(lruvec);
for (type = !can_swap; type < ANON_AND_FILE; type++) {
unsigned long seq;
for (seq = min_seq[type]; seq <= max_seq; seq++) {
- unsigned long size = 0;
-
gen = lru_gen_from_seq(seq);
for (zone = 0; zone < MAX_NR_ZONES; zone++)
- size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
-
- total += size;
- if (seq == max_seq)
- young += size;
- else if (seq + MIN_NR_GENS == max_seq)
- old += size;
+ total += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
}
}
- /* try to scrape all its memory if this memcg was deleted */
- *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
-
- /*
- * The aging tries to be lazy to reduce the overhead, while the eviction
- * stalls when the number of generations reaches MIN_NR_GENS. Hence, the
- * ideal number of generations is MIN_NR_GENS+1.
- */
- if (min_seq[!can_swap] + MIN_NR_GENS > max_seq)
- return true;
- if (min_seq[!can_swap] + MIN_NR_GENS < max_seq)
- return false;
-
- /*
- * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1)
- * of the total number of pages for each generation. A reasonable range
- * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The
- * aging cares about the upper bound of hot pages, while the eviction
- * cares about the lower bound of cold pages.
- */
- if (young * MIN_NR_GENS > total)
- return true;
- if (old * (MIN_NR_GENS + 2) < total)
- return true;
-
- return false;
+ /* whether the size is big enough to be helpful */
+ return mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
}
-static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl)
+static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc,
+ unsigned long min_ttl)
{
- bool need_aging;
- unsigned long nr_to_scan;
- int swappiness = get_swappiness(lruvec, sc);
+ int gen;
+ unsigned long birth;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
- DEFINE_MAX_SEQ(lruvec);
DEFINE_MIN_SEQ(lruvec);
- VM_WARN_ON_ONCE(sc->memcg_low_reclaim);
+ /* see the comment on lru_gen_folio */
+ gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
+ birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
- mem_cgroup_calculate_protection(NULL, memcg);
-
- if (mem_cgroup_below_min(NULL, memcg))
+ if (time_is_after_jiffies(birth + min_ttl))
return false;
- need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan);
-
- if (min_ttl) {
- int gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
- unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
-
- if (time_is_after_jiffies(birth + min_ttl))
- return false;
-
- /* the size is likely too small to be helpful */
- if (!nr_to_scan && sc->priority != DEF_PRIORITY)
- return false;
- }
+ if (!lruvec_is_sizable(lruvec, sc))
+ return false;
- if (need_aging)
- try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false);
+ mem_cgroup_calculate_protection(NULL, memcg);
- return true;
+ return !mem_cgroup_below_min(NULL, memcg);
}
/* to protect the working set of the last N jiffies */
@@ -4560,46 +4529,30 @@ static unsigned long lru_gen_min_ttl __read_mostly;
static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
{
struct mem_cgroup *memcg;
- bool success = false;
unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
VM_WARN_ON_ONCE(!current_is_kswapd());
- sc->last_reclaimed = sc->nr_reclaimed;
-
- /*
- * To reduce the chance of going into the aging path, which can be
- * costly, optimistically skip it if the flag below was cleared in the
- * eviction path. This improves the overall performance when multiple
- * memcgs are available.
- */
- if (!sc->memcgs_need_aging) {
- sc->memcgs_need_aging = true;
+ /* check the order to exclude compaction-induced reclaim */
+ if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY)
return;
- }
-
- set_mm_walk(pgdat);
memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
- if (age_lruvec(lruvec, sc, min_ttl))
- success = true;
+ if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) {
+ mem_cgroup_iter_break(NULL, memcg);
+ return;
+ }
cond_resched();
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
- clear_mm_walk();
-
- /* check the order to exclude compaction-induced reclaim */
- if (success || !min_ttl || sc->order)
- return;
-
/*
* The main goal is to OOM kill if every generation from all memcgs is
* younger than min_ttl. However, another possibility is all memcgs are
- * either below min or empty.
+ * either too small or below min.
*/
if (mutex_trylock(&oom_lock)) {
struct oom_control oc = {
@@ -4753,7 +4706,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx)
int delta = folio_nr_pages(folio);
int refs = folio_lru_refs(folio);
int tier = lru_tier_from_refs(refs);
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio);
@@ -4778,7 +4731,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx)
/* promoted */
if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
- list_move(&folio->lru, &lrugen->lists[gen][type][zone]);
+ list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
return true;
}
@@ -4787,7 +4740,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx)
int hist = lru_hist_from_seq(lrugen->min_seq[type]);
gen = folio_inc_gen(lruvec, folio, false);
- list_move_tail(&folio->lru, &lrugen->lists[gen][type][zone]);
+ list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
lrugen->protected[hist][type][tier - 1] + delta);
@@ -4799,7 +4752,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx)
if (folio_test_locked(folio) || folio_test_writeback(folio) ||
(type == LRU_GEN_FILE && folio_test_dirty(folio))) {
gen = folio_inc_gen(lruvec, folio, true);
- list_move(&folio->lru, &lrugen->lists[gen][type][zone]);
+ list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
return true;
}
@@ -4810,12 +4763,8 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca
{
bool success;
- /* unmapping inhibited */
- if (!sc->may_unmap && folio_mapped(folio))
- return false;
-
/* swapping inhibited */
- if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) &&
+ if (!(sc->gfp_mask & __GFP_IO) &&
(folio_test_dirty(folio) ||
(folio_test_anon(folio) && !folio_test_swapcache(folio))))
return false;
@@ -4853,7 +4802,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
int scanned = 0;
int isolated = 0;
int remaining = MAX_LRU_BATCH;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
VM_WARN_ON_ONCE(!list_empty(list));
@@ -4866,7 +4815,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
for (zone = sc->reclaim_idx; zone >= 0; zone--) {
LIST_HEAD(moved);
int skipped = 0;
- struct list_head *head = &lrugen->lists[gen][type][zone];
+ struct list_head *head = &lrugen->folios[gen][type][zone];
while (!list_empty(head)) {
struct folio *folio = lru_to_folio(head);
@@ -4912,9 +4861,8 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
__count_vm_events(PGSCAN_ANON + type, isolated);
/*
- * There might not be eligible pages due to reclaim_idx, may_unmap and
- * may_writepage. Check the remaining to prevent livelock if it's not
- * making progress.
+ * There might not be eligible folios due to reclaim_idx. Check the
+ * remaining to prevent livelock if it's not making progress.
*/
return isolated || !remaining ? scanned : 0;
}
@@ -5009,8 +4957,7 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw
return scanned;
}
-static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
- bool *need_swapping)
+static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
{
int type;
int scanned;
@@ -5099,162 +5046,399 @@ retry:
goto retry;
}
- if (need_swapping && type == LRU_GEN_ANON)
- *need_swapping = true;
-
return scanned;
}
+static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
+ struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
+{
+ int gen, type, zone;
+ unsigned long old = 0;
+ unsigned long young = 0;
+ unsigned long total = 0;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ DEFINE_MIN_SEQ(lruvec);
+
+ /* whether this lruvec is completely out of cold folios */
+ if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) {
+ *nr_to_scan = 0;
+ return true;
+ }
+
+ for (type = !can_swap; type < ANON_AND_FILE; type++) {
+ unsigned long seq;
+
+ for (seq = min_seq[type]; seq <= max_seq; seq++) {
+ unsigned long size = 0;
+
+ gen = lru_gen_from_seq(seq);
+
+ for (zone = 0; zone < MAX_NR_ZONES; zone++)
+ size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
+
+ total += size;
+ if (seq == max_seq)
+ young += size;
+ else if (seq + MIN_NR_GENS == max_seq)
+ old += size;
+ }
+ }
+
+ /* try to scrape all its memory if this memcg was deleted */
+ *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
+
+ /*
+ * The aging tries to be lazy to reduce the overhead, while the eviction
+ * stalls when the number of generations reaches MIN_NR_GENS. Hence, the
+ * ideal number of generations is MIN_NR_GENS+1.
+ */
+ if (min_seq[!can_swap] + MIN_NR_GENS < max_seq)
+ return false;
+
+ /*
+ * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1)
+ * of the total number of pages for each generation. A reasonable range
+ * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The
+ * aging cares about the upper bound of hot pages, while the eviction
+ * cares about the lower bound of cold pages.
+ */
+ if (young * MIN_NR_GENS > total)
+ return true;
+ if (old * (MIN_NR_GENS + 2) < total)
+ return true;
+
+ return false;
+}
+
/*
* For future optimizations:
* 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
* reclaim.
*/
-static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
- bool can_swap, bool *need_aging)
+static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap)
{
unsigned long nr_to_scan;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MAX_SEQ(lruvec);
- DEFINE_MIN_SEQ(lruvec);
- if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg) ||
- (mem_cgroup_below_low(sc->target_mem_cgroup, memcg) &&
- !sc->memcg_low_reclaim))
+ if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg))
return 0;
- *need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
- if (!*need_aging)
+ if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan))
return nr_to_scan;
/* skip the aging path at the default priority */
if (sc->priority == DEF_PRIORITY)
- goto done;
+ return nr_to_scan;
- /* leave the work to lru_gen_age_node() */
- if (current_is_kswapd())
- return 0;
+ /* skip this lruvec as it's low on cold folios */
+ return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0;
+}
- if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false))
- return nr_to_scan;
-done:
- return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
+static unsigned long get_nr_to_reclaim(struct scan_control *sc)
+{
+ /* don't abort memcg reclaim to ensure fairness */
+ if (!global_reclaim(sc))
+ return -1;
+
+ return max(sc->nr_to_reclaim, compact_gap(sc->order));
}
-static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq,
- struct scan_control *sc, bool need_swapping)
+static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
- int i;
- DEFINE_MAX_SEQ(lruvec);
+ long nr_to_scan;
+ unsigned long scanned = 0;
+ unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
+ int swappiness = get_swappiness(lruvec, sc);
- if (!current_is_kswapd()) {
- /* age each memcg at most once to ensure fairness */
- if (max_seq - seq > 1)
- return true;
+ /* clean file folios are more likely to exist */
+ if (swappiness && !(sc->gfp_mask & __GFP_IO))
+ swappiness = 1;
- /* over-swapping can increase allocation latency */
- if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping)
- return true;
+ while (true) {
+ int delta;
- /* give this thread a chance to exit and free its memory */
- if (fatal_signal_pending(current)) {
- sc->nr_reclaimed += MIN_LRU_BATCH;
- return true;
- }
+ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
+ if (nr_to_scan <= 0)
+ break;
- if (cgroup_reclaim(sc))
- return false;
- } else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim)
- return false;
+ delta = evict_folios(lruvec, sc, swappiness);
+ if (!delta)
+ break;
- /* keep scanning at low priorities to ensure fairness */
- if (sc->priority > DEF_PRIORITY - 2)
- return false;
+ scanned += delta;
+ if (scanned >= nr_to_scan)
+ break;
- /*
- * A minimum amount of work was done under global memory pressure. For
- * kswapd, it may be overshooting. For direct reclaim, the allocation
- * may succeed if all suitable zones are somewhat safe. In either case,
- * it's better to stop now, and restart later if necessary.
- */
- for (i = 0; i <= sc->reclaim_idx; i++) {
- unsigned long wmark;
- struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i;
+ if (sc->nr_reclaimed >= nr_to_reclaim)
+ break;
- if (!managed_zone(zone))
+ cond_resched();
+ }
+
+ /* whether try_to_inc_max_seq() was successful */
+ return nr_to_scan < 0;
+}
+
+static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
+{
+ bool success;
+ unsigned long scanned = sc->nr_scanned;
+ unsigned long reclaimed = sc->nr_reclaimed;
+ int seg = lru_gen_memcg_seg(lruvec);
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+ /* see the comment on MEMCG_NR_GENS */
+ if (!lruvec_is_sizable(lruvec, sc))
+ return seg != MEMCG_LRU_TAIL ? MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG;
+
+ mem_cgroup_calculate_protection(NULL, memcg);
+
+ if (mem_cgroup_below_min(NULL, memcg))
+ return MEMCG_LRU_YOUNG;
+
+ if (mem_cgroup_below_low(NULL, memcg)) {
+ /* see the comment on MEMCG_NR_GENS */
+ if (seg != MEMCG_LRU_TAIL)
+ return MEMCG_LRU_TAIL;
+
+ memcg_memory_event(memcg, MEMCG_LOW);
+ }
+
+ success = try_to_shrink_lruvec(lruvec, sc);
+
+ shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority);
+
+ if (!sc->proactive)
+ vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned,
+ sc->nr_reclaimed - reclaimed);
+
+ sc->nr_reclaimed += current->reclaim_state->reclaimed_slab;
+ current->reclaim_state->reclaimed_slab = 0;
+
+ return success ? MEMCG_LRU_YOUNG : 0;
+}
+
+#ifdef CONFIG_MEMCG
+
+static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ int gen;
+ int bin;
+ int first_bin;
+ struct lruvec *lruvec;
+ struct lru_gen_folio *lrugen;
+ const struct hlist_nulls_node *pos;
+ int op = 0;
+ struct mem_cgroup *memcg = NULL;
+ unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
+
+ bin = first_bin = get_random_u32_below(MEMCG_NR_BINS);
+restart:
+ gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq));
+
+ rcu_read_lock();
+
+ hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) {
+ if (op)
+ lru_gen_rotate_memcg(lruvec, op);
+
+ mem_cgroup_put(memcg);
+
+ lruvec = container_of(lrugen, struct lruvec, lrugen);
+ memcg = lruvec_memcg(lruvec);
+
+ if (!mem_cgroup_tryget(memcg)) {
+ op = 0;
+ memcg = NULL;
continue;
+ }
- wmark = current_is_kswapd() ? high_wmark_pages(zone) : low_wmark_pages(zone);
- if (wmark > zone_page_state(zone, NR_FREE_PAGES))
- return false;
+ rcu_read_unlock();
+
+ op = shrink_one(lruvec, sc);
+
+ if (sc->nr_reclaimed >= nr_to_reclaim)
+ goto success;
+
+ rcu_read_lock();
}
- sc->nr_reclaimed += MIN_LRU_BATCH;
+ rcu_read_unlock();
- return true;
+ /* restart if raced with lru_gen_rotate_memcg() */
+ if (gen != get_nulls_value(pos))
+ goto restart;
+
+ /* try the rest of the bins of the current generation */
+ bin = get_memcg_bin(bin + 1);
+ if (bin != first_bin)
+ goto restart;
+success:
+ if (op)
+ lru_gen_rotate_memcg(lruvec, op);
+
+ mem_cgroup_put(memcg);
}
static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
struct blk_plug plug;
- bool need_aging = false;
- bool need_swapping = false;
- unsigned long scanned = 0;
- unsigned long reclaimed = sc->nr_reclaimed;
- DEFINE_MAX_SEQ(lruvec);
+
+ VM_WARN_ON_ONCE(global_reclaim(sc));
+ VM_WARN_ON_ONCE(!sc->may_writepage || !sc->may_unmap);
lru_add_drain();
blk_start_plug(&plug);
- set_mm_walk(lruvec_pgdat(lruvec));
+ set_mm_walk(NULL, sc->proactive);
- while (true) {
- int delta;
- int swappiness;
- unsigned long nr_to_scan;
+ if (try_to_shrink_lruvec(lruvec, sc))
+ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG);
- if (sc->may_swap)
- swappiness = get_swappiness(lruvec, sc);
- else if (!cgroup_reclaim(sc) && get_swappiness(lruvec, sc))
- swappiness = 1;
- else
- swappiness = 0;
+ clear_mm_walk();
- nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging);
- if (!nr_to_scan)
- goto done;
+ blk_finish_plug(&plug);
+}
- delta = evict_folios(lruvec, sc, swappiness, &need_swapping);
- if (!delta)
- goto done;
+#else /* !CONFIG_MEMCG */
- scanned += delta;
- if (scanned >= nr_to_scan)
- break;
+static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ BUILD_BUG();
+}
- if (should_abort_scan(lruvec, max_seq, sc, need_swapping))
- break;
+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+{
+ BUILD_BUG();
+}
- cond_resched();
- }
+#endif
+
+static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ int priority;
+ unsigned long reclaimable;
+ struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
+
+ if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
+ return;
+ /*
+ * Determine the initial priority based on ((total / MEMCG_NR_GENS) >>
+ * priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, where the
+ * estimated reclaimed_to_scanned_ratio = inactive / total.
+ */
+ reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
+ if (get_swappiness(lruvec, sc))
+ reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
+
+ reclaimable /= MEMCG_NR_GENS;
+
+ /* round down reclaimable and round up sc->nr_to_reclaim */
+ priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
+
+ sc->priority = clamp(priority, 0, DEF_PRIORITY);
+}
+
+static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ struct blk_plug plug;
+ unsigned long reclaimed = sc->nr_reclaimed;
+
+ VM_WARN_ON_ONCE(!global_reclaim(sc));
+
+ /*
+ * Unmapped clean folios are already prioritized. Scanning for more of
+ * them is likely futile and can cause high reclaim latency when there
+ * is a large number of memcgs.
+ */
+ if (!sc->may_writepage || !sc->may_unmap)
+ goto done;
+
+ lru_add_drain();
+
+ blk_start_plug(&plug);
+
+ set_mm_walk(pgdat, sc->proactive);
+
+ set_initial_priority(pgdat, sc);
+
+ if (current_is_kswapd())
+ sc->nr_reclaimed = 0;
+
+ if (mem_cgroup_disabled())
+ shrink_one(&pgdat->__lruvec, sc);
+ else
+ shrink_many(pgdat, sc);
+
+ if (current_is_kswapd())
+ sc->nr_reclaimed += reclaimed;
- /* see the comment in lru_gen_age_node() */
- if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging)
- sc->memcgs_need_aging = false;
-done:
clear_mm_walk();
blk_finish_plug(&plug);
+done:
+ /* kswapd should never fail */
+ pgdat->kswapd_failures = 0;
}
+#ifdef CONFIG_MEMCG
+void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
+{
+ int seg;
+ int old, new;
+ int bin = get_random_u32_below(MEMCG_NR_BINS);
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+ spin_lock(&pgdat->memcg_lru.lock);
+
+ VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
+
+ seg = 0;
+ new = old = lruvec->lrugen.gen;
+
+ /* see the comment on MEMCG_NR_GENS */
+ if (op == MEMCG_LRU_HEAD)
+ seg = MEMCG_LRU_HEAD;
+ else if (op == MEMCG_LRU_TAIL)
+ seg = MEMCG_LRU_TAIL;
+ else if (op == MEMCG_LRU_OLD)
+ new = get_memcg_gen(pgdat->memcg_lru.seq);
+ else if (op == MEMCG_LRU_YOUNG)
+ new = get_memcg_gen(pgdat->memcg_lru.seq + 1);
+ else
+ VM_WARN_ON_ONCE(true);
+
+ hlist_nulls_del_rcu(&lruvec->lrugen.list);
+
+ if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD)
+ hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
+ else
+ hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
+
+ pgdat->memcg_lru.nr_memcgs[old]--;
+ pgdat->memcg_lru.nr_memcgs[new]++;
+
+ lruvec->lrugen.gen = new;
+ WRITE_ONCE(lruvec->lrugen.seg, seg);
+
+ if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq))
+ WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
+
+ spin_unlock(&pgdat->memcg_lru.lock);
+}
+#endif
+
/******************************************************************************
* state change
******************************************************************************/
static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
{
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
if (lrugen->enabled) {
enum lru_list lru;
@@ -5267,7 +5451,7 @@ static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
int gen, type, zone;
for_each_gen_type_zone(gen, type, zone) {
- if (!list_empty(&lrugen->lists[gen][type][zone]))
+ if (!list_empty(&lrugen->folios[gen][type][zone]))
return false;
}
}
@@ -5312,7 +5496,7 @@ static bool drain_evictable(struct lruvec *lruvec)
int remaining = MAX_LRU_BATCH;
for_each_gen_type_zone(gen, type, zone) {
- struct list_head *head = &lruvec->lrugen.lists[gen][type][zone];
+ struct list_head *head = &lruvec->lrugen.folios[gen][type][zone];
while (!list_empty(head)) {
bool success;
@@ -5533,7 +5717,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
int i;
int type, tier;
int hist = lru_hist_from_seq(seq);
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
for (tier = 0; tier < MAX_NR_TIERS; tier++) {
seq_printf(m, " %10d", tier);
@@ -5583,7 +5767,7 @@ static int lru_gen_seq_show(struct seq_file *m, void *v)
unsigned long seq;
bool full = !debugfs_real_fops(m->file)->write;
struct lruvec *lruvec = v;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
int nid = lruvec_pgdat(lruvec)->node_id;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MAX_SEQ(lruvec);
@@ -5680,7 +5864,7 @@ static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_co
if (sc->nr_reclaimed >= nr_to_reclaim)
return 0;
- if (!evict_folios(lruvec, sc, swappiness, NULL))
+ if (!evict_folios(lruvec, sc, swappiness))
return 0;
cond_resched();
@@ -5701,11 +5885,11 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
if (!mem_cgroup_disabled()) {
rcu_read_lock();
+
memcg = mem_cgroup_from_id(memcg_id);
-#ifdef CONFIG_MEMCG
- if (memcg && !css_tryget(&memcg->css))
+ if (!mem_cgroup_tryget(memcg))
memcg = NULL;
-#endif
+
rcu_read_unlock();
if (!memcg)
@@ -5765,7 +5949,7 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
set_task_reclaim_state(current, &sc.reclaim_state);
flags = memalloc_noreclaim_save();
blk_start_plug(&plug);
- if (!set_mm_walk(NULL)) {
+ if (!set_mm_walk(NULL, true)) {
err = -ENOMEM;
goto done;
}
@@ -5837,7 +6021,7 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
{
int i;
int gen, type, zone;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
lrugen->max_seq = MIN_NR_GENS + 1;
lrugen->enabled = lru_gen_enabled();
@@ -5846,13 +6030,26 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
lrugen->timestamps[i] = jiffies;
for_each_gen_type_zone(gen, type, zone)
- INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
+ INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]);
lruvec->mm_state.seq = MIN_NR_GENS;
init_waitqueue_head(&lruvec->mm_state.wait);
}
#ifdef CONFIG_MEMCG
+
+void lru_gen_init_pgdat(struct pglist_data *pgdat)
+{
+ int i, j;
+
+ spin_lock_init(&pgdat->memcg_lru.lock);
+
+ for (i = 0; i < MEMCG_NR_GENS; i++) {
+ for (j = 0; j < MEMCG_NR_BINS; j++)
+ INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i);
+ }
+}
+
void lru_gen_init_memcg(struct mem_cgroup *memcg)
{
INIT_LIST_HEAD(&memcg->mm_list.fifo);
@@ -5876,7 +6073,69 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg)
}
}
}
-#endif
+
+void lru_gen_online_memcg(struct mem_cgroup *memcg)
+{
+ int gen;
+ int nid;
+ int bin = get_random_u32_below(MEMCG_NR_BINS);
+
+ for_each_node(nid) {
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ spin_lock(&pgdat->memcg_lru.lock);
+
+ VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list));
+
+ gen = get_memcg_gen(pgdat->memcg_lru.seq);
+
+ hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]);
+ pgdat->memcg_lru.nr_memcgs[gen]++;
+
+ lruvec->lrugen.gen = gen;
+
+ spin_unlock(&pgdat->memcg_lru.lock);
+ }
+}
+
+void lru_gen_offline_memcg(struct mem_cgroup *memcg)
+{
+ int nid;
+
+ for_each_node(nid) {
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD);
+ }
+}
+
+void lru_gen_release_memcg(struct mem_cgroup *memcg)
+{
+ int gen;
+ int nid;
+
+ for_each_node(nid) {
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ spin_lock(&pgdat->memcg_lru.lock);
+
+ VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
+
+ gen = lruvec->lrugen.gen;
+
+ hlist_nulls_del_rcu(&lruvec->lrugen.list);
+ pgdat->memcg_lru.nr_memcgs[gen]--;
+
+ if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
+ WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
+
+ spin_unlock(&pgdat->memcg_lru.lock);
+ }
+}
+
+#endif /* CONFIG_MEMCG */
static int __init init_lru_gen(void)
{
@@ -5903,6 +6162,10 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
{
}
+static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
+{
+}
+
#endif /* CONFIG_LRU_GEN */
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
@@ -5916,7 +6179,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
bool proportional_reclaim;
struct blk_plug plug;
- if (lru_gen_enabled()) {
+ if (lru_gen_enabled() && !global_reclaim(sc)) {
lru_gen_shrink_lruvec(lruvec, sc);
return;
}
@@ -6159,6 +6422,11 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
struct lruvec *target_lruvec;
bool reclaimable = false;
+ if (lru_gen_enabled() && global_reclaim(sc)) {
+ lru_gen_shrink_node(pgdat, sc);
+ return;
+ }
+
target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
again:
diff --git a/mm/workingset.c b/mm/workingset.c
index 1a86645b7b3c..f194d13beabb 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -223,7 +223,7 @@ static void *lru_gen_eviction(struct folio *folio)
unsigned long token;
unsigned long min_seq;
struct lruvec *lruvec;
- struct lru_gen_struct *lrugen;
+ struct lru_gen_folio *lrugen;
int type = folio_is_file_lru(folio);
int delta = folio_nr_pages(folio);
int refs = folio_lru_refs(folio);
@@ -252,7 +252,7 @@ static void lru_gen_refault(struct folio *folio, void *shadow)
unsigned long token;
unsigned long min_seq;
struct lruvec *lruvec;
- struct lru_gen_struct *lrugen;
+ struct lru_gen_folio *lrugen;
struct mem_cgroup *memcg;
struct pglist_data *pgdat;
int type = folio_is_file_lru(folio);
@@ -457,6 +457,7 @@ void workingset_refault(struct folio *folio, void *shadow)
*/
nr = folio_nr_pages(folio);
memcg = folio_memcg(folio);
+ pgdat = folio_pgdat(folio);
lruvec = mem_cgroup_lruvec(memcg, pgdat);
mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
@@ -474,7 +475,7 @@ void workingset_refault(struct folio *folio, void *shadow)
workingset_size += lruvec_page_state(eviction_lruvec,
NR_INACTIVE_FILE);
}
- if (mem_cgroup_get_nr_swap_pages(memcg) > 0) {
+ if (mem_cgroup_get_nr_swap_pages(eviction_memcg) > 0) {
workingset_size += lruvec_page_state(eviction_lruvec,
NR_ACTIVE_ANON);
if (file) {
diff --git a/mm/z3fold.c b/mm/z3fold.c
index a4de0c317ac7..0cef845d397b 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -1450,7 +1450,6 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
struct z3fold_header *zhdr;
struct z3fold_pool *pool;
- VM_BUG_ON_PAGE(!PageMovable(page), page);
VM_BUG_ON_PAGE(PageIsolated(page), page);
if (test_bit(PAGE_HEADLESS, &page->private))
@@ -1490,7 +1489,6 @@ static int z3fold_page_migrate(struct page *newpage, struct page *page,
struct z3fold_header *zhdr, *new_zhdr;
struct z3fold_pool *pool;
- VM_BUG_ON_PAGE(!PageMovable(page), page);
VM_BUG_ON_PAGE(!PageIsolated(page), page);
VM_BUG_ON_PAGE(!test_bit(PAGE_CLAIMED, &page->private), page);
VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 702bc3fd687a..9d27d9b00bce 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -2056,7 +2056,6 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
* Page is locked so zspage couldn't be destroyed. For detail, look at
* lock_zspage in free_zspage.
*/
- VM_BUG_ON_PAGE(!PageMovable(page), page);
VM_BUG_ON_PAGE(PageIsolated(page), page);
zspage = get_zspage(page);
@@ -2088,7 +2087,6 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
if (mode == MIGRATE_SYNC_NO_COPY)
return -EINVAL;
- VM_BUG_ON_PAGE(!PageMovable(page), page);
VM_BUG_ON_PAGE(!PageIsolated(page), page);
/* The page is locked, so this pointer must remain valid */
@@ -2153,7 +2151,6 @@ static void zs_page_putback(struct page *page)
{
struct zspage *zspage;
- VM_BUG_ON_PAGE(!PageMovable(page), page);
VM_BUG_ON_PAGE(!PageIsolated(page), page);
zspage = get_zspage(page);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c567d5e8053e..f713c0422f0f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2092,7 +2092,7 @@ static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma,
maybe_zap_len = total_bytes_to_map - /* All bytes to map */
*length + /* Mapped or pending */
(pages_remaining * PAGE_SIZE); /* Failed map. */
- zap_page_range(vma, *address, maybe_zap_len);
+ zap_page_range_single(vma, *address, maybe_zap_len, NULL);
err = 0;
}
@@ -2100,7 +2100,7 @@ static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma,
unsigned long leftover_pages = pages_remaining;
int bytes_mapped;
- /* We called zap_page_range, try to reinsert. */
+ /* We called zap_page_range_single, try to reinsert. */
err = vm_insert_pages(vma, *address,
pending_pages,
&pages_remaining);
@@ -2234,7 +2234,8 @@ static int tcp_zerocopy_receive(struct sock *sk,
total_bytes_to_map = avail_len & ~(PAGE_SIZE - 1);
if (total_bytes_to_map) {
if (!(zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT))
- zap_page_range(vma, address, total_bytes_to_map);
+ zap_page_range_single(vma, address, total_bytes_to_map,
+ NULL);
zc->length = total_bytes_to_map;
zc->recv_skip_hint = 0;
} else {
diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c
index 66915653108c..5e9949832af6 100644
--- a/security/apparmor/policy_unpack.c
+++ b/security/apparmor/policy_unpack.c
@@ -161,15 +161,6 @@ VISIBLE_IF_KUNIT bool aa_inbounds(struct aa_ext *e, size_t size)
}
EXPORT_SYMBOL_IF_KUNIT(aa_inbounds);
-static void *kvmemdup(const void *src, size_t len)
-{
- void *p = kvmalloc(len, GFP_KERNEL);
-
- if (p)
- memcpy(p, src, len);
- return p;
-}
-
/**
* aa_unpack_u16_chunk - test and do bounds checking for a u16 size based chunk
* @e: serialized data read head (NOT NULL)
@@ -1027,7 +1018,7 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name)
data->key = key;
data->size = aa_unpack_blob(e, &data->data, NULL);
- data->data = kvmemdup(data->data, data->size);
+ data->data = kvmemdup(data->data, data->size, GFP_KERNEL);
if (data->size && !data->data) {
kfree_sensitive(data->key);
kfree_sensitive(data);
diff --git a/tools/cgroup/memcg_shrinker.py b/tools/cgroup/memcg_shrinker.py
index 706ab27666a4..e81c3017ada9 100644
--- a/tools/cgroup/memcg_shrinker.py
+++ b/tools/cgroup/memcg_shrinker.py
@@ -5,7 +5,6 @@
import os
import argparse
-import sys
def scan_cgroups(cgroup_root):
@@ -44,7 +43,7 @@ def main():
cgroups = scan_cgroups("/sys/fs/cgroup/")
shrinkers = scan_shrinkers("/sys/kernel/debug/shrinker/")
- shrinkers = sorted(shrinkers, reverse = True, key = lambda x: x[0])
+ shrinkers.sort(reverse = True, key = lambda x: x[0])
n = 0
for s in shrinkers:
diff --git a/tools/vm/.gitignore b/tools/mm/.gitignore
index 922879f93fc8..922879f93fc8 100644
--- a/tools/vm/.gitignore
+++ b/tools/mm/.gitignore
diff --git a/tools/vm/Makefile b/tools/mm/Makefile
index 9860622cbb15..9860622cbb15 100644
--- a/tools/vm/Makefile
+++ b/tools/mm/Makefile
diff --git a/tools/vm/page-types.c b/tools/mm/page-types.c
index 381dcc00cb62..381dcc00cb62 100644
--- a/tools/vm/page-types.c
+++ b/tools/mm/page-types.c
diff --git a/tools/vm/page_owner_sort.c b/tools/mm/page_owner_sort.c
index ce860ab94162..7c2ac124cdc8 100644
--- a/tools/vm/page_owner_sort.c
+++ b/tools/mm/page_owner_sort.c
@@ -246,15 +246,16 @@ static int search_pattern(regex_t *pattern, char *pattern_str, char *buf)
return 0;
}
-static void check_regcomp(regex_t *pattern, const char *regex)
+static bool check_regcomp(regex_t *pattern, const char *regex)
{
int err;
err = regcomp(pattern, regex, REG_EXTENDED | REG_NEWLINE);
if (err != 0 || pattern->re_nsub != 1) {
fprintf(stderr, "Invalid pattern %s code %d\n", regex, err);
- exit(1);
+ return false;
}
+ return true;
}
static char **explode(char sep, const char *str, int *size)
@@ -494,28 +495,28 @@ static bool is_need(char *buf)
return true;
}
-static void add_list(char *buf, int len, char *ext_buf)
+static bool add_list(char *buf, int len, char *ext_buf)
{
if (list_size != 0 &&
len == list[list_size-1].len &&
memcmp(buf, list[list_size-1].txt, len) == 0) {
list[list_size-1].num++;
list[list_size-1].page_num += get_page_num(buf);
- return;
+ return true;
}
if (list_size == max_size) {
fprintf(stderr, "max_size too small??\n");
- exit(1);
+ return false;
}
if (!is_need(buf))
- return;
+ return true;
list[list_size].pid = get_pid(buf);
list[list_size].tgid = get_tgid(buf);
list[list_size].comm = get_comm(buf);
list[list_size].txt = malloc(len+1);
if (!list[list_size].txt) {
fprintf(stderr, "Out of memory\n");
- exit(1);
+ return false;
}
memcpy(list[list_size].txt, buf, len);
list[list_size].txt[len] = 0;
@@ -534,6 +535,7 @@ static void add_list(char *buf, int len, char *ext_buf)
printf("loaded %d\r", list_size);
fflush(stdout);
}
+ return true;
}
static bool parse_cull_args(const char *arg_str)
@@ -790,12 +792,19 @@ int main(int argc, char **argv)
exit(1);
}
- check_regcomp(&order_pattern, "order\\s*([0-9]*),");
- check_regcomp(&pid_pattern, "pid\\s*([0-9]*),");
- check_regcomp(&tgid_pattern, "tgid\\s*([0-9]*) ");
- check_regcomp(&comm_pattern, "tgid\\s*[0-9]*\\s*\\((.*)\\),\\s*ts");
- check_regcomp(&ts_nsec_pattern, "ts\\s*([0-9]*)\\s*ns,");
- check_regcomp(&free_ts_nsec_pattern, "free_ts\\s*([0-9]*)\\s*ns");
+ if (!check_regcomp(&order_pattern, "order\\s*([0-9]*),"))
+ goto out_order;
+ if (!check_regcomp(&pid_pattern, "pid\\s*([0-9]*),"))
+ goto out_pid;
+ if (!check_regcomp(&tgid_pattern, "tgid\\s*([0-9]*) "))
+ goto out_tgid;
+ if (!check_regcomp(&comm_pattern, "tgid\\s*[0-9]*\\s*\\((.*)\\),\\s*ts"))
+ goto out_comm;
+ if (!check_regcomp(&ts_nsec_pattern, "ts\\s*([0-9]*)\\s*ns,"))
+ goto out_ts;
+ if (!check_regcomp(&free_ts_nsec_pattern, "free_ts\\s*([0-9]*)\\s*ns"))
+ goto out_free_ts;
+
fstat(fileno(fin), &st);
max_size = st.st_size / 100; /* hack ... */
@@ -804,7 +813,7 @@ int main(int argc, char **argv)
ext_buf = malloc(BUF_SIZE);
if (!list || !buf || !ext_buf) {
fprintf(stderr, "Out of memory\n");
- exit(1);
+ goto out_free;
}
for ( ; ; ) {
@@ -812,7 +821,8 @@ int main(int argc, char **argv)
if (buf_len < 0)
break;
- add_list(buf, buf_len, ext_buf);
+ if (!add_list(buf, buf_len, ext_buf))
+ goto out_free;
}
printf("loaded %d\n", list_size);
@@ -862,11 +872,26 @@ int main(int argc, char **argv)
fprintf(fout, "\n");
}
}
- regfree(&order_pattern);
- regfree(&pid_pattern);
- regfree(&tgid_pattern);
- regfree(&comm_pattern);
- regfree(&ts_nsec_pattern);
+
+out_free:
+ if (ext_buf)
+ free(ext_buf);
+ if (buf)
+ free(buf);
+ if (list)
+ free(list);
+out_free_ts:
regfree(&free_ts_nsec_pattern);
+out_ts:
+ regfree(&ts_nsec_pattern);
+out_comm:
+ regfree(&comm_pattern);
+out_tgid:
+ regfree(&tgid_pattern);
+out_pid:
+ regfree(&pid_pattern);
+out_order:
+ regfree(&order_pattern);
+
return 0;
}
diff --git a/tools/vm/slabinfo-gnuplot.sh b/tools/mm/slabinfo-gnuplot.sh
index 873a892147e5..873a892147e5 100644
--- a/tools/vm/slabinfo-gnuplot.sh
+++ b/tools/mm/slabinfo-gnuplot.sh
diff --git a/tools/vm/slabinfo.c b/tools/mm/slabinfo.c
index cfaeaea71042..cfaeaea71042 100644
--- a/tools/vm/slabinfo.c
+++ b/tools/mm/slabinfo.c
diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c
index 81fa7ec2e66a..1f36bc1c5d36 100644
--- a/tools/testing/radix-tree/maple.c
+++ b/tools/testing/radix-tree/maple.c
@@ -173,11 +173,11 @@ static noinline void check_new_node(struct maple_tree *mt)
if (!MAPLE_32BIT) {
if (i >= 35)
- e = i - 35;
+ e = i - 34;
else if (i >= 5)
- e = i - 5;
+ e = i - 4;
else if (i >= 2)
- e = i - 2;
+ e = i - 1;
} else {
if (i >= 4)
e = i - 4;
@@ -305,17 +305,17 @@ static noinline void check_new_node(struct maple_tree *mt)
MT_BUG_ON(mt, mas.node != MA_ERROR(-ENOMEM));
MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL));
MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1);
- MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1);
+ MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS);
mn = mas_pop_node(&mas); /* get the next node. */
MT_BUG_ON(mt, mn == NULL);
MT_BUG_ON(mt, not_empty(mn));
MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS);
- MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 2);
+ MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1);
mas_push_node(&mas, mn);
MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1);
- MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1);
+ MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS);
/* Check the limit of pop/push/pop */
mas_node_count(&mas, MAPLE_ALLOC_SLOTS + 2); /* Request */
@@ -323,14 +323,14 @@ static noinline void check_new_node(struct maple_tree *mt)
MT_BUG_ON(mt, mas.node != MA_ERROR(-ENOMEM));
MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL));
MT_BUG_ON(mt, mas_alloc_req(&mas));
- MT_BUG_ON(mt, mas.alloc->node_count);
+ MT_BUG_ON(mt, mas.alloc->node_count != 1);
MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 2);
mn = mas_pop_node(&mas);
MT_BUG_ON(mt, not_empty(mn));
MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1);
- MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1);
+ MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS);
mas_push_node(&mas, mn);
- MT_BUG_ON(mt, mas.alloc->node_count);
+ MT_BUG_ON(mt, mas.alloc->node_count != 1);
MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 2);
mn = mas_pop_node(&mas);
MT_BUG_ON(mt, not_empty(mn));
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 41b649452560..56a29f2de8e6 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -85,7 +85,7 @@ TARGETS += tmpfs
TARGETS += tpm2
TARGETS += user
TARGETS += vDSO
-TARGETS += vm
+TARGETS += mm
TARGETS += x86
TARGETS += zram
#Please keep the TARGETS list alphabetically sorted
diff --git a/tools/testing/selftests/damon/sysfs.sh b/tools/testing/selftests/damon/sysfs.sh
index db4942383a50..a00336ffdcad 100644
--- a/tools/testing/selftests/damon/sysfs.sh
+++ b/tools/testing/selftests/damon/sysfs.sh
@@ -96,6 +96,34 @@ test_stats()
done
}
+test_filter()
+{
+ filter_dir=$1
+ ensure_file "$filter_dir/type" "exist" "600"
+ ensure_write_succ "$filter_dir/type" "anon" "valid input"
+ ensure_write_succ "$filter_dir/type" "memcg" "valid input"
+ ensure_write_fail "$filter_dir/type" "foo" "invalid input"
+ ensure_file "$filter_dir/matching" "exist" "600"
+ ensure_file "$filter_dir/memcg_path" "exist" "600"
+}
+
+test_filters()
+{
+ filters_dir=$1
+ ensure_dir "$filters_dir" "exist"
+ ensure_file "$filters_dir/nr_filters" "exist" "600"
+ ensure_write_succ "$filters_dir/nr_filters" "1" "valid input"
+ test_filter "$filters_dir/0"
+
+ ensure_write_succ "$filters_dir/nr_filters" "2" "valid input"
+ test_filter "$filters_dir/0"
+ test_filter "$filters_dir/1"
+
+ ensure_write_succ "$filters_dir/nr_filters" "0" "valid input"
+ ensure_dir "$filters_dir/0" "not_exist"
+ ensure_dir "$filters_dir/1" "not_exist"
+}
+
test_watermarks()
{
watermarks_dir=$1
@@ -143,6 +171,7 @@ test_scheme()
test_access_pattern "$scheme_dir/access_pattern"
test_quotas "$scheme_dir/quotas"
test_watermarks "$scheme_dir/watermarks"
+ test_filters "$scheme_dir/filters"
test_stats "$scheme_dir/stats"
test_tried_regions "$scheme_dir/tried_regions"
}
diff --git a/tools/testing/selftests/kselftest_deps.sh b/tools/testing/selftests/kselftest_deps.sh
index 7424a1f5babc..4bc14d9e8ff1 100755
--- a/tools/testing/selftests/kselftest_deps.sh
+++ b/tools/testing/selftests/kselftest_deps.sh
@@ -12,9 +12,9 @@ usage()
echo -e "Usage: $0 -[p] <compiler> [test_name]\n"
echo -e "\tkselftest_deps.sh [-p] gcc"
-echo -e "\tkselftest_deps.sh [-p] gcc vm"
+echo -e "\tkselftest_deps.sh [-p] gcc mm"
echo -e "\tkselftest_deps.sh [-p] aarch64-linux-gnu-gcc"
-echo -e "\tkselftest_deps.sh [-p] aarch64-linux-gnu-gcc vm\n"
+echo -e "\tkselftest_deps.sh [-p] aarch64-linux-gnu-gcc mm\n"
echo "- Should be run in selftests directory in the kernel repo."
echo "- Checks if Kselftests can be built/cross-built on a system."
echo "- Parses all test/sub-test Makefile to find library dependencies."
@@ -120,7 +120,7 @@ l1_tests=$(grep -r --include=Makefile "^LDLIBS" | \
# Level 2
# Some tests have multiple valid LDLIBS lines for individual sub-tests
# that need dependency checks. Find them and append them to the tests
-# e.g: vm/Makefile:$(OUTPUT)/userfaultfd: LDLIBS += -lpthread
+# e.g: mm/Makefile:$(OUTPUT)/userfaultfd: LDLIBS += -lpthread
# Filter out VAR_LDLIBS to discard the following:
# memfd/Makefile:$(OUTPUT)/fuse_mnt: LDLIBS += $(VAR_LDLIBS)
# Append space at the end of the list to append more tests.
diff --git a/tools/testing/selftests/memfd/fuse_test.c b/tools/testing/selftests/memfd/fuse_test.c
index be675002f918..93798c8c5d54 100644
--- a/tools/testing/selftests/memfd/fuse_test.c
+++ b/tools/testing/selftests/memfd/fuse_test.c
@@ -22,6 +22,7 @@
#include <linux/falloc.h>
#include <fcntl.h>
#include <linux/memfd.h>
+#include <linux/types.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c
index 94df2692e6e4..ae71f15f790d 100644
--- a/tools/testing/selftests/memfd/memfd_test.c
+++ b/tools/testing/selftests/memfd/memfd_test.c
@@ -28,12 +28,46 @@
#define MFD_DEF_SIZE 8192
#define STACK_SIZE 65536
+#define F_SEAL_EXEC 0x0020
+
+#define F_WX_SEALS (F_SEAL_SHRINK | \
+ F_SEAL_GROW | \
+ F_SEAL_WRITE | \
+ F_SEAL_FUTURE_WRITE | \
+ F_SEAL_EXEC)
+
+#define MFD_NOEXEC_SEAL 0x0008U
+
/*
* Default is not to test hugetlbfs
*/
static size_t mfd_def_size = MFD_DEF_SIZE;
static const char *memfd_str = MEMFD_STR;
+static ssize_t fd2name(int fd, char *buf, size_t bufsize)
+{
+ char buf1[PATH_MAX];
+ int size;
+ ssize_t nbytes;
+
+ size = snprintf(buf1, PATH_MAX, "/proc/self/fd/%d", fd);
+ if (size < 0) {
+ printf("snprintf(%d) failed on %m\n", fd);
+ abort();
+ }
+
+ /*
+ * reserver one byte for string termination.
+ */
+ nbytes = readlink(buf1, buf, bufsize-1);
+ if (nbytes == -1) {
+ printf("readlink(%s) failed %m\n", buf1);
+ abort();
+ }
+ buf[nbytes] = '\0';
+ return nbytes;
+}
+
static int mfd_assert_new(const char *name, loff_t sz, unsigned int flags)
{
int r, fd;
@@ -54,6 +88,37 @@ static int mfd_assert_new(const char *name, loff_t sz, unsigned int flags)
return fd;
}
+static void sysctl_assert_write(const char *val)
+{
+ int fd = open("/proc/sys/vm/memfd_noexec", O_WRONLY | O_CLOEXEC);
+
+ if (fd < 0) {
+ printf("open sysctl failed\n");
+ abort();
+ }
+
+ if (write(fd, val, strlen(val)) < 0) {
+ printf("write sysctl failed\n");
+ abort();
+ }
+}
+
+static void sysctl_fail_write(const char *val)
+{
+ int fd = open("/proc/sys/vm/memfd_noexec", O_WRONLY | O_CLOEXEC);
+
+ if (fd < 0) {
+ printf("open sysctl failed\n");
+ abort();
+ }
+
+ if (write(fd, val, strlen(val)) >= 0) {
+ printf("write sysctl %s succeeded, but failure expected\n",
+ val);
+ abort();
+ }
+}
+
static int mfd_assert_reopen_fd(int fd_in)
{
int fd;
@@ -98,11 +163,14 @@ static unsigned int mfd_assert_get_seals(int fd)
static void mfd_assert_has_seals(int fd, unsigned int seals)
{
+ char buf[PATH_MAX];
+ int nbytes;
unsigned int s;
+ fd2name(fd, buf, PATH_MAX);
s = mfd_assert_get_seals(fd);
if (s != seals) {
- printf("%u != %u = GET_SEALS(%d)\n", seals, s, fd);
+ printf("%u != %u = GET_SEALS(%s)\n", seals, s, buf);
abort();
}
}
@@ -594,6 +662,64 @@ static void mfd_fail_grow_write(int fd)
}
}
+static void mfd_assert_mode(int fd, int mode)
+{
+ struct stat st;
+ char buf[PATH_MAX];
+ int nbytes;
+
+ fd2name(fd, buf, PATH_MAX);
+
+ if (fstat(fd, &st) < 0) {
+ printf("fstat(%s) failed: %m\n", buf);
+ abort();
+ }
+
+ if ((st.st_mode & 07777) != mode) {
+ printf("fstat(%s) wrong file mode 0%04o, but expected 0%04o\n",
+ buf, (int)st.st_mode & 07777, mode);
+ abort();
+ }
+}
+
+static void mfd_assert_chmod(int fd, int mode)
+{
+ char buf[PATH_MAX];
+ int nbytes;
+
+ fd2name(fd, buf, PATH_MAX);
+
+ if (fchmod(fd, mode) < 0) {
+ printf("fchmod(%s, 0%04o) failed: %m\n", buf, mode);
+ abort();
+ }
+
+ mfd_assert_mode(fd, mode);
+}
+
+static void mfd_fail_chmod(int fd, int mode)
+{
+ struct stat st;
+ char buf[PATH_MAX];
+ int nbytes;
+
+ fd2name(fd, buf, PATH_MAX);
+
+ if (fstat(fd, &st) < 0) {
+ printf("fstat(%s) failed: %m\n", buf);
+ abort();
+ }
+
+ if (fchmod(fd, mode) == 0) {
+ printf("fchmod(%s, 0%04o) didn't fail as expected\n",
+ buf, mode);
+ abort();
+ }
+
+ /* verify that file mode bits did not change */
+ mfd_assert_mode(fd, st.st_mode & 07777);
+}
+
static int idle_thread_fn(void *arg)
{
sigset_t set;
@@ -671,6 +797,9 @@ static void test_create(void)
mfd_fail_new("", ~0);
mfd_fail_new("", 0x80000000U);
+ /* verify EXEC and NOEXEC_SEAL can't both be set */
+ mfd_fail_new("", MFD_EXEC | MFD_NOEXEC_SEAL);
+
/* verify MFD_CLOEXEC is allowed */
fd = mfd_assert_new("", 0, MFD_CLOEXEC);
close(fd);
@@ -881,6 +1010,211 @@ static void test_seal_resize(void)
}
/*
+ * Test SEAL_EXEC
+ * Test fd is created with exec and allow sealing.
+ * chmod() cannot change x bits after sealing.
+ */
+static void test_exec_seal(void)
+{
+ int fd;
+
+ printf("%s SEAL-EXEC\n", memfd_str);
+
+ printf("%s Apply SEAL_EXEC\n", memfd_str);
+ fd = mfd_assert_new("kern_memfd_seal_exec",
+ mfd_def_size,
+ MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_EXEC);
+
+ mfd_assert_mode(fd, 0777);
+ mfd_assert_chmod(fd, 0644);
+
+ mfd_assert_has_seals(fd, 0);
+ mfd_assert_add_seals(fd, F_SEAL_EXEC);
+ mfd_assert_has_seals(fd, F_SEAL_EXEC);
+
+ mfd_assert_chmod(fd, 0600);
+ mfd_fail_chmod(fd, 0777);
+ mfd_fail_chmod(fd, 0670);
+ mfd_fail_chmod(fd, 0605);
+ mfd_fail_chmod(fd, 0700);
+ mfd_fail_chmod(fd, 0100);
+ mfd_assert_chmod(fd, 0666);
+ mfd_assert_write(fd);
+ close(fd);
+
+ printf("%s Apply ALL_SEALS\n", memfd_str);
+ fd = mfd_assert_new("kern_memfd_seal_exec",
+ mfd_def_size,
+ MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_EXEC);
+
+ mfd_assert_mode(fd, 0777);
+ mfd_assert_chmod(fd, 0700);
+
+ mfd_assert_has_seals(fd, 0);
+ mfd_assert_add_seals(fd, F_SEAL_EXEC);
+ mfd_assert_has_seals(fd, F_WX_SEALS);
+
+ mfd_fail_chmod(fd, 0711);
+ mfd_fail_chmod(fd, 0600);
+ mfd_fail_write(fd);
+ close(fd);
+}
+
+/*
+ * Test EXEC_NO_SEAL
+ * Test fd is created with exec and not allow sealing.
+ */
+static void test_exec_no_seal(void)
+{
+ int fd;
+
+ printf("%s EXEC_NO_SEAL\n", memfd_str);
+
+ /* Create with EXEC but without ALLOW_SEALING */
+ fd = mfd_assert_new("kern_memfd_exec_no_sealing",
+ mfd_def_size,
+ MFD_CLOEXEC | MFD_EXEC);
+ mfd_assert_mode(fd, 0777);
+ mfd_assert_has_seals(fd, F_SEAL_SEAL);
+ mfd_assert_chmod(fd, 0666);
+ close(fd);
+}
+
+/*
+ * Test memfd_create with MFD_NOEXEC flag
+ */
+static void test_noexec_seal(void)
+{
+ int fd;
+
+ printf("%s NOEXEC_SEAL\n", memfd_str);
+
+ /* Create with NOEXEC and ALLOW_SEALING */
+ fd = mfd_assert_new("kern_memfd_noexec",
+ mfd_def_size,
+ MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_NOEXEC_SEAL);
+ mfd_assert_mode(fd, 0666);
+ mfd_assert_has_seals(fd, F_SEAL_EXEC);
+ mfd_fail_chmod(fd, 0777);
+ close(fd);
+
+ /* Create with NOEXEC but without ALLOW_SEALING */
+ fd = mfd_assert_new("kern_memfd_noexec",
+ mfd_def_size,
+ MFD_CLOEXEC | MFD_NOEXEC_SEAL);
+ mfd_assert_mode(fd, 0666);
+ mfd_assert_has_seals(fd, F_SEAL_EXEC);
+ mfd_fail_chmod(fd, 0777);
+ close(fd);
+}
+
+static void test_sysctl_child(void)
+{
+ int fd;
+
+ printf("%s sysctl 0\n", memfd_str);
+ sysctl_assert_write("0");
+ fd = mfd_assert_new("kern_memfd_sysctl_0",
+ mfd_def_size,
+ MFD_CLOEXEC | MFD_ALLOW_SEALING);
+
+ mfd_assert_mode(fd, 0777);
+ mfd_assert_has_seals(fd, 0);
+ mfd_assert_chmod(fd, 0644);
+ close(fd);
+
+ printf("%s sysctl 1\n", memfd_str);
+ sysctl_assert_write("1");
+ fd = mfd_assert_new("kern_memfd_sysctl_1",
+ mfd_def_size,
+ MFD_CLOEXEC | MFD_ALLOW_SEALING);
+
+ mfd_assert_mode(fd, 0666);
+ mfd_assert_has_seals(fd, F_SEAL_EXEC);
+ mfd_fail_chmod(fd, 0777);
+ sysctl_fail_write("0");
+ close(fd);
+
+ printf("%s sysctl 2\n", memfd_str);
+ sysctl_assert_write("2");
+ mfd_fail_new("kern_memfd_sysctl_2",
+ MFD_CLOEXEC | MFD_ALLOW_SEALING);
+ sysctl_fail_write("0");
+ sysctl_fail_write("1");
+}
+
+static int newpid_thread_fn(void *arg)
+{
+ test_sysctl_child();
+ return 0;
+}
+
+static void test_sysctl_child2(void)
+{
+ int fd;
+
+ sysctl_fail_write("0");
+ fd = mfd_assert_new("kern_memfd_sysctl_1",
+ mfd_def_size,
+ MFD_CLOEXEC | MFD_ALLOW_SEALING);
+
+ mfd_assert_mode(fd, 0666);
+ mfd_assert_has_seals(fd, F_SEAL_EXEC);
+ mfd_fail_chmod(fd, 0777);
+ close(fd);
+}
+
+static int newpid_thread_fn2(void *arg)
+{
+ test_sysctl_child2();
+ return 0;
+}
+static pid_t spawn_newpid_thread(unsigned int flags, int (*fn)(void *))
+{
+ uint8_t *stack;
+ pid_t pid;
+
+ stack = malloc(STACK_SIZE);
+ if (!stack) {
+ printf("malloc(STACK_SIZE) failed: %m\n");
+ abort();
+ }
+
+ pid = clone(fn,
+ stack + STACK_SIZE,
+ SIGCHLD | flags,
+ NULL);
+ if (pid < 0) {
+ printf("clone() failed: %m\n");
+ abort();
+ }
+
+ return pid;
+}
+
+static void join_newpid_thread(pid_t pid)
+{
+ waitpid(pid, NULL, 0);
+}
+
+/*
+ * Test sysctl
+ * A very basic sealing test to see whether setting/retrieving seals works.
+ */
+static void test_sysctl(void)
+{
+ int pid = spawn_newpid_thread(CLONE_NEWPID, newpid_thread_fn);
+
+ join_newpid_thread(pid);
+
+ printf("%s child ns\n", memfd_str);
+ sysctl_assert_write("1");
+
+ pid = spawn_newpid_thread(CLONE_NEWPID, newpid_thread_fn2);
+ join_newpid_thread(pid);
+}
+
+/*
* Test sharing via dup()
* Test that seals are shared between dupped FDs and they're all equal.
*/
@@ -1053,6 +1387,9 @@ int main(int argc, char **argv)
test_create();
test_basic();
+ test_exec_seal();
+ test_exec_no_seal();
+ test_noexec_seal();
test_seal_write();
test_seal_future_write();
@@ -1074,6 +1411,8 @@ int main(int argc, char **argv)
test_share_fork("SHARE-FORK", SHARED_FT_STR);
join_idle_thread(pid);
+ test_sysctl();
+
printf("memfd: DONE\n");
return 0;
diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/mm/.gitignore
index 1f8c36a9fa10..1f8c36a9fa10 100644
--- a/tools/testing/selftests/vm/.gitignore
+++ b/tools/testing/selftests/mm/.gitignore
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/mm/Makefile
index 89c14e41bd43..0a44d77f8437 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -1,11 +1,15 @@
# SPDX-License-Identifier: GPL-2.0
-# Makefile for vm selftests
+# Makefile for mm selftests
-LOCAL_HDRS += $(selfdir)/vm/local_config.h $(top_srcdir)/mm/gup_test.h
+LOCAL_HDRS += $(selfdir)/mm/local_config.h $(top_srcdir)/mm/gup_test.h
include local_config.mk
+ifeq ($(CROSS_COMPILE),)
uname_M := $(shell uname -m 2>/dev/null || echo not)
+else
+uname_M := $(shell echo $(CROSS_COMPILE) | grep -o '^[a-z0-9]\+')
+endif
MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/ppc64/')
# Without this, failed build products remain, with up-to-date timestamps,
diff --git a/tools/testing/selftests/vm/charge_reserved_hugetlb.sh b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh
index a5cb4b09a46c..a5cb4b09a46c 100644
--- a/tools/testing/selftests/vm/charge_reserved_hugetlb.sh
+++ b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh
diff --git a/tools/testing/selftests/vm/check_config.sh b/tools/testing/selftests/mm/check_config.sh
index bcba3af0acea..bcba3af0acea 100644
--- a/tools/testing/selftests/vm/check_config.sh
+++ b/tools/testing/selftests/mm/check_config.sh
diff --git a/tools/testing/selftests/vm/compaction_test.c b/tools/testing/selftests/mm/compaction_test.c
index 9b420140ba2b..9b420140ba2b 100644
--- a/tools/testing/selftests/vm/compaction_test.c
+++ b/tools/testing/selftests/mm/compaction_test.c
diff --git a/tools/testing/selftests/vm/config b/tools/testing/selftests/mm/config
index be087c4bc396..be087c4bc396 100644
--- a/tools/testing/selftests/vm/config
+++ b/tools/testing/selftests/mm/config
diff --git a/tools/testing/selftests/vm/cow.c b/tools/testing/selftests/mm/cow.c
index 26f6ea3079e2..0eb2e8180aa5 100644
--- a/tools/testing/selftests/vm/cow.c
+++ b/tools/testing/selftests/mm/cow.c
@@ -30,6 +30,13 @@
#include "../kselftest.h"
#include "vm_util.h"
+#ifndef MADV_PAGEOUT
+#define MADV_PAGEOUT 21
+#endif
+#ifndef MADV_COLLAPSE
+#define MADV_COLLAPSE 25
+#endif
+
static size_t pagesize;
static int pagemap_fd;
static size_t thpsize;
@@ -1178,6 +1185,228 @@ static int tests_per_anon_test_case(void)
return tests;
}
+enum anon_thp_collapse_test {
+ ANON_THP_COLLAPSE_UNSHARED,
+ ANON_THP_COLLAPSE_FULLY_SHARED,
+ ANON_THP_COLLAPSE_LOWER_SHARED,
+ ANON_THP_COLLAPSE_UPPER_SHARED,
+};
+
+static void do_test_anon_thp_collapse(char *mem, size_t size,
+ enum anon_thp_collapse_test test)
+{
+ struct comm_pipes comm_pipes;
+ char buf;
+ int ret;
+
+ ret = setup_comm_pipes(&comm_pipes);
+ if (ret) {
+ ksft_test_result_fail("pipe() failed\n");
+ return;
+ }
+
+ /*
+ * Trigger PTE-mapping the THP by temporarily mapping a single subpage
+ * R/O, such that we can try collapsing it later.
+ */
+ ret = mprotect(mem + pagesize, pagesize, PROT_READ);
+ if (ret) {
+ ksft_test_result_fail("mprotect() failed\n");
+ goto close_comm_pipes;
+ }
+ ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
+ if (ret) {
+ ksft_test_result_fail("mprotect() failed\n");
+ goto close_comm_pipes;
+ }
+
+ switch (test) {
+ case ANON_THP_COLLAPSE_UNSHARED:
+ /* Collapse before actually COW-sharing the page. */
+ ret = madvise(mem, size, MADV_COLLAPSE);
+ if (ret) {
+ ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
+ strerror(errno));
+ goto close_comm_pipes;
+ }
+ break;
+ case ANON_THP_COLLAPSE_FULLY_SHARED:
+ /* COW-share the full PTE-mapped THP. */
+ break;
+ case ANON_THP_COLLAPSE_LOWER_SHARED:
+ /* Don't COW-share the upper part of the THP. */
+ ret = madvise(mem + size / 2, size / 2, MADV_DONTFORK);
+ if (ret) {
+ ksft_test_result_fail("MADV_DONTFORK failed\n");
+ goto close_comm_pipes;
+ }
+ break;
+ case ANON_THP_COLLAPSE_UPPER_SHARED:
+ /* Don't COW-share the lower part of the THP. */
+ ret = madvise(mem, size / 2, MADV_DONTFORK);
+ if (ret) {
+ ksft_test_result_fail("MADV_DONTFORK failed\n");
+ goto close_comm_pipes;
+ }
+ break;
+ default:
+ assert(false);
+ }
+
+ ret = fork();
+ if (ret < 0) {
+ ksft_test_result_fail("fork() failed\n");
+ goto close_comm_pipes;
+ } else if (!ret) {
+ switch (test) {
+ case ANON_THP_COLLAPSE_UNSHARED:
+ case ANON_THP_COLLAPSE_FULLY_SHARED:
+ exit(child_memcmp_fn(mem, size, &comm_pipes));
+ break;
+ case ANON_THP_COLLAPSE_LOWER_SHARED:
+ exit(child_memcmp_fn(mem, size / 2, &comm_pipes));
+ break;
+ case ANON_THP_COLLAPSE_UPPER_SHARED:
+ exit(child_memcmp_fn(mem + size / 2, size / 2,
+ &comm_pipes));
+ break;
+ default:
+ assert(false);
+ }
+ }
+
+ while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
+ ;
+
+ switch (test) {
+ case ANON_THP_COLLAPSE_UNSHARED:
+ break;
+ case ANON_THP_COLLAPSE_UPPER_SHARED:
+ case ANON_THP_COLLAPSE_LOWER_SHARED:
+ /*
+ * Revert MADV_DONTFORK such that we merge the VMAs and are
+ * able to actually collapse.
+ */
+ ret = madvise(mem, size, MADV_DOFORK);
+ if (ret) {
+ ksft_test_result_fail("MADV_DOFORK failed\n");
+ write(comm_pipes.parent_ready[1], "0", 1);
+ wait(&ret);
+ goto close_comm_pipes;
+ }
+ /* FALLTHROUGH */
+ case ANON_THP_COLLAPSE_FULLY_SHARED:
+ /* Collapse before anyone modified the COW-shared page. */
+ ret = madvise(mem, size, MADV_COLLAPSE);
+ if (ret) {
+ ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
+ strerror(errno));
+ write(comm_pipes.parent_ready[1], "0", 1);
+ wait(&ret);
+ goto close_comm_pipes;
+ }
+ break;
+ default:
+ assert(false);
+ }
+
+ /* Modify the page. */
+ memset(mem, 0xff, size);
+ write(comm_pipes.parent_ready[1], "0", 1);
+
+ wait(&ret);
+ if (WIFEXITED(ret))
+ ret = WEXITSTATUS(ret);
+ else
+ ret = -EINVAL;
+
+ ksft_test_result(!ret, "No leak from parent into child\n");
+close_comm_pipes:
+ close_comm_pipes(&comm_pipes);
+}
+
+static void test_anon_thp_collapse_unshared(char *mem, size_t size)
+{
+ do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UNSHARED);
+}
+
+static void test_anon_thp_collapse_fully_shared(char *mem, size_t size)
+{
+ do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_FULLY_SHARED);
+}
+
+static void test_anon_thp_collapse_lower_shared(char *mem, size_t size)
+{
+ do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_LOWER_SHARED);
+}
+
+static void test_anon_thp_collapse_upper_shared(char *mem, size_t size)
+{
+ do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UPPER_SHARED);
+}
+
+/*
+ * Test cases that are specific to anonymous THP: pages in private mappings
+ * that may get shared via COW during fork().
+ */
+static const struct test_case anon_thp_test_cases[] = {
+ /*
+ * Basic COW test for fork() without any GUP when collapsing a THP
+ * before fork().
+ *
+ * Re-mapping a PTE-mapped anon THP using a single PMD ("in-place
+ * collapse") might easily get COW handling wrong when not collapsing
+ * exclusivity information properly.
+ */
+ {
+ "Basic COW after fork() when collapsing before fork()",
+ test_anon_thp_collapse_unshared,
+ },
+ /* Basic COW test, but collapse after COW-sharing a full THP. */
+ {
+ "Basic COW after fork() when collapsing after fork() (fully shared)",
+ test_anon_thp_collapse_fully_shared,
+ },
+ /*
+ * Basic COW test, but collapse after COW-sharing the lower half of a
+ * THP.
+ */
+ {
+ "Basic COW after fork() when collapsing after fork() (lower shared)",
+ test_anon_thp_collapse_lower_shared,
+ },
+ /*
+ * Basic COW test, but collapse after COW-sharing the upper half of a
+ * THP.
+ */
+ {
+ "Basic COW after fork() when collapsing after fork() (upper shared)",
+ test_anon_thp_collapse_upper_shared,
+ },
+};
+
+static void run_anon_thp_test_cases(void)
+{
+ int i;
+
+ if (!thpsize)
+ return;
+
+ ksft_print_msg("[INFO] Anonymous THP tests\n");
+
+ for (i = 0; i < ARRAY_SIZE(anon_thp_test_cases); i++) {
+ struct test_case const *test_case = &anon_thp_test_cases[i];
+
+ ksft_print_msg("[RUN] %s\n", test_case->desc);
+ do_run_with_thp(test_case->fn, THP_RUN_PMD);
+ }
+}
+
+static int tests_per_anon_thp_test_case(void)
+{
+ return thpsize ? 1 : 0;
+}
+
typedef void (*non_anon_test_fn)(char *mem, const char *smem, size_t size);
static void test_cow(char *mem, const char *smem, size_t size)
@@ -1518,6 +1747,7 @@ int main(int argc, char **argv)
ksft_print_header();
ksft_set_plan(ARRAY_SIZE(anon_test_cases) * tests_per_anon_test_case() +
+ ARRAY_SIZE(anon_thp_test_cases) * tests_per_anon_thp_test_case() +
ARRAY_SIZE(non_anon_test_cases) * tests_per_non_anon_test_case());
gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
@@ -1526,6 +1756,7 @@ int main(int argc, char **argv)
ksft_exit_fail_msg("opening pagemap failed\n");
run_anon_test_cases();
+ run_anon_thp_test_cases();
run_non_anon_test_cases();
err = ksft_get_fail_cnt();
diff --git a/tools/testing/selftests/vm/gup_test.c b/tools/testing/selftests/mm/gup_test.c
index e43879291dac..e43879291dac 100644
--- a/tools/testing/selftests/vm/gup_test.c
+++ b/tools/testing/selftests/mm/gup_test.c
diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/mm/hmm-tests.c
index 4adaad1b822f..4adaad1b822f 100644
--- a/tools/testing/selftests/vm/hmm-tests.c
+++ b/tools/testing/selftests/mm/hmm-tests.c
diff --git a/tools/testing/selftests/vm/hugepage-mmap.c b/tools/testing/selftests/mm/hugepage-mmap.c
index 955ef87f382c..955ef87f382c 100644
--- a/tools/testing/selftests/vm/hugepage-mmap.c
+++ b/tools/testing/selftests/mm/hugepage-mmap.c
diff --git a/tools/testing/selftests/vm/hugepage-mremap.c b/tools/testing/selftests/mm/hugepage-mremap.c
index e53b5eaa8fce..e53b5eaa8fce 100644
--- a/tools/testing/selftests/vm/hugepage-mremap.c
+++ b/tools/testing/selftests/mm/hugepage-mremap.c
diff --git a/tools/testing/selftests/vm/hugepage-shm.c b/tools/testing/selftests/mm/hugepage-shm.c
index e2527f32005b..e2527f32005b 100644
--- a/tools/testing/selftests/vm/hugepage-shm.c
+++ b/tools/testing/selftests/mm/hugepage-shm.c
diff --git a/tools/testing/selftests/vm/hugepage-vmemmap.c b/tools/testing/selftests/mm/hugepage-vmemmap.c
index 557bdbd4f87e..557bdbd4f87e 100644
--- a/tools/testing/selftests/vm/hugepage-vmemmap.c
+++ b/tools/testing/selftests/mm/hugepage-vmemmap.c
diff --git a/tools/testing/selftests/vm/hugetlb-madvise.c b/tools/testing/selftests/mm/hugetlb-madvise.c
index 9a127a8fe176..9a127a8fe176 100644
--- a/tools/testing/selftests/vm/hugetlb-madvise.c
+++ b/tools/testing/selftests/mm/hugetlb-madvise.c
diff --git a/tools/testing/selftests/vm/hugetlb_reparenting_test.sh b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh
index bf2d2a684edf..bf2d2a684edf 100644
--- a/tools/testing/selftests/vm/hugetlb_reparenting_test.sh
+++ b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh
diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/mm/khugepaged.c
index 64126c8cd561..64126c8cd561 100644
--- a/tools/testing/selftests/vm/khugepaged.c
+++ b/tools/testing/selftests/mm/khugepaged.c
diff --git a/tools/testing/selftests/vm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c
index b11b7e5115dc..d8b5b4930412 100644
--- a/tools/testing/selftests/vm/ksm_functional_tests.c
+++ b/tools/testing/selftests/mm/ksm_functional_tests.c
@@ -37,7 +37,7 @@ static bool range_maps_duplicates(char *addr, unsigned long size)
/*
* There is no easy way to check if there are KSM pages mapped into
* this range. We only check that the range does not map the same PFN
- * twice by comaring each pair of mapped pages.
+ * twice by comparing each pair of mapped pages.
*/
for (offs_a = 0; offs_a < size; offs_a += pagesize) {
pfn_a = pagemap_get_pfn(pagemap_fd, addr + offs_a);
diff --git a/tools/testing/selftests/vm/ksm_tests.c b/tools/testing/selftests/mm/ksm_tests.c
index f9eb4d67e0dd..f9eb4d67e0dd 100644
--- a/tools/testing/selftests/vm/ksm_tests.c
+++ b/tools/testing/selftests/mm/ksm_tests.c
diff --git a/tools/testing/selftests/vm/madv_populate.c b/tools/testing/selftests/mm/madv_populate.c
index 262eae6b58f2..262eae6b58f2 100644
--- a/tools/testing/selftests/vm/madv_populate.c
+++ b/tools/testing/selftests/mm/madv_populate.c
diff --git a/tools/testing/selftests/vm/map_fixed_noreplace.c b/tools/testing/selftests/mm/map_fixed_noreplace.c
index eed44322d1a6..eed44322d1a6 100644
--- a/tools/testing/selftests/vm/map_fixed_noreplace.c
+++ b/tools/testing/selftests/mm/map_fixed_noreplace.c
diff --git a/tools/testing/selftests/vm/map_hugetlb.c b/tools/testing/selftests/mm/map_hugetlb.c
index 312889edb84a..312889edb84a 100644
--- a/tools/testing/selftests/vm/map_hugetlb.c
+++ b/tools/testing/selftests/mm/map_hugetlb.c
diff --git a/tools/testing/selftests/vm/map_populate.c b/tools/testing/selftests/mm/map_populate.c
index 6b8aeaa0bf7a..6b8aeaa0bf7a 100644
--- a/tools/testing/selftests/vm/map_populate.c
+++ b/tools/testing/selftests/mm/map_populate.c
diff --git a/tools/testing/selftests/vm/memfd_secret.c b/tools/testing/selftests/mm/memfd_secret.c
index 957b9e18c729..957b9e18c729 100644
--- a/tools/testing/selftests/vm/memfd_secret.c
+++ b/tools/testing/selftests/mm/memfd_secret.c
diff --git a/tools/testing/selftests/vm/migration.c b/tools/testing/selftests/mm/migration.c
index 1cec8425e3ca..1cec8425e3ca 100644
--- a/tools/testing/selftests/vm/migration.c
+++ b/tools/testing/selftests/mm/migration.c
diff --git a/tools/testing/selftests/vm/mlock-random-test.c b/tools/testing/selftests/mm/mlock-random-test.c
index 782ea94dee2f..782ea94dee2f 100644
--- a/tools/testing/selftests/vm/mlock-random-test.c
+++ b/tools/testing/selftests/mm/mlock-random-test.c
diff --git a/tools/testing/selftests/vm/mlock2-tests.c b/tools/testing/selftests/mm/mlock2-tests.c
index 11b2301f3aa3..11b2301f3aa3 100644
--- a/tools/testing/selftests/vm/mlock2-tests.c
+++ b/tools/testing/selftests/mm/mlock2-tests.c
diff --git a/tools/testing/selftests/vm/mlock2.h b/tools/testing/selftests/mm/mlock2.h
index 2a6e76c226bc..2a6e76c226bc 100644
--- a/tools/testing/selftests/vm/mlock2.h
+++ b/tools/testing/selftests/mm/mlock2.h
diff --git a/tools/testing/selftests/vm/mrelease_test.c b/tools/testing/selftests/mm/mrelease_test.c
index 6c62966ab5db..6c62966ab5db 100644
--- a/tools/testing/selftests/vm/mrelease_test.c
+++ b/tools/testing/selftests/mm/mrelease_test.c
diff --git a/tools/testing/selftests/vm/mremap_dontunmap.c b/tools/testing/selftests/mm/mremap_dontunmap.c
index f01dc4a85b0b..f01dc4a85b0b 100644
--- a/tools/testing/selftests/vm/mremap_dontunmap.c
+++ b/tools/testing/selftests/mm/mremap_dontunmap.c
diff --git a/tools/testing/selftests/vm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c
index 9496346973d4..5c3773de9f0f 100644
--- a/tools/testing/selftests/vm/mremap_test.c
+++ b/tools/testing/selftests/mm/mremap_test.c
@@ -119,47 +119,109 @@ static unsigned long long get_mmap_min_addr(void)
}
/*
- * This test validates that merge is called when expanding a mapping.
- * Mapping containing three pages is created, middle page is unmapped
- * and then the mapping containing the first page is expanded so that
- * it fills the created hole. The two parts should merge creating
- * single mapping with three pages.
+ * Using /proc/self/maps, assert that the specified address range is contained
+ * within a single mapping.
*/
-static void mremap_expand_merge(unsigned long page_size)
+static bool is_range_mapped(FILE *maps_fp, void *start, void *end)
{
- char *test_name = "mremap expand merge";
- FILE *fp;
char *line = NULL;
size_t len = 0;
bool success = false;
- char *start = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE,
- MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-
- munmap(start + page_size, page_size);
- mremap(start, page_size, 2 * page_size, 0);
- fp = fopen("/proc/self/maps", "r");
- if (fp == NULL) {
- ksft_test_result_fail("%s\n", test_name);
- return;
- }
+ rewind(maps_fp);
- while (getline(&line, &len, fp) != -1) {
+ while (getline(&line, &len, maps_fp) != -1) {
char *first = strtok(line, "- ");
void *first_val = (void *)strtol(first, NULL, 16);
char *second = strtok(NULL, "- ");
void *second_val = (void *) strtol(second, NULL, 16);
- if (first_val == start && second_val == start + 3 * page_size) {
+ if (first_val <= start && second_val >= end) {
success = true;
break;
}
}
+
+ return success;
+}
+
+/*
+ * This test validates that merge is called when expanding a mapping.
+ * Mapping containing three pages is created, middle page is unmapped
+ * and then the mapping containing the first page is expanded so that
+ * it fills the created hole. The two parts should merge creating
+ * single mapping with three pages.
+ */
+static void mremap_expand_merge(FILE *maps_fp, unsigned long page_size)
+{
+ char *test_name = "mremap expand merge";
+ bool success = false;
+ char *remap, *start;
+
+ start = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+ if (start == MAP_FAILED) {
+ ksft_print_msg("mmap failed: %s\n", strerror(errno));
+ goto out;
+ }
+
+ munmap(start + page_size, page_size);
+ remap = mremap(start, page_size, 2 * page_size, 0);
+ if (remap == MAP_FAILED) {
+ ksft_print_msg("mremap failed: %s\n", strerror(errno));
+ munmap(start, page_size);
+ munmap(start + 2 * page_size, page_size);
+ goto out;
+ }
+
+ success = is_range_mapped(maps_fp, start, start + 3 * page_size);
+ munmap(start, 3 * page_size);
+
+out:
+ if (success)
+ ksft_test_result_pass("%s\n", test_name);
+ else
+ ksft_test_result_fail("%s\n", test_name);
+}
+
+/*
+ * Similar to mremap_expand_merge() except instead of removing the middle page,
+ * we remove the last then attempt to remap offset from the second page. This
+ * should result in the mapping being restored to its former state.
+ */
+static void mremap_expand_merge_offset(FILE *maps_fp, unsigned long page_size)
+{
+
+ char *test_name = "mremap expand merge offset";
+ bool success = false;
+ char *remap, *start;
+
+ start = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+ if (start == MAP_FAILED) {
+ ksft_print_msg("mmap failed: %s\n", strerror(errno));
+ goto out;
+ }
+
+ /* Unmap final page to ensure we have space to expand. */
+ munmap(start + 2 * page_size, page_size);
+ remap = mremap(start + page_size, page_size, 2 * page_size, 0);
+ if (remap == MAP_FAILED) {
+ ksft_print_msg("mremap failed: %s\n", strerror(errno));
+ munmap(start, 2 * page_size);
+ goto out;
+ }
+
+ success = is_range_mapped(maps_fp, start, start + 3 * page_size);
+ munmap(start, 3 * page_size);
+
+out:
if (success)
ksft_test_result_pass("%s\n", test_name);
else
ksft_test_result_fail("%s\n", test_name);
- fclose(fp);
}
/*
@@ -380,11 +442,12 @@ int main(int argc, char **argv)
int i, run_perf_tests;
unsigned int threshold_mb = VALIDATION_DEFAULT_THRESHOLD;
unsigned int pattern_seed;
- int num_expand_tests = 1;
+ int num_expand_tests = 2;
struct test test_cases[MAX_TEST];
struct test perf_test_cases[MAX_PERF_TEST];
int page_size;
time_t t;
+ FILE *maps_fp;
pattern_seed = (unsigned int) time(&t);
@@ -458,7 +521,17 @@ int main(int argc, char **argv)
run_mremap_test_case(test_cases[i], &failures, threshold_mb,
pattern_seed);
- mremap_expand_merge(page_size);
+ maps_fp = fopen("/proc/self/maps", "r");
+
+ if (maps_fp == NULL) {
+ ksft_print_msg("Failed to read /proc/self/maps: %s\n", strerror(errno));
+ exit(KSFT_FAIL);
+ }
+
+ mremap_expand_merge(maps_fp, page_size);
+ mremap_expand_merge_offset(maps_fp, page_size);
+
+ fclose(maps_fp);
if (run_perf_tests) {
ksft_print_msg("\n%s\n",
diff --git a/tools/testing/selftests/vm/on-fault-limit.c b/tools/testing/selftests/mm/on-fault-limit.c
index 634d87dfb2a4..634d87dfb2a4 100644
--- a/tools/testing/selftests/vm/on-fault-limit.c
+++ b/tools/testing/selftests/mm/on-fault-limit.c
diff --git a/tools/testing/selftests/vm/pkey-helpers.h b/tools/testing/selftests/mm/pkey-helpers.h
index 92f3be3dd8e5..92f3be3dd8e5 100644
--- a/tools/testing/selftests/vm/pkey-helpers.h
+++ b/tools/testing/selftests/mm/pkey-helpers.h
diff --git a/tools/testing/selftests/vm/pkey-powerpc.h b/tools/testing/selftests/mm/pkey-powerpc.h
index 1ebb586b2fbc..1ebb586b2fbc 100644
--- a/tools/testing/selftests/vm/pkey-powerpc.h
+++ b/tools/testing/selftests/mm/pkey-powerpc.h
diff --git a/tools/testing/selftests/vm/pkey-x86.h b/tools/testing/selftests/mm/pkey-x86.h
index 72c14cd3ddc7..72c14cd3ddc7 100644
--- a/tools/testing/selftests/vm/pkey-x86.h
+++ b/tools/testing/selftests/mm/pkey-x86.h
diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/mm/protection_keys.c
index 95f403a0c46d..95f403a0c46d 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/mm/protection_keys.c
diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh
index 8984e0bb58c7..8984e0bb58c7 100755..100644
--- a/tools/testing/selftests/vm/run_vmtests.sh
+++ b/tools/testing/selftests/mm/run_vmtests.sh
diff --git a/tools/testing/selftests/vm/settings b/tools/testing/selftests/mm/settings
index 9abfc60e9e6f..9abfc60e9e6f 100644
--- a/tools/testing/selftests/vm/settings
+++ b/tools/testing/selftests/mm/settings
diff --git a/tools/testing/selftests/vm/soft-dirty.c b/tools/testing/selftests/mm/soft-dirty.c
index 21d8830c5f24..21d8830c5f24 100644
--- a/tools/testing/selftests/vm/soft-dirty.c
+++ b/tools/testing/selftests/mm/soft-dirty.c
diff --git a/tools/testing/selftests/vm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c
index 76e1c36dd9e5..76e1c36dd9e5 100644
--- a/tools/testing/selftests/vm/split_huge_page_test.c
+++ b/tools/testing/selftests/mm/split_huge_page_test.c
diff --git a/tools/testing/selftests/vm/test_hmm.sh b/tools/testing/selftests/mm/test_hmm.sh
index 46e19b5d648d..46e19b5d648d 100755..100644
--- a/tools/testing/selftests/vm/test_hmm.sh
+++ b/tools/testing/selftests/mm/test_hmm.sh
diff --git a/tools/testing/selftests/vm/test_vmalloc.sh b/tools/testing/selftests/mm/test_vmalloc.sh
index d73b846736f1..d73b846736f1 100755..100644
--- a/tools/testing/selftests/vm/test_vmalloc.sh
+++ b/tools/testing/selftests/mm/test_vmalloc.sh
diff --git a/tools/testing/selftests/vm/thuge-gen.c b/tools/testing/selftests/mm/thuge-gen.c
index 361ef7192cc6..361ef7192cc6 100644
--- a/tools/testing/selftests/vm/thuge-gen.c
+++ b/tools/testing/selftests/mm/thuge-gen.c
diff --git a/tools/testing/selftests/vm/transhuge-stress.c b/tools/testing/selftests/mm/transhuge-stress.c
index e3f00adb1b82..e3f00adb1b82 100644
--- a/tools/testing/selftests/vm/transhuge-stress.c
+++ b/tools/testing/selftests/mm/transhuge-stress.c
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/mm/userfaultfd.c
index 7f22844ed704..7f22844ed704 100644
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/mm/userfaultfd.c
diff --git a/tools/testing/selftests/vm/util.h b/tools/testing/selftests/mm/util.h
index b27d26199334..b27d26199334 100644
--- a/tools/testing/selftests/vm/util.h
+++ b/tools/testing/selftests/mm/util.h
diff --git a/tools/testing/selftests/vm/va_128TBswitch.c b/tools/testing/selftests/mm/va_128TBswitch.c
index 1d2068989883..1d2068989883 100644
--- a/tools/testing/selftests/vm/va_128TBswitch.c
+++ b/tools/testing/selftests/mm/va_128TBswitch.c
diff --git a/tools/testing/selftests/vm/va_128TBswitch.sh b/tools/testing/selftests/mm/va_128TBswitch.sh
index 41580751dc51..41580751dc51 100755..100644
--- a/tools/testing/selftests/vm/va_128TBswitch.sh
+++ b/tools/testing/selftests/mm/va_128TBswitch.sh
diff --git a/tools/testing/selftests/vm/virtual_address_range.c b/tools/testing/selftests/mm/virtual_address_range.c
index c0592646ed93..c0592646ed93 100644
--- a/tools/testing/selftests/vm/virtual_address_range.c
+++ b/tools/testing/selftests/mm/virtual_address_range.c
diff --git a/tools/testing/selftests/vm/vm_util.c b/tools/testing/selftests/mm/vm_util.c
index 40e795624ff3..40e795624ff3 100644
--- a/tools/testing/selftests/vm/vm_util.c
+++ b/tools/testing/selftests/mm/vm_util.c
diff --git a/tools/testing/selftests/vm/vm_util.h b/tools/testing/selftests/mm/vm_util.h
index 1995ee911ef2..1995ee911ef2 100644
--- a/tools/testing/selftests/vm/vm_util.h
+++ b/tools/testing/selftests/mm/vm_util.h
diff --git a/tools/testing/selftests/vm/write_hugetlb_memory.sh b/tools/testing/selftests/mm/write_hugetlb_memory.sh
index 70a02301f4c2..70a02301f4c2 100644
--- a/tools/testing/selftests/vm/write_hugetlb_memory.sh
+++ b/tools/testing/selftests/mm/write_hugetlb_memory.sh
diff --git a/tools/testing/selftests/vm/write_to_hugetlbfs.c b/tools/testing/selftests/mm/write_to_hugetlbfs.c
index 6a2caba19ee1..6a2caba19ee1 100644
--- a/tools/testing/selftests/vm/write_to_hugetlbfs.c
+++ b/tools/testing/selftests/mm/write_to_hugetlbfs.c