-rw-r--r--  Documentation/admin-guide/blockdev/zram.rst | 2
-rw-r--r--  Documentation/admin-guide/cgroup-v1/memory.rst | 82
-rw-r--r--  Documentation/admin-guide/cgroup-v2.rst | 5
-rw-r--r--  Documentation/admin-guide/kernel-parameters.txt | 17
-rw-r--r--  Documentation/admin-guide/mm/transhuge.rst | 35
-rw-r--r--  Documentation/dev-tools/kasan.rst | 23
-rw-r--r--  Documentation/dev-tools/kmemleak.rst | 1
-rw-r--r--  Documentation/mm/allocation-profiling.rst | 7
-rw-r--r--  Documentation/mm/damon/index.rst | 6
-rw-r--r--  Documentation/mm/split_page_table_lock.rst | 6
-rw-r--r--  Documentation/translations/zh_CN/dev-tools/kasan.rst | 20
-rw-r--r--  Documentation/translations/zh_TW/dev-tools/kasan.rst | 21
-rw-r--r--  MAINTAINERS | 2
-rw-r--r--  arch/Kconfig | 8
-rw-r--r--  arch/alpha/include/asm/Kbuild | 1
-rw-r--r--  arch/alpha/include/asm/page.h | 2
-rw-r--r--  arch/alpha/include/uapi/asm/mman.h | 3
-rw-r--r--  arch/arc/include/asm/Kbuild | 1
-rw-r--r--  arch/arm/include/asm/text-patching.h (renamed from arch/arm/include/asm/patch.h) | 0
-rw-r--r--  arch/arm/kernel/ftrace.c | 2
-rw-r--r--  arch/arm/kernel/jump_label.c | 2
-rw-r--r--  arch/arm/kernel/kgdb.c | 2
-rw-r--r--  arch/arm/kernel/patch.c | 2
-rw-r--r--  arch/arm/mm/fault-armv.c | 53
-rw-r--r--  arch/arm/probes/kprobes/core.c | 2
-rw-r--r--  arch/arm/probes/kprobes/opt-arm.c | 2
-rw-r--r--  arch/arm64/include/asm/memory.h | 2
-rw-r--r--  arch/arm64/include/asm/set_memory.h | 1
-rw-r--r--  arch/arm64/include/asm/text-patching.h (renamed from arch/arm64/include/asm/patching.h) | 0
-rw-r--r--  arch/arm64/kernel/ftrace.c | 2
-rw-r--r--  arch/arm64/kernel/jump_label.c | 2
-rw-r--r--  arch/arm64/kernel/kgdb.c | 2
-rw-r--r--  arch/arm64/kernel/patching.c | 2
-rw-r--r--  arch/arm64/kernel/probes/kprobes.c | 2
-rw-r--r--  arch/arm64/kernel/traps.c | 2
-rw-r--r--  arch/arm64/mm/fault.c | 2
-rw-r--r--  arch/arm64/mm/pageattr.c | 16
-rw-r--r--  arch/arm64/net/bpf_jit_comp.c | 2
-rw-r--r--  arch/csky/include/asm/Kbuild | 1
-rw-r--r--  arch/hexagon/include/asm/Kbuild | 1
-rw-r--r--  arch/loongarch/include/asm/Kbuild | 1
-rw-r--r--  arch/loongarch/include/asm/hugetlb.h | 5
-rw-r--r--  arch/loongarch/include/asm/pgtable.h | 3
-rw-r--r--  arch/loongarch/include/asm/set_memory.h | 1
-rw-r--r--  arch/loongarch/mm/pageattr.c | 19
-rw-r--r--  arch/m68k/include/asm/Kbuild | 1
-rw-r--r--  arch/m68k/include/asm/page_no.h | 2
-rw-r--r--  arch/microblaze/include/asm/Kbuild | 1
-rw-r--r--  arch/mips/include/asm/Kbuild | 1
-rw-r--r--  arch/mips/include/asm/hugetlb.h | 5
-rw-r--r--  arch/mips/include/asm/pgtable-64.h | 2
-rw-r--r--  arch/mips/include/uapi/asm/mman.h | 3
-rw-r--r--  arch/nios2/include/asm/Kbuild | 1
-rw-r--r--  arch/openrisc/include/asm/Kbuild | 1
-rw-r--r--  arch/parisc/include/asm/hugetlb.h | 15
-rw-r--r--  arch/parisc/include/asm/text-patching.h (renamed from arch/parisc/include/asm/patch.h) | 0
-rw-r--r--  arch/parisc/include/uapi/asm/mman.h | 3
-rw-r--r--  arch/parisc/kernel/ftrace.c | 2
-rw-r--r--  arch/parisc/kernel/jump_label.c | 2
-rw-r--r--  arch/parisc/kernel/kgdb.c | 2
-rw-r--r--  arch/parisc/kernel/kprobes.c | 2
-rw-r--r--  arch/parisc/kernel/patch.c | 2
-rw-r--r--  arch/parisc/mm/hugetlbpage.c | 21
-rw-r--r--  arch/powerpc/include/asm/kprobes.h | 2
-rw-r--r--  arch/powerpc/include/asm/text-patching.h (renamed from arch/powerpc/include/asm/code-patching.h) | 0
-rw-r--r--  arch/powerpc/kernel/crash_dump.c | 2
-rw-r--r--  arch/powerpc/kernel/epapr_paravirt.c | 2
-rw-r--r--  arch/powerpc/kernel/jump_label.c | 2
-rw-r--r--  arch/powerpc/kernel/kgdb.c | 2
-rw-r--r--  arch/powerpc/kernel/kprobes.c | 2
-rw-r--r--  arch/powerpc/kernel/module_32.c | 2
-rw-r--r--  arch/powerpc/kernel/module_64.c | 2
-rw-r--r--  arch/powerpc/kernel/optprobes.c | 2
-rw-r--r--  arch/powerpc/kernel/process.c | 2
-rw-r--r--  arch/powerpc/kernel/security.c | 2
-rw-r--r--  arch/powerpc/kernel/setup_32.c | 2
-rw-r--r--  arch/powerpc/kernel/setup_64.c | 2
-rw-r--r--  arch/powerpc/kernel/static_call.c | 2
-rw-r--r--  arch/powerpc/kernel/trace/ftrace.c | 2
-rw-r--r--  arch/powerpc/kernel/trace/ftrace_64_pg.c | 2
-rw-r--r--  arch/powerpc/lib/code-patching.c | 2
-rw-r--r--  arch/powerpc/lib/feature-fixups.c | 2
-rw-r--r--  arch/powerpc/lib/test-code-patching.c | 2
-rw-r--r--  arch/powerpc/lib/test_emulate_step.c | 2
-rw-r--r--  arch/powerpc/mm/book3s32/mmu.c | 2
-rw-r--r--  arch/powerpc/mm/book3s64/hash_utils.c | 2
-rw-r--r--  arch/powerpc/mm/book3s64/slb.c | 2
-rw-r--r--  arch/powerpc/mm/book3s64/slice.c | 50
-rw-r--r--  arch/powerpc/mm/kasan/init_32.c | 2
-rw-r--r--  arch/powerpc/mm/mem.c | 2
-rw-r--r--  arch/powerpc/mm/nohash/44x.c | 2
-rw-r--r--  arch/powerpc/mm/nohash/book3e_pgtable.c | 2
-rw-r--r--  arch/powerpc/mm/nohash/tlb.c | 2
-rw-r--r--  arch/powerpc/mm/nohash/tlb_64e.c | 2
-rw-r--r--  arch/powerpc/mm/pgtable.c | 2
-rw-r--r--  arch/powerpc/net/bpf_jit_comp.c | 2
-rw-r--r--  arch/powerpc/perf/8xx-pmu.c | 2
-rw-r--r--  arch/powerpc/perf/core-book3s.c | 2
-rw-r--r--  arch/powerpc/platforms/85xx/smp.c | 2
-rw-r--r--  arch/powerpc/platforms/86xx/mpc86xx_smp.c | 2
-rw-r--r--  arch/powerpc/platforms/cell/smp.c | 2
-rw-r--r--  arch/powerpc/platforms/powermac/smp.c | 2
-rw-r--r--  arch/powerpc/platforms/powernv/idle.c | 2
-rw-r--r--  arch/powerpc/platforms/powernv/smp.c | 2
-rw-r--r--  arch/powerpc/platforms/pseries/smp.c | 2
-rw-r--r--  arch/powerpc/xmon/xmon.c | 2
-rw-r--r--  arch/riscv/errata/andes/errata.c | 2
-rw-r--r--  arch/riscv/errata/sifive/errata.c | 2
-rw-r--r--  arch/riscv/errata/thead/errata.c | 2
-rw-r--r--  arch/riscv/include/asm/pgtable.h | 19
-rw-r--r--  arch/riscv/include/asm/set_memory.h | 1
-rw-r--r--  arch/riscv/include/asm/text-patching.h (renamed from arch/riscv/include/asm/patch.h) | 0
-rw-r--r--  arch/riscv/include/asm/uprobes.h | 2
-rw-r--r--  arch/riscv/kernel/alternative.c | 2
-rw-r--r--  arch/riscv/kernel/cpufeature.c | 3
-rw-r--r--  arch/riscv/kernel/ftrace.c | 2
-rw-r--r--  arch/riscv/kernel/jump_label.c | 2
-rw-r--r--  arch/riscv/kernel/patch.c | 2
-rw-r--r--  arch/riscv/kernel/probes/kprobes.c | 2
-rw-r--r--  arch/riscv/mm/pageattr.c | 15
-rw-r--r--  arch/riscv/net/bpf_jit_comp64.c | 2
-rw-r--r--  arch/riscv/net/bpf_jit_core.c | 2
-rw-r--r--  arch/s390/include/asm/hugetlb.h | 73
-rw-r--r--  arch/s390/include/asm/page.h | 2
-rw-r--r--  arch/s390/include/asm/set_memory.h | 1
-rw-r--r--  arch/s390/mm/hugetlbpage.c | 85
-rw-r--r--  arch/s390/mm/mmap.c | 9
-rw-r--r--  arch/s390/mm/pageattr.c | 12
-rw-r--r--  arch/sh/include/asm/Kbuild | 1
-rw-r--r--  arch/sh/include/asm/hugetlb.h | 15
-rw-r--r--  arch/sparc/include/asm/Kbuild | 1
-rw-r--r--  arch/sparc/kernel/sys_sparc_32.c | 17
-rw-r--r--  arch/sparc/kernel/sys_sparc_64.c | 37
-rw-r--r--  arch/sparc/mm/hugetlbpage.c | 108
-rw-r--r--  arch/um/kernel/um_arch.c | 16
-rw-r--r--  arch/x86/Kconfig | 1
-rw-r--r--  arch/x86/entry/vdso/vma.c | 3
-rw-r--r--  arch/x86/include/asm/alternative.h | 14
-rw-r--r--  arch/x86/include/asm/page.h | 2
-rw-r--r--  arch/x86/include/asm/page_64.h | 2
-rw-r--r--  arch/x86/include/asm/percpu.h | 7
-rw-r--r--  arch/x86/include/asm/pgtable.h | 37
-rw-r--r--  arch/x86/include/asm/pgtable_64_types.h | 2
-rw-r--r--  arch/x86/include/asm/set_memory.h | 1
-rw-r--r--  arch/x86/include/asm/text-patching.h | 1
-rw-r--r--  arch/x86/kernel/alternative.c | 181
-rw-r--r--  arch/x86/kernel/ftrace.c | 30
-rw-r--r--  arch/x86/kernel/module.c | 45
-rw-r--r--  arch/x86/kernel/sys_x86_64.c | 24
-rw-r--r--  arch/x86/mm/hugetlbpage.c | 101
-rw-r--r--  arch/x86/mm/init.c | 37
-rw-r--r--  arch/x86/mm/init_64.c | 30
-rw-r--r--  arch/x86/mm/kaslr.c | 14
-rw-r--r--  arch/x86/mm/pat/set_memory.c | 8
-rw-r--r--  arch/xtensa/include/asm/Kbuild | 1
-rw-r--r--  arch/xtensa/include/uapi/asm/mman.h | 3
-rw-r--r--  drivers/android/binder_alloc.c | 8
-rw-r--r--  drivers/android/binder_alloc.h | 2
-rw-r--r--  drivers/block/zram/Kconfig | 1
-rw-r--r--  drivers/block/zram/zram_drv.c | 384
-rw-r--r--  drivers/block/zram/zram_drv.h | 3
-rw-r--r--  drivers/iommu/iova.c | 6
-rw-r--r--  fs/buffer.c | 5
-rw-r--r--  fs/dcache.c | 4
-rw-r--r--  fs/gfs2/quota.c | 2
-rw-r--r--  fs/hugetlbfs/inode.c | 94
-rw-r--r--  fs/inode.c | 5
-rw-r--r--  fs/nfs/nfs42xattr.c | 4
-rw-r--r--  fs/nfsd/filecache.c | 5
-rw-r--r--  fs/proc/base.c | 26
-rw-r--r--  fs/proc/meminfo.c | 2
-rw-r--r--  fs/xfs/xfs_buf.c | 2
-rw-r--r--  fs/xfs/xfs_qm.c | 6
-rw-r--r--  include/asm-generic/codetag.lds.h | 19
-rw-r--r--  include/asm-generic/hugetlb.h | 15
-rw-r--r--  include/asm-generic/text-patching.h | 5
-rw-r--r--  include/linux/alloc_tag.h | 21
-rw-r--r--  include/linux/bootmem_info.h | 35
-rw-r--r--  include/linux/codetag.h | 40
-rw-r--r--  include/linux/execmem.h | 49
-rw-r--r--  include/linux/gfp.h | 6
-rw-r--r--  include/linux/highmem.h | 8
-rw-r--r--  include/linux/huge_mm.h | 16
-rw-r--r--  include/linux/hugetlb.h | 22
-rw-r--r--  include/linux/kasan.h | 12
-rw-r--r--  include/linux/khugepaged.h | 2
-rw-r--r--  include/linux/kmemleak.h | 4
-rw-r--r--  include/linux/ksm.h | 8
-rw-r--r--  include/linux/list_lru.h | 26
-rw-r--r--  include/linux/maple_tree.h | 16
-rw-r--r--  include/linux/memcontrol.h | 97
-rw-r--r--  include/linux/mempolicy.h | 2
-rw-r--r--  include/linux/mm.h | 77
-rw-r--r--  include/linux/mm_inline.h | 27
-rw-r--r--  include/linux/mm_types.h | 84
-rw-r--r--  include/linux/mmu_notifier.h | 7
-rw-r--r--  include/linux/mmzone.h | 5
-rw-r--r--  include/linux/module.h | 16
-rw-r--r--  include/linux/moduleloader.h | 4
-rw-r--r--  include/linux/oom.h | 1
-rw-r--r--  include/linux/page-flags-layout.h | 7
-rw-r--r--  include/linux/page-flags.h | 18
-rw-r--r--  include/linux/page-isolation.h | 8
-rw-r--r--  include/linux/pagemap.h | 31
-rw-r--r--  include/linux/pagewalk.h | 18
-rw-r--r--  include/linux/pgalloc_tag.h | 202
-rw-r--r--  include/linux/pgtable.h | 59
-rw-r--r--  include/linux/rmap.h | 17
-rw-r--r--  include/linux/sched/coredump.h | 82
-rw-r--r--  include/linux/set_memory.h | 6
-rw-r--r--  include/linux/shmem_fs.h | 6
-rw-r--r--  include/linux/swapops.h | 24
-rw-r--r--  include/linux/text-patching.h | 15
-rw-r--r--  include/linux/vmalloc.h | 63
-rw-r--r--  include/linux/zswap.h | 2
-rw-r--r--  include/trace/events/memcg.h | 106
-rw-r--r--  include/trace/events/mmap_lock.h | 14
-rw-r--r--  include/trace/events/vmscan.h | 45
-rw-r--r--  include/uapi/asm-generic/mman-common.h | 3
-rw-r--r--  kernel/events/uprobes.c | 1
-rw-r--r--  kernel/fork.c | 6
-rw-r--r--  kernel/futex/core.c | 2
-rw-r--r--  kernel/module/debug_kmemleak.c | 3
-rw-r--r--  kernel/module/main.c | 148
-rw-r--r--  kernel/module/strict_rwx.c | 3
-rw-r--r--  kernel/resource.c | 4
-rw-r--r--  lib/Kconfig.debug | 1
-rw-r--r--  lib/Kconfig.kasan | 7
-rw-r--r--  lib/alloc_tag.c | 515
-rw-r--r--  lib/codetag.c | 104
-rw-r--r--  lib/maple_tree.c | 249
-rw-r--r--  lib/percpu_test.c | 11
-rw-r--r--  lib/strncpy_from_user.c | 5
-rw-r--r--  lib/test_maple_tree.c | 90
-rw-r--r--  mm/bootmem_info.c | 11
-rw-r--r--  mm/cma.c | 12
-rw-r--r--  mm/damon/Kconfig | 2
-rw-r--r--  mm/damon/tests/dbgfs-kunit.h | 2
-rw-r--r--  mm/damon/tests/vaddr-kunit.h | 4
-rw-r--r--  mm/damon/vaddr.c | 9
-rw-r--r--  mm/execmem.c | 352
-rw-r--r--  mm/filemap.c | 5
-rw-r--r--  mm/gup.c | 8
-rw-r--r--  mm/huge_memory.c | 227
-rw-r--r--  mm/hugetlb.c | 17
-rw-r--r--  mm/internal.h | 52
-rw-r--r--  mm/kasan/Makefile | 2
-rw-r--r--  mm/kasan/hw_tags.c | 7
-rw-r--r--  mm/kasan/init.c | 12
-rw-r--r--  mm/kasan/kasan.h | 2
-rw-r--r--  mm/kasan/kasan_test_c.c | 118
-rw-r--r--  mm/kasan/kasan_test_module.c | 81
-rw-r--r--  mm/kasan/report.c | 19
-rw-r--r--  mm/kasan/shadow.c | 14
-rw-r--r--  mm/kfence/kfence_test.c | 17
-rw-r--r--  mm/khugepaged.c | 31
-rw-r--r--  mm/kmemleak.c | 41
-rw-r--r--  mm/kmsan/kmsan_test.c | 17
-rw-r--r--  mm/ksm.c | 110
-rw-r--r--  mm/list_lru.c | 383
-rw-r--r--  mm/maccess.c | 11
-rw-r--r--  mm/madvise.c | 298
-rw-r--r--  mm/memcontrol-v1.c | 983
-rw-r--r--  mm/memcontrol-v1.h | 6
-rw-r--r--  mm/memcontrol.c | 210
-rw-r--r--  mm/memory-failure.c | 32
-rw-r--r--  mm/memory.c | 67
-rw-r--r--  mm/memory_hotplug.c | 2
-rw-r--r--  mm/mempolicy.c | 7
-rw-r--r--  mm/migrate.c | 3
-rw-r--r--  mm/mm_init.c | 5
-rw-r--r--  mm/mmap.c | 276
-rw-r--r--  mm/mmap_lock.c | 39
-rw-r--r--  mm/mprotect.c | 6
-rw-r--r--  mm/mremap.c | 104
-rw-r--r--  mm/mseal.c | 1
-rw-r--r--  mm/oom_kill.c | 1
-rw-r--r--  mm/page-writeback.c | 45
-rw-r--r--  mm/page_alloc.c | 2
-rw-r--r--  mm/page_io.c | 10
-rw-r--r--  mm/page_vma_mapped.c | 32
-rw-r--r--  mm/pagewalk.c | 246
-rw-r--r--  mm/percpu.c | 11
-rw-r--r--  mm/pgtable-generic.c | 41
-rw-r--r--  mm/process_vm_access.c | 4
-rw-r--r--  mm/readahead.c | 15
-rw-r--r--  mm/rmap.c | 45
-rw-r--r--  mm/shmem.c | 345
-rw-r--r--  mm/show_mem.c | 3
-rw-r--r--  mm/sparse-vmemmap.c | 12
-rw-r--r--  mm/sparse.c | 10
-rw-r--r--  mm/swap.c | 31
-rw-r--r--  mm/swap_state.c | 3
-rw-r--r--  mm/truncate.c | 103
-rw-r--r--  mm/userfaultfd.c | 17
-rw-r--r--  mm/util.c | 2
-rw-r--r--  mm/vma.c | 447
-rw-r--r--  mm/vma.h | 97
-rw-r--r--  mm/vma_internal.h | 5
-rw-r--r--  mm/vmalloc.c | 52
-rw-r--r--  mm/vmscan.c | 68
-rw-r--r--  mm/vmstat.c | 28
-rw-r--r--  mm/workingset.c | 34
-rw-r--r--  mm/zsmalloc.c | 86
-rw-r--r--  mm/zswap.c | 245
-rw-r--r--  scripts/module.lds.S | 5
-rw-r--r--  tools/include/uapi/asm-generic/mman-common.h | 3
-rw-r--r--  tools/mm/page_owner_sort.c | 1
-rw-r--r--  tools/mm/slabinfo.c | 10
-rw-r--r--  tools/testing/radix-tree/maple.c | 22
-rw-r--r--  tools/testing/selftests/damon/_debugfs_common.sh | 7
-rw-r--r--  tools/testing/selftests/damon/access_memory_even.c | 2
-rwxr-xr-x  tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh | 2
-rw-r--r--  tools/testing/selftests/damon/huge_count_read_write.c | 4
-rw-r--r--  tools/testing/selftests/mm/.gitignore | 4
-rw-r--r--  tools/testing/selftests/mm/Makefile | 3
-rw-r--r--  tools/testing/selftests/mm/guard-pages.c | 1243
-rw-r--r--  tools/testing/selftests/mm/hugetlb_fault_after_madv.c | 48
-rwxr-xr-x  tools/testing/selftests/mm/run_vmtests.sh | 10
-rw-r--r--  tools/testing/selftests/mm/virtual_address_range.c | 4
-rw-r--r--  tools/testing/shared/shared.mk | 1
-rw-r--r--  tools/testing/vma/vma.c | 2
-rw-r--r--  tools/testing/vma/vma_internal.h | 115
323 files changed, 7350 insertions, 4391 deletions
diff --git a/Documentation/admin-guide/blockdev/zram.rst b/Documentation/admin-guide/blockdev/zram.rst
index 678d70d6e1c3..714a5171bfc0 100644
--- a/Documentation/admin-guide/blockdev/zram.rst
+++ b/Documentation/admin-guide/blockdev/zram.rst
@@ -47,6 +47,8 @@ The list of possible return codes:
-ENOMEM zram was not able to allocate enough memory to fulfil your
needs.
-EINVAL invalid input has been provided.
+-EAGAIN re-try operation later (e.g. when attempting to run recompress
+ and writeback simultaneously).
======== =============================================================
If you use 'echo', the returned value is set by the 'echo' utility,
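As an illustration of the new -EAGAIN code: a minimal userspace sketch that triggers idle writeback and retries while another operation (e.g. recompression) is still in flight. The /sys/block/zram0 path, the "idle" mode and the retry policy are assumptions about the local setup, which also needs CONFIG_ZRAM_WRITEBACK and a configured backing device::

	#include <errno.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		const char *attr = "/sys/block/zram0/writeback";
		int fd = open(attr, O_WRONLY);

		if (fd < 0) {
			perror(attr);
			return 1;
		}

		for (int tries = 0; tries < 5; tries++) {
			if (write(fd, "idle", 4) >= 0)
				break;			/* writeback started */
			if (errno != EAGAIN) {		/* -EINVAL, -ENOMEM, ... are final */
				perror("writeback");
				break;
			}
			sleep(1);			/* -EAGAIN: retry later */
		}
		close(fd);
		return 0;
	}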
diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst
index 270501db9f4e..286d16fc22eb 100644
--- a/Documentation/admin-guide/cgroup-v1/memory.rst
+++ b/Documentation/admin-guide/cgroup-v1/memory.rst
@@ -90,9 +90,7 @@ Brief summary of control files.
used.
memory.swappiness set/show swappiness parameter of vmscan
(See sysctl's vm.swappiness)
- memory.move_charge_at_immigrate set/show controls of moving charges
- This knob is deprecated and shouldn't be
- used.
+ memory.move_charge_at_immigrate This knob is deprecated.
memory.oom_control set/show oom controls.
This knob is deprecated and shouldn't be
used.
@@ -243,10 +241,6 @@ behind this approach is that a cgroup that aggressively uses a shared
page will eventually get charged for it (once it is uncharged from
the cgroup that brought it in -- this will happen on memory pressure).
-But see :ref:`section 8.2 <cgroup-v1-memory-movable-charges>` when moving a
-task to another cgroup, its pages may be recharged to the new cgroup, if
-move_charge_at_immigrate has been chosen.
-
2.4 Swap Extension
--------------------------------------
@@ -756,78 +750,8 @@ If we want to change this to 1G, we can at any time use::
THIS IS DEPRECATED!
-It's expensive and unreliable! It's better practice to launch workload
-tasks directly from inside their target cgroup. Use dedicated workload
-cgroups to allow fine-grained policy adjustments without having to
-move physical pages between control domains.
-
-Users can move charges associated with a task along with task migration, that
-is, uncharge task's pages from the old cgroup and charge them to the new cgroup.
-This feature is not supported in !CONFIG_MMU environments because of lack of
-page tables.
-
-8.1 Interface
--------------
-
-This feature is disabled by default. It can be enabled (and disabled again) by
-writing to memory.move_charge_at_immigrate of the destination cgroup.
-
-If you want to enable it::
-
- # echo (some positive value) > memory.move_charge_at_immigrate
-
-.. note::
- Each bits of move_charge_at_immigrate has its own meaning about what type
- of charges should be moved. See :ref:`section 8.2
- <cgroup-v1-memory-movable-charges>` for details.
-
-.. note::
- Charges are moved only when you move mm->owner, in other words,
- a leader of a thread group.
-
-.. note::
- If we cannot find enough space for the task in the destination cgroup, we
- try to make space by reclaiming memory. Task migration may fail if we
- cannot make enough space.
-
-.. note::
- It can take several seconds if you move charges much.
-
-And if you want disable it again::
-
- # echo 0 > memory.move_charge_at_immigrate
-
-.. _cgroup-v1-memory-movable-charges:
-
-8.2 Type of charges which can be moved
---------------------------------------
-
-Each bit in move_charge_at_immigrate has its own meaning about what type of
-charges should be moved. But in any case, it must be noted that an account of
-a page or a swap can be moved only when it is charged to the task's current
-(old) memory cgroup.
-
-+---+--------------------------------------------------------------------------+
-|bit| what type of charges would be moved ? |
-+===+==========================================================================+
-| 0 | A charge of an anonymous page (or swap of it) used by the target task. |
-| | You must enable Swap Extension (see 2.4) to enable move of swap charges. |
-+---+--------------------------------------------------------------------------+
-| 1 | A charge of file pages (normal file, tmpfs file (e.g. ipc shared memory) |
-| | and swaps of tmpfs file) mmapped by the target task. Unlike the case of |
-| | anonymous pages, file pages (and swaps) in the range mmapped by the task |
-| | will be moved even if the task hasn't done page fault, i.e. they might |
-| | not be the task's "RSS", but other task's "RSS" that maps the same file. |
-| | The mapcount of the page is ignored (the page can be moved independent |
-| | of the mapcount). You must enable Swap Extension (see 2.4) to |
-| | enable move of swap charges. |
-+---+--------------------------------------------------------------------------+
-
-8.3 TODO
---------
-
-- All of moving charge operations are done under cgroup_mutex. It's not good
- behavior to hold the mutex too long, so we may need some trick.
+Reading memory.move_charge_at_immigrate will always return 0 and writing
+to it will always return -EINVAL.
9. Memory thresholds
====================
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 2cb58daf3089..315ede811c9d 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1655,6 +1655,11 @@ The following nested keys are defined.
pgdemote_khugepaged
Number of pages demoted by khugepaged.
+ hugetlb
+ Amount of memory used by hugetlb pages. This metric only shows
+ up if hugetlb usage is accounted for in memory.current (i.e.
+ cgroup is mounted with the memory_hugetlb_accounting option).
+
memory.numa_stat
A read-only nested-keyed file which exists on non-root cgroups.
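A minimal sketch of consuming the new ``hugetlb`` key, assuming a v2 cgroup at /sys/fs/cgroup/mygroup and a hierarchy mounted with the memory_hugetlb_accounting option (otherwise the line is simply absent)::

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		FILE *f = fopen("/sys/fs/cgroup/mygroup/memory.stat", "r");
		char line[256];

		if (!f) {
			perror("memory.stat");
			return 1;
		}
		while (fgets(line, sizeof(line), f))
			/* key only present with memory_hugetlb_accounting */
			if (!strncmp(line, "hugetlb ", 8))
				fputs(line, stdout);
		fclose(f);
		return 0;
	}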
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 9cd9cd06538b..a4736ee87b1a 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -6711,6 +6711,16 @@
Force threading of all interrupt handlers except those
marked explicitly IRQF_NO_THREAD.
+ thp_shmem= [KNL]
+ Format: <size>[KMG],<size>[KMG]:<policy>;<size>[KMG]-<size>[KMG]:<policy>
+ Control the default policy of each hugepage size for the
+ internal shmem mount. <policy> is one of policies available
+ for the shmem mount ("always", "inherit", "never", "within_size",
+ and "advise").
+ It can be used multiple times for multiple shmem THP sizes.
+ See Documentation/admin-guide/mm/transhuge.rst for more
+ details.
+
topology= [S390,EARLY]
Format: {off | on}
Specify if the kernel should make use of the cpu
@@ -6952,6 +6962,13 @@
See Documentation/admin-guide/mm/transhuge.rst
for more details.
+ transparent_hugepage_shmem= [KNL]
+ Format: [always|within_size|advise|never|deny|force]
+ Can be used to control the hugepage allocation policy for
+ the internal shmem mount.
+ See Documentation/admin-guide/mm/transhuge.rst
+ for more details.
+
trusted.source= [KEYS]
Format: <string>
This parameter identifies the trust source as a backend
diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index a1bb495eab59..5034915f4e8e 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -326,6 +326,29 @@ PMD_ORDER THP policy will be overridden. If the policy for PMD_ORDER
is not defined within a valid ``thp_anon``, its policy will default to
``never``.
+Similarly to ``transparent_hugepage``, you can control the hugepage
+allocation policy for the internal shmem mount by using the kernel parameter
+``transparent_hugepage_shmem=<policy>``, where ``<policy>`` is one of the
+six valid policies for shmem (``always``, ``within_size``, ``advise``,
+``never``, ``deny``, and ``force``).
+
+In the same manner as ``thp_anon`` controls each supported anonymous THP
+size, ``thp_shmem`` controls each supported shmem THP size. ``thp_shmem``
+has the same format as ``thp_anon``, but also supports the policy
+``within_size``.
+
+``thp_shmem=`` may be specified multiple times to configure all THP sizes
+as required. If ``thp_shmem=`` is specified at least once, any shmem THP
+sizes not explicitly configured on the command line are implicitly set to
+``never``.
+
+``transparent_hugepage_shmem`` setting only affects the global toggle. If
+``thp_shmem`` is not specified, PMD_ORDER hugepage will default to
+``inherit``. However, if a valid ``thp_shmem`` setting is provided by the
+user, the PMD_ORDER hugepage policy will be overridden. If the policy for
+PMD_ORDER is not defined within a valid ``thp_shmem``, its policy will
+default to ``never``.
+
Hugepages in tmpfs/shmem
========================
@@ -530,10 +553,18 @@ anon_fault_fallback_charge
instead falls back to using huge pages with lower orders or
small pages even though the allocation was successful.
-swpout
- is incremented every time a huge page is swapped out in one
+zswpout
+ is incremented every time a huge page is swapped out to zswap in one
piece without splitting.
+swpin
+ is incremented every time a huge page is swapped in from a non-zswap
+ swap device in one piece.
+
+swpout
+ is incremented every time a huge page is swapped out to a non-zswap
+ swap device in one piece without splitting.
+
swpout_fallback
is incremented if a huge page has to be split before swapout.
Usually because failed to allocate some continuous swap space
diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst
index d7de44f5339d..0a1418ab72fd 100644
--- a/Documentation/dev-tools/kasan.rst
+++ b/Documentation/dev-tools/kasan.rst
@@ -511,19 +511,14 @@ Tests
~~~~~
There are KASAN tests that allow verifying that KASAN works and can detect
-certain types of memory corruptions. The tests consist of two parts:
+certain types of memory corruptions.
-1. Tests that are integrated with the KUnit Test Framework. Enabled with
-``CONFIG_KASAN_KUNIT_TEST``. These tests can be run and partially verified
+All KASAN tests are integrated with the KUnit Test Framework and can be enabled
+via ``CONFIG_KASAN_KUNIT_TEST``. The tests can be run and partially verified
automatically in a few different ways; see the instructions below.
-2. Tests that are currently incompatible with KUnit. Enabled with
-``CONFIG_KASAN_MODULE_TEST`` and can only be run as a module. These tests can
-only be verified manually by loading the kernel module and inspecting the
-kernel log for KASAN reports.
-
-Each KUnit-compatible KASAN test prints one of multiple KASAN reports if an
-error is detected. Then the test prints its number and status.
+Each KASAN test prints one of multiple KASAN reports if an error is detected.
+Then the test prints its number and status.
When a test passes::
@@ -550,16 +545,16 @@ Or, if one of the tests failed::
not ok 1 - kasan
-There are a few ways to run KUnit-compatible KASAN tests.
+There are a few ways to run the KASAN tests.
1. Loadable module
- With ``CONFIG_KUNIT`` enabled, KASAN-KUnit tests can be built as a loadable
- module and run by loading ``kasan_test.ko`` with ``insmod`` or ``modprobe``.
+ With ``CONFIG_KUNIT`` enabled, the tests can be built as a loadable module
+ and run by loading ``kasan_test.ko`` with ``insmod`` or ``modprobe``.
2. Built-In
- With ``CONFIG_KUNIT`` built-in, KASAN-KUnit tests can be built-in as well.
+ With ``CONFIG_KUNIT`` built-in, the tests can be built-in as well.
In this case, the tests will run at boot as a late-init call.
3. Using kunit_tool
diff --git a/Documentation/dev-tools/kmemleak.rst b/Documentation/dev-tools/kmemleak.rst
index 2cb00b53339f..7d784e03f3f9 100644
--- a/Documentation/dev-tools/kmemleak.rst
+++ b/Documentation/dev-tools/kmemleak.rst
@@ -161,6 +161,7 @@ See the include/linux/kmemleak.h header for the functions prototype.
- ``kmemleak_free_percpu`` - notify of a percpu memory block freeing
- ``kmemleak_update_trace`` - update object allocation stack trace
- ``kmemleak_not_leak`` - mark an object as not a leak
+- ``kmemleak_transient_leak`` - mark an object as a transient leak
- ``kmemleak_ignore`` - do not scan or report an object as leak
- ``kmemleak_scan_area`` - add scan areas inside a memory block
- ``kmemleak_no_scan`` - do not scan a memory block
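A minimal sketch of the new hook, assuming it takes a single object pointer like its kmemleak_not_leak() neighbour; the exact semantics are in the kerneldoc of the new function, and the comment below only paraphrases the name::

	#include <linux/kmemleak.h>
	#include <linux/slab.h>

	static void *stash_object(void)
	{
		void *obj = kmalloc(64, GFP_KERNEL);

		/*
		 * The only reference to obj is about to be stored in a form
		 * the scanner cannot follow; mark the object so it is treated
		 * as a transient (rather than permanent) leak in the meantime.
		 */
		if (obj)
			kmemleak_transient_leak(obj);
		return obj;
	}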
diff --git a/Documentation/mm/allocation-profiling.rst b/Documentation/mm/allocation-profiling.rst
index ffd6655b7be2..316311240e6a 100644
--- a/Documentation/mm/allocation-profiling.rst
+++ b/Documentation/mm/allocation-profiling.rst
@@ -18,12 +18,17 @@ kconfig options:
missing annotation
Boot parameter:
- sysctl.vm.mem_profiling=0|1|never
+ sysctl.vm.mem_profiling={0|1|never}[,compressed]
When set to "never", memory allocation profiling overhead is minimized and it
cannot be enabled at runtime (sysctl becomes read-only).
When CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=y, default value is "1".
When CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=n, default value is "never".
+ "compressed" optional parameter will try to store page tag references in a
+ compact format, avoiding page extensions. This improves performance and reduces
+ memory consumption; however, it might fail depending on the system configuration.
+ If compression fails, a warning is issued and memory allocation profiling gets
+ disabled.
sysctl:
/proc/sys/vm/mem_profiling
diff --git a/Documentation/mm/damon/index.rst b/Documentation/mm/damon/index.rst
index dafd6d028924..5a3359704cce 100644
--- a/Documentation/mm/damon/index.rst
+++ b/Documentation/mm/damon/index.rst
@@ -37,3 +37,9 @@ with no code but simple configurations.
To utilize and control DAMON from the user-space, please refer to the
administration :doc:`guide </admin-guide/mm/damon/index>`.
+
+If you prefer academic papers for reading and citations, please use the papers
+from `HPDC'22 <https://dl.acm.org/doi/abs/10.1145/3502181.3531466>`_ and
+`Middleware19 Industry <https://dl.acm.org/doi/abs/10.1145/3366626.3368125>`_ .
+Note that those cover DAMON implementations in Linux v5.16 and v5.15,
+respectively.
diff --git a/Documentation/mm/split_page_table_lock.rst b/Documentation/mm/split_page_table_lock.rst
index e4f6972eb6c0..581446d4a4eb 100644
--- a/Documentation/mm/split_page_table_lock.rst
+++ b/Documentation/mm/split_page_table_lock.rst
@@ -16,9 +16,13 @@ There are helpers to lock/unlock a table and other accessor functions:
- pte_offset_map_lock()
maps PTE and takes PTE table lock, returns pointer to PTE with
pointer to its PTE table lock, or returns NULL if no PTE table;
- - pte_offset_map_nolock()
+ - pte_offset_map_ro_nolock()
maps PTE, returns pointer to PTE with pointer to its PTE table
lock (not taken), or returns NULL if no PTE table;
+ - pte_offset_map_rw_nolock()
+ maps PTE, returns pointer to PTE with pointer to its PTE table
+ lock (not taken) and the value of its pmd entry, or returns NULL
+ if no PTE table;
- pte_offset_map()
maps PTE, returns pointer to PTE, or returns NULL if no PTE table;
- pte_unmap()
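A minimal sketch of the pte_offset_map_rw_nolock() pattern described above, assuming the caller takes the returned (possibly split) PTE lock itself and revalidates the pmd value before relying on the mapping; the arm/mm/fault-armv.c hunk later in this diff uses the same sequence::

	#include <linux/mm.h>
	#include <linux/pgtable.h>

	static int with_pte_locked(struct mm_struct *mm, pmd_t *pmd,
				   unsigned long addr)
	{
		spinlock_t *ptl;
		pmd_t pmdval;
		pte_t *pte;

	again:
		pte = pte_offset_map_rw_nolock(mm, pmd, addr, &pmdval, &ptl);
		if (!pte)
			return 0;	/* no PTE table */

		spin_lock(ptl);
		/* the PTE table may have been freed/replaced before ptl was taken */
		if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
			pte_unmap_unlock(pte, ptl);
			goto again;
		}

		/* ... operate on *pte under ptl ... */

		pte_unmap_unlock(pte, ptl);
		return 1;
	}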
diff --git a/Documentation/translations/zh_CN/dev-tools/kasan.rst b/Documentation/translations/zh_CN/dev-tools/kasan.rst
index 4491ad2830ed..fd2e3afbdfad 100644
--- a/Documentation/translations/zh_CN/dev-tools/kasan.rst
+++ b/Documentation/translations/zh_CN/dev-tools/kasan.rst
@@ -422,16 +422,12 @@ KASAN连接到vmap基础架构以懒清理未使用的影子内存。
~~~~
有一些KASAN测试可以验证KASAN是否正常工作并可以检测某些类型的内存损坏。
-测试由两部分组成:
-1. 与KUnit测试框架集成的测试。使用 ``CONFIG_KASAN_KUNIT_TEST`` 启用。
-这些测试可以通过几种不同的方式自动运行和部分验证;请参阅下面的说明。
+所有 KASAN 测试都与 KUnit 测试框架集成,可通过 ``CONFIG_KASAN_KUNIT_TEST`` 启用。
+测试可以通过几种不同的方式自动运行和部分验证;请参阅以下说明。
-2. 与KUnit不兼容的测试。使用 ``CONFIG_KASAN_MODULE_TEST`` 启用并且只能作为模块
-运行。这些测试只能通过加载内核模块并检查内核日志以获取KASAN报告来手动验证。
-
-如果检测到错误,每个KUnit兼容的KASAN测试都会打印多个KASAN报告之一,然后测试打印
-其编号和状态。
+如果检测到错误,每个 KASAN 测试都会打印多份 KASAN 报告中的一份。
+然后测试会打印其编号和状态。
当测试通过::
@@ -458,16 +454,16 @@ KASAN连接到vmap基础架构以懒清理未使用的影子内存。
not ok 1 - kasan
-有几种方法可以运行与KUnit兼容的KASAN测试。
+有几种方法可以运行 KASAN 测试。
1. 可加载模块
- 启用 ``CONFIG_KUNIT`` 后,KASAN-KUnit测试可以构建为可加载模块,并通过使用
- ``insmod`` 或 ``modprobe`` 加载 ``kasan_test.ko`` 来运行。
+ 启用 ``CONFIG_KUNIT`` 后,可以将测试构建为可加载模块
+ 并通过使用 ``insmod`` 或 ``modprobe`` 加载 ``kasan_test.ko`` 来运行。
2. 内置
- 通过内置 ``CONFIG_KUNIT`` ,也可以内置KASAN-KUnit测试。在这种情况下,
+ 通过内置 ``CONFIG_KUNIT``,测试也可以内置。
测试将在启动时作为后期初始化调用运行。
3. 使用kunit_tool
diff --git a/Documentation/translations/zh_TW/dev-tools/kasan.rst b/Documentation/translations/zh_TW/dev-tools/kasan.rst
index ed342e67d8ed..27fb7645174d 100644
--- a/Documentation/translations/zh_TW/dev-tools/kasan.rst
+++ b/Documentation/translations/zh_TW/dev-tools/kasan.rst
@@ -404,16 +404,13 @@ KASAN連接到vmap基礎架構以懶清理未使用的影子內存。
~~~~
有一些KASAN測試可以驗證KASAN是否正常工作並可以檢測某些類型的內存損壞。
-測試由兩部分組成:
-1. 與KUnit測試框架集成的測試。使用 ``CONFIG_KASAN_KUNIT_TEST`` 啓用。
-這些測試可以通過幾種不同的方式自動運行和部分驗證;請參閱下面的說明。
+所有 KASAN 測試均與 KUnit 測試框架集成,並且可以啟用
+透過 ``CONFIG_KASAN_KUNIT_TEST``。可以運行測試並進行部分驗證
+以幾種不同的方式自動進行;請參閱下面的說明。
-2. 與KUnit不兼容的測試。使用 ``CONFIG_KASAN_MODULE_TEST`` 啓用並且只能作爲模塊
-運行。這些測試只能通過加載內核模塊並檢查內核日誌以獲取KASAN報告來手動驗證。
-
-如果檢測到錯誤,每個KUnit兼容的KASAN測試都會打印多個KASAN報告之一,然後測試打印
-其編號和狀態。
+如果偵測到錯誤,每個 KASAN 測試都會列印多個 KASAN 報告之一。
+然後測試列印其編號和狀態。
當測試通過::
@@ -440,16 +437,16 @@ KASAN連接到vmap基礎架構以懶清理未使用的影子內存。
not ok 1 - kasan
-有幾種方法可以運行與KUnit兼容的KASAN測試。
+有幾種方法可以執行 KASAN 測試。
1. 可加載模塊
- 啓用 ``CONFIG_KUNIT`` 後,KASAN-KUnit測試可以構建爲可加載模塊,並通過使用
- ``insmod`` 或 ``modprobe`` 加載 ``kasan_test.ko`` 來運行。
+ 啟用 ``CONFIG_KUNIT`` 後,測試可以建置為可載入模組
+ 並且透過使用 ``insmod`` 或 ``modprobe`` 來載入 ``kasan_test.ko`` 來運作。
2. 內置
- 通過內置 ``CONFIG_KUNIT`` ,也可以內置KASAN-KUnit測試。在這種情況下,
+ 透過內建 ``CONFIG_KUNIT``,測試也可以內建。
測試將在啓動時作爲後期初始化調用運行。
3. 使用kunit_tool
diff --git a/MAINTAINERS b/MAINTAINERS
index e40e1220c733..50f7b27a86f4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14957,6 +14957,8 @@ S: Maintained
W: http://www.linux-mm.org
T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
T: quilt git://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new
+F: Documentation/admin-guide/mm/
+F: Documentation/mm/
F: include/linux/gfp.h
F: include/linux/gfp_types.h
F: include/linux/memfd.h
diff --git a/arch/Kconfig b/arch/Kconfig
index de5200eb55d1..9446aad5d605 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1025,6 +1025,14 @@ config ARCH_WANTS_EXECMEM_LATE
enough entropy for module space randomization, for instance
arm64.
+config ARCH_HAS_EXECMEM_ROX
+ bool
+ depends on MMU && !HIGHMEM
+ help
+ For architectures that support allocations of executable memory
+ with read-only execute permissions. Architecture must implement
+ execmem_fill_trapping_insns() callback to enable this.
+
config HAVE_IRQ_EXIT_ON_IRQ_STACK
bool
help
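For the ARCH_HAS_EXECMEM_ROX entry above, a hedged sketch of what the required callback might look like, loosely modelled on the x86 implementation in this series (filling the range with breakpoint instructions); the prototype and the text_poke_set() call are assumptions here, not a definitive interface::

	#include <linux/execmem.h>
	#include <linux/string.h>
	#include <asm/text-patching.h>		/* text_poke_set() on x86 */

	#define TRAPPING_INSN_BYTE	0xcc	/* INT3 on x86; arch specific */

	void execmem_fill_trapping_insns(void *ptr, size_t size, bool writeable)
	{
		if (writeable)
			memset(ptr, TRAPPING_INSN_BYTE, size);
		else
			/* already read-only: go through the text-poking machinery */
			text_poke_set(ptr, TRAPPING_INSN_BYTE, size);
	}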
diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild
index 396caece6d6d..483965c5a4de 100644
--- a/arch/alpha/include/asm/Kbuild
+++ b/arch/alpha/include/asm/Kbuild
@@ -5,3 +5,4 @@ generic-y += agp.h
generic-y += asm-offsets.h
generic-y += kvm_para.h
generic-y += mcs_spinlock.h
+generic-y += text-patching.h
diff --git a/arch/alpha/include/asm/page.h b/arch/alpha/include/asm/page.h
index 261af54fd601..5ec4c77e432e 100644
--- a/arch/alpha/include/asm/page.h
+++ b/arch/alpha/include/asm/page.h
@@ -14,7 +14,7 @@ extern void clear_page(void *page);
#define clear_user_page(page, vaddr, pg) clear_page(page)
#define vma_alloc_zeroed_movable_folio(vma, vaddr) \
- vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false)
+ vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr)
extern void copy_page(void * _to, void * _from);
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h
index 763929e814e9..1e700468a685 100644
--- a/arch/alpha/include/uapi/asm/mman.h
+++ b/arch/alpha/include/uapi/asm/mman.h
@@ -78,6 +78,9 @@
#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */
+#define MADV_GUARD_INSTALL 102 /* fatal signal on access to range */
+#define MADV_GUARD_REMOVE 103 /* unguard range */
+
/* compatibility flags */
#define MAP_FILE 0
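The two madvise() values above come from the guard-region series in this merge (see tools/testing/selftests/mm/guard-pages.c in the diffstat). A minimal userspace sketch, assuming the generic 102/103 values and a page-aligned range::

	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>
	#include <unistd.h>

	#ifndef MADV_GUARD_INSTALL
	#define MADV_GUARD_INSTALL 102
	#define MADV_GUARD_REMOVE  103
	#endif

	int main(void)
	{
		long page = sysconf(_SC_PAGESIZE);
		char *buf = mmap(NULL, 3 * page, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (buf == MAP_FAILED)
			return 1;

		/* any access to the middle page now raises a fatal signal */
		if (madvise(buf + page, page, MADV_GUARD_INSTALL))
			perror("MADV_GUARD_INSTALL");

		/* the pages on either side stay usable */
		memset(buf, 0, page);
		memset(buf + 2 * page, 0, page);

		/* make the middle page an ordinary zero-fill page again */
		if (madvise(buf + page, page, MADV_GUARD_REMOVE))
			perror("MADV_GUARD_REMOVE");

		munmap(buf, 3 * page);
		return 0;
	}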
diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild
index 49285a3ce239..4c69522e0328 100644
--- a/arch/arc/include/asm/Kbuild
+++ b/arch/arc/include/asm/Kbuild
@@ -6,3 +6,4 @@ generic-y += kvm_para.h
generic-y += mcs_spinlock.h
generic-y += parport.h
generic-y += user.h
+generic-y += text-patching.h
diff --git a/arch/arm/include/asm/patch.h b/arch/arm/include/asm/text-patching.h
index 0b48247c4600..0b48247c4600 100644
--- a/arch/arm/include/asm/patch.h
+++ b/arch/arm/include/asm/text-patching.h
diff --git a/arch/arm/kernel/ftrace.c b/arch/arm/kernel/ftrace.c
index e61591f33a6c..845acf9ce21e 100644
--- a/arch/arm/kernel/ftrace.c
+++ b/arch/arm/kernel/ftrace.c
@@ -23,7 +23,7 @@
#include <asm/insn.h>
#include <asm/set_memory.h>
#include <asm/stacktrace.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
/*
* The compiler emitted profiling hook consists of
diff --git a/arch/arm/kernel/jump_label.c b/arch/arm/kernel/jump_label.c
index eb9c24b6e8e2..a06a92d0f550 100644
--- a/arch/arm/kernel/jump_label.c
+++ b/arch/arm/kernel/jump_label.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/jump_label.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
#include <asm/insn.h>
static void __arch_jump_label_transform(struct jump_entry *entry,
diff --git a/arch/arm/kernel/kgdb.c b/arch/arm/kernel/kgdb.c
index 22f937e6f3ff..ab76c55fd610 100644
--- a/arch/arm/kernel/kgdb.c
+++ b/arch/arm/kernel/kgdb.c
@@ -15,7 +15,7 @@
#include <linux/kgdb.h>
#include <linux/uaccess.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
#include <asm/traps.h>
struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
diff --git a/arch/arm/kernel/patch.c b/arch/arm/kernel/patch.c
index e9e828b6bb30..4d45e60cd46d 100644
--- a/arch/arm/kernel/patch.c
+++ b/arch/arm/kernel/patch.c
@@ -9,7 +9,7 @@
#include <asm/fixmap.h>
#include <asm/smp_plat.h>
#include <asm/opcodes.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
struct patch {
void *addr;
diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c
index 831793cd6ff9..2bec87c3327d 100644
--- a/arch/arm/mm/fault-armv.c
+++ b/arch/arm/mm/fault-armv.c
@@ -61,32 +61,8 @@ static int do_adjust_pte(struct vm_area_struct *vma, unsigned long address,
return ret;
}
-#if defined(CONFIG_SPLIT_PTE_PTLOCKS)
-/*
- * If we are using split PTE locks, then we need to take the page
- * lock here. Otherwise we are using shared mm->page_table_lock
- * which is already locked, thus cannot take it.
- */
-static inline void do_pte_lock(spinlock_t *ptl)
-{
- /*
- * Use nested version here to indicate that we are already
- * holding one similar spinlock.
- */
- spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
-}
-
-static inline void do_pte_unlock(spinlock_t *ptl)
-{
- spin_unlock(ptl);
-}
-#else /* !defined(CONFIG_SPLIT_PTE_PTLOCKS) */
-static inline void do_pte_lock(spinlock_t *ptl) {}
-static inline void do_pte_unlock(spinlock_t *ptl) {}
-#endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */
-
static int adjust_pte(struct vm_area_struct *vma, unsigned long address,
- unsigned long pfn)
+ unsigned long pfn, struct vm_fault *vmf)
{
spinlock_t *ptl;
pgd_t *pgd;
@@ -94,6 +70,7 @@ static int adjust_pte(struct vm_area_struct *vma, unsigned long address,
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
+ pmd_t pmdval;
int ret;
pgd = pgd_offset(vma->vm_mm, address);
@@ -112,20 +89,33 @@ static int adjust_pte(struct vm_area_struct *vma, unsigned long address,
if (pmd_none_or_clear_bad(pmd))
return 0;
+again:
/*
* This is called while another page table is mapped, so we
* must use the nested version. This also means we need to
* open-code the spin-locking.
*/
- pte = pte_offset_map_nolock(vma->vm_mm, pmd, address, &ptl);
+ pte = pte_offset_map_rw_nolock(vma->vm_mm, pmd, address, &pmdval, &ptl);
if (!pte)
return 0;
- do_pte_lock(ptl);
+ /*
+ * If we are using split PTE locks, then we need to take the page
+ * lock here. Otherwise we are using shared mm->page_table_lock
+ * which is already locked, thus cannot take it.
+ */
+ if (ptl != vmf->ptl) {
+ spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
+ if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
+ pte_unmap_unlock(pte, ptl);
+ goto again;
+ }
+ }
ret = do_adjust_pte(vma, address, pfn, pte);
- do_pte_unlock(ptl);
+ if (ptl != vmf->ptl)
+ spin_unlock(ptl);
pte_unmap(pte);
return ret;
@@ -133,7 +123,8 @@ static int adjust_pte(struct vm_area_struct *vma, unsigned long address,
static void
make_coherent(struct address_space *mapping, struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep, unsigned long pfn)
+ unsigned long addr, pte_t *ptep, unsigned long pfn,
+ struct vm_fault *vmf)
{
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *mpnt;
@@ -160,7 +151,7 @@ make_coherent(struct address_space *mapping, struct vm_area_struct *vma,
if (!(mpnt->vm_flags & VM_MAYSHARE))
continue;
offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT;
- aliases += adjust_pte(mpnt, mpnt->vm_start + offset, pfn);
+ aliases += adjust_pte(mpnt, mpnt->vm_start + offset, pfn, vmf);
}
flush_dcache_mmap_unlock(mapping);
if (aliases)
@@ -203,7 +194,7 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma,
__flush_dcache_folio(mapping, folio);
if (mapping) {
if (cache_is_vivt())
- make_coherent(mapping, vma, addr, ptep, pfn);
+ make_coherent(mapping, vma, addr, ptep, pfn, vmf);
else if (vma->vm_flags & VM_EXEC)
__flush_icache_all();
}
diff --git a/arch/arm/probes/kprobes/core.c b/arch/arm/probes/kprobes/core.c
index d8238da095df..9fd877c87a38 100644
--- a/arch/arm/probes/kprobes/core.c
+++ b/arch/arm/probes/kprobes/core.c
@@ -25,7 +25,7 @@
#include <asm/cacheflush.h>
#include <linux/percpu.h>
#include <linux/bug.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
#include <asm/sections.h>
#include "../decode-arm.h"
diff --git a/arch/arm/probes/kprobes/opt-arm.c b/arch/arm/probes/kprobes/opt-arm.c
index 7f65048380ca..966c6042c5ad 100644
--- a/arch/arm/probes/kprobes/opt-arm.c
+++ b/arch/arm/probes/kprobes/opt-arm.c
@@ -14,7 +14,7 @@
/* for arm_gen_branch */
#include <asm/insn.h>
/* for patch_text */
-#include <asm/patch.h>
+#include <asm/text-patching.h>
#include "core.h"
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index b9b992908a56..8b9f33cf561b 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -110,7 +110,7 @@
#define PAGE_END (_PAGE_END(VA_BITS_MIN))
#endif /* CONFIG_KASAN */
-#define PHYSMEM_END __pa(PAGE_END - 1)
+#define DIRECT_MAP_PHYSMEM_END __pa(PAGE_END - 1)
#define MIN_THREAD_SHIFT (14 + KASAN_THREAD_SHIFT)
diff --git a/arch/arm64/include/asm/set_memory.h b/arch/arm64/include/asm/set_memory.h
index 37774c793006..90f61b17275e 100644
--- a/arch/arm64/include/asm/set_memory.h
+++ b/arch/arm64/include/asm/set_memory.h
@@ -13,6 +13,7 @@ int set_memory_valid(unsigned long addr, int numpages, int enable);
int set_direct_map_invalid_noflush(struct page *page);
int set_direct_map_default_noflush(struct page *page);
+int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid);
bool kernel_page_present(struct page *page);
int set_memory_encrypted(unsigned long addr, int numpages);
diff --git a/arch/arm64/include/asm/patching.h b/arch/arm64/include/asm/text-patching.h
index 587bdb91ab7a..587bdb91ab7a 100644
--- a/arch/arm64/include/asm/patching.h
+++ b/arch/arm64/include/asm/text-patching.h
diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c
index b2d947175cbe..245cb419ca24 100644
--- a/arch/arm64/kernel/ftrace.c
+++ b/arch/arm64/kernel/ftrace.c
@@ -15,7 +15,7 @@
#include <asm/debug-monitors.h>
#include <asm/ftrace.h>
#include <asm/insn.h>
-#include <asm/patching.h>
+#include <asm/text-patching.h>
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS
struct fregs_offset {
diff --git a/arch/arm64/kernel/jump_label.c b/arch/arm64/kernel/jump_label.c
index f63ea915d6ad..b345425193d2 100644
--- a/arch/arm64/kernel/jump_label.c
+++ b/arch/arm64/kernel/jump_label.c
@@ -9,7 +9,7 @@
#include <linux/jump_label.h>
#include <linux/smp.h>
#include <asm/insn.h>
-#include <asm/patching.h>
+#include <asm/text-patching.h>
bool arch_jump_label_transform_queue(struct jump_entry *entry,
enum jump_label_type type)
diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c
index 4e1f983df3d1..f3c4d3a8a20f 100644
--- a/arch/arm64/kernel/kgdb.c
+++ b/arch/arm64/kernel/kgdb.c
@@ -17,7 +17,7 @@
#include <asm/debug-monitors.h>
#include <asm/insn.h>
-#include <asm/patching.h>
+#include <asm/text-patching.h>
#include <asm/traps.h>
struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = {
diff --git a/arch/arm64/kernel/patching.c b/arch/arm64/kernel/patching.c
index 945df74005c7..7f99723fbb8c 100644
--- a/arch/arm64/kernel/patching.c
+++ b/arch/arm64/kernel/patching.c
@@ -10,7 +10,7 @@
#include <asm/fixmap.h>
#include <asm/insn.h>
#include <asm/kprobes.h>
-#include <asm/patching.h>
+#include <asm/text-patching.h>
#include <asm/sections.h>
static DEFINE_RAW_SPINLOCK(patch_lock);
diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c
index 48d88e07611d..d9e462eafb95 100644
--- a/arch/arm64/kernel/probes/kprobes.c
+++ b/arch/arm64/kernel/probes/kprobes.c
@@ -27,7 +27,7 @@
#include <asm/debug-monitors.h>
#include <asm/insn.h>
#include <asm/irq.h>
-#include <asm/patching.h>
+#include <asm/text-patching.h>
#include <asm/ptrace.h>
#include <asm/sections.h>
#include <asm/system_misc.h>
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index ee318f6df647..4e26bd356a48 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -41,7 +41,7 @@
#include <asm/extable.h>
#include <asm/insn.h>
#include <asm/kprobes.h>
-#include <asm/patching.h>
+#include <asm/text-patching.h>
#include <asm/traps.h>
#include <asm/smp.h>
#include <asm/stack_pointer.h>
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index c2f89a678ac0..ef63651099a9 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -1023,7 +1023,7 @@ struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma,
if (vma->vm_flags & VM_MTE)
flags |= __GFP_ZEROTAGS;
- return vma_alloc_folio(flags, 0, vma, vaddr, false);
+ return vma_alloc_folio(flags, 0, vma, vaddr);
}
void tag_clear_highpage(struct page *page)
diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
index 6ae6ae806454..39fd1f7ff02a 100644
--- a/arch/arm64/mm/pageattr.c
+++ b/arch/arm64/mm/pageattr.c
@@ -282,7 +282,23 @@ int realm_register_memory_enc_ops(void)
return arm64_mem_crypt_ops_register(&realm_crypt_ops);
}
+int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid)
+{
+ unsigned long addr = (unsigned long)page_address(page);
+
+ if (!can_set_direct_map())
+ return 0;
+
+ return set_memory_valid(addr, nr, valid);
+}
+
#ifdef CONFIG_DEBUG_PAGEALLOC
+/*
+ * This is - apart from the return value - doing the same
+ * thing as the new set_direct_map_valid_noflush() function.
+ *
+ * Unify? Explain the conceptual differences?
+ */
void __kernel_map_pages(struct page *page, int numpages, int enable)
{
if (!can_set_direct_map())
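A hedged sketch of the kind of caller the new helper above targets: invalidating the direct-map alias of a contiguous run of pages in one call, with the TLB flush left to the caller as the _noflush suffix implies. The helper name and per-page semantics come from the hunk above; the surrounding function is illustrative only::

	#include <linux/mm.h>
	#include <linux/set_memory.h>
	#include <asm/tlbflush.h>

	static void hide_pages_from_direct_map(struct page *page, unsigned int nr)
	{
		unsigned long addr = (unsigned long)page_address(page);

		/* one call covers nr contiguous pages, unlike the old per-page APIs */
		set_direct_map_valid_noflush(page, nr, false);
		/* ... but, as the name says, the caller still owns the TLB flush */
		flush_tlb_kernel_range(addr, addr + nr * PAGE_SIZE);
	}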
diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 27ef366363e4..66708b95493a 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -19,7 +19,7 @@
#include <asm/cacheflush.h>
#include <asm/debug-monitors.h>
#include <asm/insn.h>
-#include <asm/patching.h>
+#include <asm/text-patching.h>
#include <asm/set_memory.h>
#include "bpf_jit.h"
diff --git a/arch/csky/include/asm/Kbuild b/arch/csky/include/asm/Kbuild
index 9a9bc65b57a9..3a5c7f6e5aac 100644
--- a/arch/csky/include/asm/Kbuild
+++ b/arch/csky/include/asm/Kbuild
@@ -11,3 +11,4 @@ generic-y += qspinlock.h
generic-y += parport.h
generic-y += user.h
generic-y += vmlinux.lds.h
+generic-y += text-patching.h
diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild
index 8c1a78c8f527..1efa1e993d4b 100644
--- a/arch/hexagon/include/asm/Kbuild
+++ b/arch/hexagon/include/asm/Kbuild
@@ -5,3 +5,4 @@ generic-y += extable.h
generic-y += iomap.h
generic-y += kvm_para.h
generic-y += mcs_spinlock.h
+generic-y += text-patching.h
diff --git a/arch/loongarch/include/asm/Kbuild b/arch/loongarch/include/asm/Kbuild
index 5b5a6c90e6e2..80ddb5edb845 100644
--- a/arch/loongarch/include/asm/Kbuild
+++ b/arch/loongarch/include/asm/Kbuild
@@ -11,3 +11,4 @@ generic-y += ioctl.h
generic-y += mmzone.h
generic-y += statfs.h
generic-y += param.h
+generic-y += text-patching.h
diff --git a/arch/loongarch/include/asm/hugetlb.h b/arch/loongarch/include/asm/hugetlb.h
index 5da32c00d483..b837c65a4894 100644
--- a/arch/loongarch/include/asm/hugetlb.h
+++ b/arch/loongarch/include/asm/hugetlb.h
@@ -16,12 +16,7 @@ static inline int prepare_hugepage_range(struct file *file,
unsigned long len)
{
unsigned long task_size = STACK_TOP;
- struct hstate *h = hstate_file(file);
- if (len & ~huge_page_mask(h))
- return -EINVAL;
- if (addr & ~huge_page_mask(h))
- return -EINVAL;
if (len > task_size)
return -ENOMEM;
if (task_size - len < addr)
diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h
index 20714b73f14c..da346733a1da 100644
--- a/arch/loongarch/include/asm/pgtable.h
+++ b/arch/loongarch/include/asm/pgtable.h
@@ -268,8 +268,11 @@ extern void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pm
*/
extern void pgd_init(void *addr);
extern void pud_init(void *addr);
+#define pud_init pud_init
extern void pmd_init(void *addr);
+#define pmd_init pmd_init
extern void kernel_pte_init(void *addr);
+#define kernel_pte_init kernel_pte_init
/*
* Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that
diff --git a/arch/loongarch/include/asm/set_memory.h b/arch/loongarch/include/asm/set_memory.h
index d70505b6676c..55dfaefd02c8 100644
--- a/arch/loongarch/include/asm/set_memory.h
+++ b/arch/loongarch/include/asm/set_memory.h
@@ -17,5 +17,6 @@ int set_memory_rw(unsigned long addr, int numpages);
bool kernel_page_present(struct page *page);
int set_direct_map_default_noflush(struct page *page);
int set_direct_map_invalid_noflush(struct page *page);
+int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid);
#endif /* _ASM_LOONGARCH_SET_MEMORY_H */
diff --git a/arch/loongarch/mm/pageattr.c b/arch/loongarch/mm/pageattr.c
index ffd8d76021d4..bf8678248444 100644
--- a/arch/loongarch/mm/pageattr.c
+++ b/arch/loongarch/mm/pageattr.c
@@ -216,3 +216,22 @@ int set_direct_map_invalid_noflush(struct page *page)
return __set_memory(addr, 1, __pgprot(0), __pgprot(_PAGE_PRESENT | _PAGE_VALID));
}
+
+int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid)
+{
+ unsigned long addr = (unsigned long)page_address(page);
+ pgprot_t set, clear;
+
+ if (addr < vm_map_base)
+ return 0;
+
+ if (valid) {
+ set = PAGE_KERNEL;
+ clear = __pgprot(0);
+ } else {
+ set = __pgprot(0);
+ clear = __pgprot(_PAGE_PRESENT | _PAGE_VALID);
+ }
+
+ return __set_memory(addr, 1, set, clear);
+}
diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild
index 0dbf9c5c6fae..b282e0dd8dc1 100644
--- a/arch/m68k/include/asm/Kbuild
+++ b/arch/m68k/include/asm/Kbuild
@@ -4,3 +4,4 @@ generic-y += extable.h
generic-y += kvm_para.h
generic-y += mcs_spinlock.h
generic-y += spinlock.h
+generic-y += text-patching.h
diff --git a/arch/m68k/include/asm/page_no.h b/arch/m68k/include/asm/page_no.h
index af3a10973233..63c0e706084b 100644
--- a/arch/m68k/include/asm/page_no.h
+++ b/arch/m68k/include/asm/page_no.h
@@ -14,7 +14,7 @@ extern unsigned long memory_end;
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
#define vma_alloc_zeroed_movable_folio(vma, vaddr) \
- vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false)
+ vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr)
#define __pa(vaddr) ((unsigned long)(vaddr))
#define __va(paddr) ((void *)((unsigned long)(paddr)))
diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild
index a055f5dbe00a..7178f990e8b3 100644
--- a/arch/microblaze/include/asm/Kbuild
+++ b/arch/microblaze/include/asm/Kbuild
@@ -8,3 +8,4 @@ generic-y += parport.h
generic-y += syscalls.h
generic-y += tlb.h
generic-y += user.h
+generic-y += text-patching.h
diff --git a/arch/mips/include/asm/Kbuild b/arch/mips/include/asm/Kbuild
index 7ba67a0d6c97..684569b2ecd6 100644
--- a/arch/mips/include/asm/Kbuild
+++ b/arch/mips/include/asm/Kbuild
@@ -13,3 +13,4 @@ generic-y += parport.h
generic-y += qrwlock.h
generic-y += qspinlock.h
generic-y += user.h
+generic-y += text-patching.h
diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h
index fd69c8808554..d0a86ce83de9 100644
--- a/arch/mips/include/asm/hugetlb.h
+++ b/arch/mips/include/asm/hugetlb.h
@@ -17,12 +17,7 @@ static inline int prepare_hugepage_range(struct file *file,
unsigned long len)
{
unsigned long task_size = STACK_TOP;
- struct hstate *h = hstate_file(file);
- if (len & ~huge_page_mask(h))
- return -EINVAL;
- if (addr & ~huge_page_mask(h))
- return -EINVAL;
if (len > task_size)
return -ENOMEM;
if (task_size - len < addr)
diff --git a/arch/mips/include/asm/pgtable-64.h b/arch/mips/include/asm/pgtable-64.h
index 401c1d9e4409..6e854bb11f37 100644
--- a/arch/mips/include/asm/pgtable-64.h
+++ b/arch/mips/include/asm/pgtable-64.h
@@ -317,7 +317,9 @@ static inline pmd_t *pud_pgtable(pud_t pud)
*/
extern void pgd_init(void *addr);
extern void pud_init(void *addr);
+#define pud_init pud_init
extern void pmd_init(void *addr);
+#define pmd_init pmd_init
/*
* Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that
diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h
index 9c48d9a21aa0..b700dae28c48 100644
--- a/arch/mips/include/uapi/asm/mman.h
+++ b/arch/mips/include/uapi/asm/mman.h
@@ -105,6 +105,9 @@
#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */
+#define MADV_GUARD_INSTALL 102 /* fatal signal on access to range */
+#define MADV_GUARD_REMOVE 103 /* unguard range */
+
/* compatibility flags */
#define MAP_FILE 0
diff --git a/arch/nios2/include/asm/Kbuild b/arch/nios2/include/asm/Kbuild
index 0d09829ed144..28004301c236 100644
--- a/arch/nios2/include/asm/Kbuild
+++ b/arch/nios2/include/asm/Kbuild
@@ -7,3 +7,4 @@ generic-y += kvm_para.h
generic-y += mcs_spinlock.h
generic-y += spinlock.h
generic-y += user.h
+generic-y += text-patching.h
diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild
index cef49d60d74c..2b1a6b00cdac 100644
--- a/arch/openrisc/include/asm/Kbuild
+++ b/arch/openrisc/include/asm/Kbuild
@@ -9,3 +9,4 @@ generic-y += spinlock.h
generic-y += qrwlock_types.h
generic-y += qrwlock.h
generic-y += user.h
+generic-y += text-patching.h
diff --git a/arch/parisc/include/asm/hugetlb.h b/arch/parisc/include/asm/hugetlb.h
index 72daacc472a0..5b3a5429f71b 100644
--- a/arch/parisc/include/asm/hugetlb.h
+++ b/arch/parisc/include/asm/hugetlb.h
@@ -12,21 +12,6 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep);
-/*
- * If the arch doesn't supply something else, assume that hugepage
- * size aligned regions are ok without further preparation.
- */
-#define __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE
-static inline int prepare_hugepage_range(struct file *file,
- unsigned long addr, unsigned long len)
-{
- if (len & ~HPAGE_MASK)
- return -EINVAL;
- if (addr & ~HPAGE_MASK)
- return -EINVAL;
- return 0;
-}
-
#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
diff --git a/arch/parisc/include/asm/patch.h b/arch/parisc/include/asm/text-patching.h
index 400d84c6e504..400d84c6e504 100644
--- a/arch/parisc/include/asm/patch.h
+++ b/arch/parisc/include/asm/text-patching.h
diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h
index 68c44f99bc93..b6a709506987 100644
--- a/arch/parisc/include/uapi/asm/mman.h
+++ b/arch/parisc/include/uapi/asm/mman.h
@@ -75,6 +75,9 @@
#define MADV_HWPOISON 100 /* poison a page for testing */
#define MADV_SOFT_OFFLINE 101 /* soft offline page for testing */
+#define MADV_GUARD_INSTALL 102 /* fatal signal on access to range */
+#define MADV_GUARD_REMOVE 103 /* unguard range */
+
/* compatibility flags */
#define MAP_FILE 0
diff --git a/arch/parisc/kernel/ftrace.c b/arch/parisc/kernel/ftrace.c
index c91f9c2e61ed..3e34b4473d3a 100644
--- a/arch/parisc/kernel/ftrace.c
+++ b/arch/parisc/kernel/ftrace.c
@@ -20,7 +20,7 @@
#include <asm/assembly.h>
#include <asm/sections.h>
#include <asm/ftrace.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
#define __hot __section(".text.hot")
diff --git a/arch/parisc/kernel/jump_label.c b/arch/parisc/kernel/jump_label.c
index e253b134500d..ea51f15bf0e6 100644
--- a/arch/parisc/kernel/jump_label.c
+++ b/arch/parisc/kernel/jump_label.c
@@ -8,7 +8,7 @@
#include <linux/jump_label.h>
#include <linux/bug.h>
#include <asm/alternative.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
static inline int reassemble_17(int as17)
{
diff --git a/arch/parisc/kernel/kgdb.c b/arch/parisc/kernel/kgdb.c
index b16fa9bac5f4..fee81f877525 100644
--- a/arch/parisc/kernel/kgdb.c
+++ b/arch/parisc/kernel/kgdb.c
@@ -16,7 +16,7 @@
#include <asm/ptrace.h>
#include <asm/traps.h>
#include <asm/processor.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
#include <asm/cacheflush.h>
const struct kgdb_arch arch_kgdb_ops = {
diff --git a/arch/parisc/kernel/kprobes.c b/arch/parisc/kernel/kprobes.c
index 6e0b86652f30..9255adba67a3 100644
--- a/arch/parisc/kernel/kprobes.c
+++ b/arch/parisc/kernel/kprobes.c
@@ -12,7 +12,7 @@
#include <linux/kprobes.h>
#include <linux/slab.h>
#include <asm/cacheflush.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
diff --git a/arch/parisc/kernel/patch.c b/arch/parisc/kernel/patch.c
index e59574f65e64..35dd764b871e 100644
--- a/arch/parisc/kernel/patch.c
+++ b/arch/parisc/kernel/patch.c
@@ -13,7 +13,7 @@
#include <asm/cacheflush.h>
#include <asm/fixmap.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
struct patch {
void *addr;
diff --git a/arch/parisc/mm/hugetlbpage.c b/arch/parisc/mm/hugetlbpage.c
index aa664f7ddb63..e9d18cf25b79 100644
--- a/arch/parisc/mm/hugetlbpage.c
+++ b/arch/parisc/mm/hugetlbpage.c
@@ -21,27 +21,6 @@
#include <asm/mmu_context.h>
-unsigned long
-hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
- unsigned long len, unsigned long pgoff, unsigned long flags)
-{
- struct hstate *h = hstate_file(file);
-
- if (len & ~huge_page_mask(h))
- return -EINVAL;
- if (len > TASK_SIZE)
- return -ENOMEM;
-
- if (flags & MAP_FIXED)
- if (prepare_hugepage_range(file, addr, len))
- return -EINVAL;
-
- if (addr)
- addr = ALIGN(addr, huge_page_size(h));
-
- /* we need to make sure the colouring is OK */
- return arch_get_unmapped_area(file, addr, len, pgoff, flags, 0);
-}
pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
diff --git a/arch/powerpc/include/asm/kprobes.h b/arch/powerpc/include/asm/kprobes.h
index 4525a9c68260..dfe2e5ad3b21 100644
--- a/arch/powerpc/include/asm/kprobes.h
+++ b/arch/powerpc/include/asm/kprobes.h
@@ -21,7 +21,7 @@
#include <linux/percpu.h>
#include <linux/module.h>
#include <asm/probes.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#ifdef CONFIG_KPROBES
#define __ARCH_WANT_KPROBES_INSN_SLOT
diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/text-patching.h
index e7f14720f630..e7f14720f630 100644
--- a/arch/powerpc/include/asm/code-patching.h
+++ b/arch/powerpc/include/asm/text-patching.h
diff --git a/arch/powerpc/kernel/crash_dump.c b/arch/powerpc/kernel/crash_dump.c
index 2086fa6cdc25..103b6605dd68 100644
--- a/arch/powerpc/kernel/crash_dump.c
+++ b/arch/powerpc/kernel/crash_dump.c
@@ -13,7 +13,7 @@
#include <linux/io.h>
#include <linux/memblock.h>
#include <linux/of.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include <asm/kdump.h>
#include <asm/firmware.h>
#include <linux/uio.h>
diff --git a/arch/powerpc/kernel/epapr_paravirt.c b/arch/powerpc/kernel/epapr_paravirt.c
index d4b8aff20815..247ab2acaccc 100644
--- a/arch/powerpc/kernel/epapr_paravirt.c
+++ b/arch/powerpc/kernel/epapr_paravirt.c
@@ -9,7 +9,7 @@
#include <linux/of_fdt.h>
#include <asm/epapr_hcalls.h>
#include <asm/cacheflush.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include <asm/machdep.h>
#include <asm/inst.h>
diff --git a/arch/powerpc/kernel/jump_label.c b/arch/powerpc/kernel/jump_label.c
index 5277cf582c16..2659e1ac8604 100644
--- a/arch/powerpc/kernel/jump_label.c
+++ b/arch/powerpc/kernel/jump_label.c
@@ -5,7 +5,7 @@
#include <linux/kernel.h>
#include <linux/jump_label.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include <asm/inst.h>
void arch_jump_label_transform(struct jump_entry *entry,
diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c
index 7a8bc03a00af..5081334b7bd2 100644
--- a/arch/powerpc/kernel/kgdb.c
+++ b/arch/powerpc/kernel/kgdb.c
@@ -21,7 +21,7 @@
#include <asm/processor.h>
#include <asm/machdep.h>
#include <asm/debug.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include <linux/slab.h>
#include <asm/inst.h>
diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index f8aa91bc3b17..9c85bbcc5201 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -21,7 +21,7 @@
#include <linux/slab.h>
#include <linux/set_memory.h>
#include <linux/execmem.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include <asm/cacheflush.h>
#include <asm/sstep.h>
#include <asm/sections.h>
diff --git a/arch/powerpc/kernel/module_32.c b/arch/powerpc/kernel/module_32.c
index 816a63fd71fb..f930e3395a7f 100644
--- a/arch/powerpc/kernel/module_32.c
+++ b/arch/powerpc/kernel/module_32.c
@@ -18,7 +18,7 @@
#include <linux/bug.h>
#include <linux/sort.h>
#include <asm/setup.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
/* Count how many different relocations (different symbol, different
addend) */
diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c
index e9bab599d0c2..135960918d14 100644
--- a/arch/powerpc/kernel/module_64.c
+++ b/arch/powerpc/kernel/module_64.c
@@ -17,7 +17,7 @@
#include <linux/kernel.h>
#include <asm/module.h>
#include <asm/firmware.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include <linux/sort.h>
#include <asm/setup.h>
#include <asm/sections.h>
diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c
index c0b351d61058..2e83702bf9ba 100644
--- a/arch/powerpc/kernel/optprobes.c
+++ b/arch/powerpc/kernel/optprobes.c
@@ -13,7 +13,7 @@
#include <asm/kprobes.h>
#include <asm/ptrace.h>
#include <asm/cacheflush.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include <asm/sstep.h>
#include <asm/ppc-opcode.h>
#include <asm/inst.h>
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index ff61a3e7984c..7b739b9a91ab 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -54,7 +54,7 @@
#include <asm/firmware.h>
#include <asm/hw_irq.h>
#endif
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include <asm/exec.h>
#include <asm/livepatch.h>
#include <asm/cpu_has_feature.h>
diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c
index 4856e1a5161c..fbb7ebd8aa08 100644
--- a/arch/powerpc/kernel/security.c
+++ b/arch/powerpc/kernel/security.c
@@ -14,7 +14,7 @@
#include <linux/debugfs.h>
#include <asm/asm-prototypes.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include <asm/security_features.h>
#include <asm/sections.h>
#include <asm/setup.h>
diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
index e515c1f7d8d3..75dbf3e0d9c4 100644
--- a/arch/powerpc/kernel/setup_32.c
+++ b/arch/powerpc/kernel/setup_32.c
@@ -40,7 +40,7 @@
#include <asm/time.h>
#include <asm/serial.h>
#include <asm/udbg.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include <asm/cpu_has_feature.h>
#include <asm/asm-prototypes.h>
#include <asm/kdump.h>
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 22f83fbbc762..3ebf5b9fbe98 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -60,7 +60,7 @@
#include <asm/xmon.h>
#include <asm/udbg.h>
#include <asm/kexec.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include <asm/ftrace.h>
#include <asm/opal.h>
#include <asm/cputhreads.h>
diff --git a/arch/powerpc/kernel/static_call.c b/arch/powerpc/kernel/static_call.c
index 1502b7e439ca..7cfd0710e757 100644
--- a/arch/powerpc/kernel/static_call.c
+++ b/arch/powerpc/kernel/static_call.c
@@ -2,7 +2,7 @@
#include <linux/memory.h>
#include <linux/static_call.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
void arch_static_call_transform(void *site, void *tramp, void *func, bool tail)
{
diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c
index df41f4a7c738..afb7dcd47587 100644
--- a/arch/powerpc/kernel/trace/ftrace.c
+++ b/arch/powerpc/kernel/trace/ftrace.c
@@ -23,7 +23,7 @@
#include <linux/list.h>
#include <asm/cacheflush.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include <asm/ftrace.h>
#include <asm/syscall.h>
#include <asm/inst.h>
diff --git a/arch/powerpc/kernel/trace/ftrace_64_pg.c b/arch/powerpc/kernel/trace/ftrace_64_pg.c
index d3c5552e4984..b3e0987ec7b0 100644
--- a/arch/powerpc/kernel/trace/ftrace_64_pg.c
+++ b/arch/powerpc/kernel/trace/ftrace_64_pg.c
@@ -23,7 +23,7 @@
#include <linux/list.h>
#include <asm/cacheflush.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include <asm/ftrace.h>
#include <asm/syscall.h>
#include <asm/inst.h>
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index acdab294b340..af97fbb3c257 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -17,7 +17,7 @@
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/page.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include <asm/inst.h>
static int __patch_mem(void *exec_addr, unsigned long val, void *patch_addr, bool is_dword)
diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c
index b7201ba50b2e..587c8cf1230f 100644
--- a/arch/powerpc/lib/feature-fixups.c
+++ b/arch/powerpc/lib/feature-fixups.c
@@ -16,7 +16,7 @@
#include <linux/sched/mm.h>
#include <linux/stop_machine.h>
#include <asm/cputable.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include <asm/interrupt.h>
#include <asm/page.h>
#include <asm/sections.h>
diff --git a/arch/powerpc/lib/test-code-patching.c b/arch/powerpc/lib/test-code-patching.c
index 8cd3b32f805b..1440d99630b3 100644
--- a/arch/powerpc/lib/test-code-patching.c
+++ b/arch/powerpc/lib/test-code-patching.c
@@ -6,7 +6,7 @@
#include <linux/vmalloc.h>
#include <linux/init.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
static int __init instr_is_branch_to_addr(const u32 *instr, unsigned long addr)
{
diff --git a/arch/powerpc/lib/test_emulate_step.c b/arch/powerpc/lib/test_emulate_step.c
index 23c7805fb7b3..66b5b4fa1686 100644
--- a/arch/powerpc/lib/test_emulate_step.c
+++ b/arch/powerpc/lib/test_emulate_step.c
@@ -11,7 +11,7 @@
#include <asm/cpu_has_feature.h>
#include <asm/sstep.h>
#include <asm/ppc-opcode.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include <asm/inst.h>
#define MAX_SUBTESTS 16
diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
index 2db167f4233f..6978344edcb4 100644
--- a/arch/powerpc/mm/book3s32/mmu.c
+++ b/arch/powerpc/mm/book3s32/mmu.c
@@ -25,7 +25,7 @@
#include <asm/mmu.h>
#include <asm/machdep.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include <asm/sections.h>
#include <mm/mmu_decl.h>
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index e1eadd03f133..47b22282269c 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -57,7 +57,7 @@
#include <asm/sections.h>
#include <asm/copro.h>
#include <asm/udbg.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include <asm/fadump.h>
#include <asm/firmware.h>
#include <asm/tm.h>
diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c
index f2708c8629a5..6b783552403c 100644
--- a/arch/powerpc/mm/book3s64/slb.c
+++ b/arch/powerpc/mm/book3s64/slb.c
@@ -24,7 +24,7 @@
#include <linux/pgtable.h>
#include <asm/udbg.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include "internal.h"
diff --git a/arch/powerpc/mm/book3s64/slice.c b/arch/powerpc/mm/book3s64/slice.c
index 87307d0fc3b8..bc9a39821d1c 100644
--- a/arch/powerpc/mm/book3s64/slice.c
+++ b/arch/powerpc/mm/book3s64/slice.c
@@ -633,6 +633,20 @@ return_addr:
}
EXPORT_SYMBOL_GPL(slice_get_unmapped_area);
+#ifdef CONFIG_HUGETLB_PAGE
+static int file_to_psize(struct file *file)
+{
+ struct hstate *hstate = hstate_file(file);
+
+ return shift_to_mmu_psize(huge_page_shift(hstate));
+}
+#else
+static int file_to_psize(struct file *file)
+{
+ return 0;
+}
+#endif
+
unsigned long arch_get_unmapped_area(struct file *filp,
unsigned long addr,
unsigned long len,
@@ -640,11 +654,17 @@ unsigned long arch_get_unmapped_area(struct file *filp,
unsigned long flags,
vm_flags_t vm_flags)
{
+ unsigned int psize;
+
if (radix_enabled())
return generic_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags);
- return slice_get_unmapped_area(addr, len, flags,
- mm_ctx_user_psize(&current->mm->context), 0);
+ if (filp && is_file_hugepages(filp))
+ psize = file_to_psize(filp);
+ else
+ psize = mm_ctx_user_psize(&current->mm->context);
+
+ return slice_get_unmapped_area(addr, len, flags, psize, 0);
}
unsigned long arch_get_unmapped_area_topdown(struct file *filp,
@@ -654,11 +674,17 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp,
const unsigned long flags,
vm_flags_t vm_flags)
{
+ unsigned int psize;
+
if (radix_enabled())
return generic_get_unmapped_area_topdown(filp, addr0, len, pgoff, flags, vm_flags);
- return slice_get_unmapped_area(addr0, len, flags,
- mm_ctx_user_psize(&current->mm->context), 1);
+ if (filp && is_file_hugepages(filp))
+ psize = file_to_psize(filp);
+ else
+ psize = mm_ctx_user_psize(&current->mm->context);
+
+ return slice_get_unmapped_area(addr0, len, flags, psize, 1);
}
unsigned int notrace get_slice_psize(struct mm_struct *mm, unsigned long addr)
@@ -788,20 +814,4 @@ unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
return 1UL << mmu_psize_to_shift(get_slice_psize(vma->vm_mm, vma->vm_start));
}
-
-static int file_to_psize(struct file *file)
-{
- struct hstate *hstate = hstate_file(file);
- return shift_to_mmu_psize(huge_page_shift(hstate));
-}
-
-unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
- unsigned long len, unsigned long pgoff,
- unsigned long flags)
-{
- if (radix_enabled())
- return generic_hugetlb_get_unmapped_area(file, addr, len, pgoff, flags);
-
- return slice_get_unmapped_area(addr, len, flags, file_to_psize(file), 1);
-}
#endif
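The net effect of this hunk is that the common arch_get_unmapped_area*() paths now pick the slice page-size index themselves instead of relying on a separate hugetlb entry point; a minimal sketch of that selection, using only helpers visible above (file_to_psize() is the local helper introduced in this hunk):

	/* Illustrative only: choosing the slice psize for a mapping request. */
	static unsigned int sketch_pick_psize(struct file *filp)
	{
		if (filp && is_file_hugepages(filp))
			return file_to_psize(filp);	/* hstate shift -> MMU psize index */
		return mm_ctx_user_psize(&current->mm->context);
	}

slice_get_unmapped_area() is then called with that psize and a topdown flag, exactly as the two call sites above do.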
diff --git a/arch/powerpc/mm/kasan/init_32.c b/arch/powerpc/mm/kasan/init_32.c
index aa9aa11927b2..03666d790a53 100644
--- a/arch/powerpc/mm/kasan/init_32.c
+++ b/arch/powerpc/mm/kasan/init_32.c
@@ -7,7 +7,7 @@
#include <linux/memblock.h>
#include <linux/sched/task.h>
#include <asm/pgalloc.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include <mm/mmu_decl.h>
static pgprot_t __init kasan_prot_ro(void)
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 1221c561b43a..c7708c8fad29 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -26,7 +26,7 @@
#include <asm/svm.h>
#include <asm/mmzone.h>
#include <asm/ftrace.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include <asm/setup.h>
#include <asm/fixmap.h>
diff --git a/arch/powerpc/mm/nohash/44x.c b/arch/powerpc/mm/nohash/44x.c
index 1beae802bb1c..6d10c6d8be71 100644
--- a/arch/powerpc/mm/nohash/44x.c
+++ b/arch/powerpc/mm/nohash/44x.c
@@ -24,7 +24,7 @@
#include <asm/mmu.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include <asm/smp.h>
#include <mm/mmu_decl.h>
diff --git a/arch/powerpc/mm/nohash/book3e_pgtable.c b/arch/powerpc/mm/nohash/book3e_pgtable.c
index ad2a7c26f2a0..062e8785c1bb 100644
--- a/arch/powerpc/mm/nohash/book3e_pgtable.c
+++ b/arch/powerpc/mm/nohash/book3e_pgtable.c
@@ -10,7 +10,7 @@
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/dma.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include <mm/mmu_decl.h>
diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c
index b653a7be4cb1..0a650742f3a0 100644
--- a/arch/powerpc/mm/nohash/tlb.c
+++ b/arch/powerpc/mm/nohash/tlb.c
@@ -37,7 +37,7 @@
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include <asm/cputhreads.h>
#include <asm/hugetlb.h>
#include <asm/paca.h>
diff --git a/arch/powerpc/mm/nohash/tlb_64e.c b/arch/powerpc/mm/nohash/tlb_64e.c
index d26656b07b72..4f925adf2695 100644
--- a/arch/powerpc/mm/nohash/tlb_64e.c
+++ b/arch/powerpc/mm/nohash/tlb_64e.c
@@ -24,7 +24,7 @@
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include <asm/cputhreads.h>
#include <mm/mmu_decl.h>
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 7316396e452d..61df5aed7989 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -398,7 +398,7 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
*/
if (pmd_none(*pmd))
return;
- pte = pte_offset_map_nolock(mm, pmd, addr, &ptl);
+ pte = pte_offset_map_ro_nolock(mm, pmd, addr, &ptl);
BUG_ON(!pte);
assert_spin_locked(ptl);
pte_unmap(pte);
diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
index 2a36cc2e7e9e..68c6a13e6acb 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -18,7 +18,7 @@
#include <linux/bpf.h>
#include <asm/kprobes.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include "bpf_jit.h"
diff --git a/arch/powerpc/perf/8xx-pmu.c b/arch/powerpc/perf/8xx-pmu.c
index 308a2e40d7be..1d2972229e3a 100644
--- a/arch/powerpc/perf/8xx-pmu.c
+++ b/arch/powerpc/perf/8xx-pmu.c
@@ -14,7 +14,7 @@
#include <asm/machdep.h>
#include <asm/firmware.h>
#include <asm/ptrace.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include <asm/inst.h>
#define PERF_8xx_ID_CPU_CYCLES 1
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index dc01aa604cc1..2b79171ee185 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -16,7 +16,7 @@
#include <asm/machdep.h>
#include <asm/firmware.h>
#include <asm/ptrace.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include <asm/hw_irq.h>
#include <asm/interrupt.h>
diff --git a/arch/powerpc/platforms/85xx/smp.c b/arch/powerpc/platforms/85xx/smp.c
index e52b848b64b7..32fa5fb557c0 100644
--- a/arch/powerpc/platforms/85xx/smp.c
+++ b/arch/powerpc/platforms/85xx/smp.c
@@ -23,7 +23,7 @@
#include <asm/mpic.h>
#include <asm/cacheflush.h>
#include <asm/dbell.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include <asm/cputhreads.h>
#include <asm/fsl_pm.h>
diff --git a/arch/powerpc/platforms/86xx/mpc86xx_smp.c b/arch/powerpc/platforms/86xx/mpc86xx_smp.c
index 8a7e55acf090..9be33e41af6d 100644
--- a/arch/powerpc/platforms/86xx/mpc86xx_smp.c
+++ b/arch/powerpc/platforms/86xx/mpc86xx_smp.c
@@ -12,7 +12,7 @@
#include <linux/delay.h>
#include <linux/pgtable.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include <asm/page.h>
#include <asm/pci-bridge.h>
#include <asm/mpic.h>
diff --git a/arch/powerpc/platforms/cell/smp.c b/arch/powerpc/platforms/cell/smp.c
index fee638fd8970..0e8f20ecca08 100644
--- a/arch/powerpc/platforms/cell/smp.c
+++ b/arch/powerpc/platforms/cell/smp.c
@@ -35,7 +35,7 @@
#include <asm/firmware.h>
#include <asm/rtas.h>
#include <asm/cputhreads.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include "interrupt.h"
#include <asm/udbg.h>
diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c
index d21b681f52fb..09e7fe24fac1 100644
--- a/arch/powerpc/platforms/powermac/smp.c
+++ b/arch/powerpc/platforms/powermac/smp.c
@@ -35,7 +35,7 @@
#include <asm/ptrace.h>
#include <linux/atomic.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include <asm/irq.h>
#include <asm/page.h>
#include <asm/sections.h>
diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index ad41dffe4d92..d98b933e4984 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -18,7 +18,7 @@
#include <asm/opal.h>
#include <asm/cputhreads.h>
#include <asm/cpuidle.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include <asm/smp.h>
#include <asm/runlatch.h>
#include <asm/dbell.h>
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index 2e9da58195f5..8f41ef364fc6 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -28,7 +28,7 @@
#include <asm/xive.h>
#include <asm/opal.h>
#include <asm/runlatch.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include <asm/dbell.h>
#include <asm/kvm_ppc.h>
#include <asm/ppc-opcode.h>
diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c
index c597711ef20a..db99725e752b 100644
--- a/arch/powerpc/platforms/pseries/smp.c
+++ b/arch/powerpc/platforms/pseries/smp.c
@@ -39,7 +39,7 @@
#include <asm/xive.h>
#include <asm/dbell.h>
#include <asm/plpar_wrappers.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include <asm/svm.h>
#include <asm/kvm_guest.h>
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index e6cddbb2305f..e76e1d5d0611 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -50,7 +50,7 @@
#include <asm/xive.h>
#include <asm/opal.h>
#include <asm/firmware.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include <asm/sections.h>
#include <asm/inst.h>
#include <asm/interrupt.h>
diff --git a/arch/riscv/errata/andes/errata.c b/arch/riscv/errata/andes/errata.c
index fc1a34faa5f3..dcc9d1ee5ffd 100644
--- a/arch/riscv/errata/andes/errata.c
+++ b/arch/riscv/errata/andes/errata.c
@@ -13,7 +13,7 @@
#include <asm/alternative.h>
#include <asm/cacheflush.h>
#include <asm/errata_list.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
#include <asm/processor.h>
#include <asm/sbi.h>
#include <asm/vendorid_list.h>
diff --git a/arch/riscv/errata/sifive/errata.c b/arch/riscv/errata/sifive/errata.c
index cea3b96ade11..38aac2c47845 100644
--- a/arch/riscv/errata/sifive/errata.c
+++ b/arch/riscv/errata/sifive/errata.c
@@ -8,7 +8,7 @@
#include <linux/module.h>
#include <linux/string.h>
#include <linux/bug.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
#include <asm/alternative.h>
#include <asm/vendorid_list.h>
#include <asm/errata_list.h>
diff --git a/arch/riscv/errata/thead/errata.c b/arch/riscv/errata/thead/errata.c
index f5120e07c318..e24770a77932 100644
--- a/arch/riscv/errata/thead/errata.c
+++ b/arch/riscv/errata/thead/errata.c
@@ -16,7 +16,7 @@
#include <asm/errata_list.h>
#include <asm/hwprobe.h>
#include <asm/io.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
#include <asm/vendorid_list.h>
#include <asm/vendor_extensions.h>
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index e79f15293492..5d7f3e8c2e50 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -963,6 +963,25 @@ void misc_mem_init(void);
extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
+/*
+ * Use set_p*_safe(), and elide TLB flushing, when confident that *no*
+ * TLB flush will be required as a result of the "set". For example, use
+ * in scenarios where it is known ahead of time that the routine is
+ * setting non-present entries, or re-setting an existing entry to the
+ * same value. Otherwise, use the typical "set" helpers and flush the
+ * TLB.
+ */
+#define set_p4d_safe(p4dp, p4d) \
+({ \
+ WARN_ON_ONCE(p4d_present(*p4dp) && !p4d_same(*p4dp, p4d)); \
+ set_p4d(p4dp, p4d); \
+})
+
+#define set_pgd_safe(pgdp, pgd) \
+({ \
+ WARN_ON_ONCE(pgd_present(*pgdp) && !pgd_same(*pgdp, pgd)); \
+ set_pgd(pgdp, pgd); \
+})
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_RISCV_PGTABLE_H */
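A minimal usage sketch for the new *_safe helpers above: they are meant for the cases the comment names, such as re-setting an entry to an identical value, where no TLB flush is needed and a silent change of a present entry would indicate a bug.

	/* Illustrative only: re-writing an entry with an identical value. */
	static void sketch_rewrite_pgd(pgd_t *pgdp)
	{
		pgd_t val = *pgdp;	/* unchanged value, so the _safe variant will not warn */

		set_pgd_safe(pgdp, val);	/* no TLB flush required for an identical set */
	}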
diff --git a/arch/riscv/include/asm/set_memory.h b/arch/riscv/include/asm/set_memory.h
index ab92fc84e1fc..ea263d3683ef 100644
--- a/arch/riscv/include/asm/set_memory.h
+++ b/arch/riscv/include/asm/set_memory.h
@@ -42,6 +42,7 @@ static inline int set_kernel_memory(char *startp, char *endp,
int set_direct_map_invalid_noflush(struct page *page);
int set_direct_map_default_noflush(struct page *page);
+int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid);
bool kernel_page_present(struct page *page);
#endif /* __ASSEMBLY__ */
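A sketch of how a caller might use the new range-based helper, assuming the nr pages are contiguous in the direct map and that the caller does the TLB flush the _noflush suffix leaves out (the hypothetical function and its arguments are illustrative, not part of this patch):

	/* Illustrative only: drop and later restore nr pages in the direct map. */
	static void sketch_direct_map_toggle(struct page *pages, unsigned int nr)
	{
		unsigned long start = (unsigned long)page_address(pages);

		set_direct_map_valid_noflush(pages, nr, false);	/* mark not-present */
		flush_tlb_kernel_range(start, start + nr * PAGE_SIZE);

		/* ... pages are now inaccessible through the direct map ... */

		set_direct_map_valid_noflush(pages, nr, true);	/* make present again */
		flush_tlb_kernel_range(start, start + nr * PAGE_SIZE);
	}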
diff --git a/arch/riscv/include/asm/patch.h b/arch/riscv/include/asm/text-patching.h
index 7228e266b9a1..7228e266b9a1 100644
--- a/arch/riscv/include/asm/patch.h
+++ b/arch/riscv/include/asm/text-patching.h
diff --git a/arch/riscv/include/asm/uprobes.h b/arch/riscv/include/asm/uprobes.h
index 3fc7deda9190..5008f76cdc27 100644
--- a/arch/riscv/include/asm/uprobes.h
+++ b/arch/riscv/include/asm/uprobes.h
@@ -4,7 +4,7 @@
#define _ASM_RISCV_UPROBES_H
#include <asm/probes.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
#include <asm/bug.h>
#define MAX_UINSN_BYTES 8
diff --git a/arch/riscv/kernel/alternative.c b/arch/riscv/kernel/alternative.c
index 0128b161bfda..7eb3cb1215c6 100644
--- a/arch/riscv/kernel/alternative.c
+++ b/arch/riscv/kernel/alternative.c
@@ -18,7 +18,7 @@
#include <asm/sbi.h>
#include <asm/csr.h>
#include <asm/insn.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
struct cpu_manufacturer_info_t {
unsigned long vendor_id;
diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
index 3a8eeaa9310c..826f46b21f2e 100644
--- a/arch/riscv/kernel/cpufeature.c
+++ b/arch/riscv/kernel/cpufeature.c
@@ -20,7 +20,8 @@
#include <asm/cacheflush.h>
#include <asm/cpufeature.h>
#include <asm/hwcap.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
+#include <asm/hwprobe.h>
#include <asm/processor.h>
#include <asm/sbi.h>
#include <asm/vector.h>
diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c
index 5081ad886841..8cb9b211611d 100644
--- a/arch/riscv/kernel/ftrace.c
+++ b/arch/riscv/kernel/ftrace.c
@@ -10,7 +10,7 @@
#include <linux/memory.h>
#include <linux/stop_machine.h>
#include <asm/cacheflush.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
#ifdef CONFIG_DYNAMIC_FTRACE
void ftrace_arch_code_modify_prepare(void) __acquires(&text_mutex)
diff --git a/arch/riscv/kernel/jump_label.c b/arch/riscv/kernel/jump_label.c
index 11ad789c60c6..6eee6f736f68 100644
--- a/arch/riscv/kernel/jump_label.c
+++ b/arch/riscv/kernel/jump_label.c
@@ -10,7 +10,7 @@
#include <linux/mutex.h>
#include <asm/bug.h>
#include <asm/cacheflush.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
#define RISCV_INSN_NOP 0x00000013U
#define RISCV_INSN_JAL 0x0000006fU
diff --git a/arch/riscv/kernel/patch.c b/arch/riscv/kernel/patch.c
index 34ef522f07a8..db13c9ddf9e3 100644
--- a/arch/riscv/kernel/patch.c
+++ b/arch/riscv/kernel/patch.c
@@ -13,7 +13,7 @@
#include <asm/cacheflush.h>
#include <asm/fixmap.h>
#include <asm/ftrace.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
#include <asm/sections.h>
struct patch_insn {
diff --git a/arch/riscv/kernel/probes/kprobes.c b/arch/riscv/kernel/probes/kprobes.c
index 474a65213657..380a0e8cecc0 100644
--- a/arch/riscv/kernel/probes/kprobes.c
+++ b/arch/riscv/kernel/probes/kprobes.c
@@ -12,7 +12,7 @@
#include <asm/sections.h>
#include <asm/cacheflush.h>
#include <asm/bug.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
#include "decode-insn.h"
diff --git a/arch/riscv/mm/pageattr.c b/arch/riscv/mm/pageattr.c
index 271d01a5ba4d..d815448758a1 100644
--- a/arch/riscv/mm/pageattr.c
+++ b/arch/riscv/mm/pageattr.c
@@ -386,6 +386,21 @@ int set_direct_map_default_noflush(struct page *page)
PAGE_KERNEL, __pgprot(_PAGE_EXEC));
}
+int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid)
+{
+ pgprot_t set, clear;
+
+ if (valid) {
+ set = PAGE_KERNEL;
+ clear = __pgprot(_PAGE_EXEC);
+ } else {
+ set = __pgprot(0);
+ clear = __pgprot(_PAGE_PRESENT);
+ }
+
+ return __set_memory((unsigned long)page_address(page), nr, set, clear);
+}
+
#ifdef CONFIG_DEBUG_PAGEALLOC
static int debug_pagealloc_set_page(pte_t *pte, unsigned long addr, void *data)
{
diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c
index 4cc631fa7039..ca60db75199d 100644
--- a/arch/riscv/net/bpf_jit_comp64.c
+++ b/arch/riscv/net/bpf_jit_comp64.c
@@ -10,7 +10,7 @@
#include <linux/filter.h>
#include <linux/memory.h>
#include <linux/stop_machine.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
#include <asm/cfi.h>
#include <asm/percpu.h>
#include "bpf_jit.h"
diff --git a/arch/riscv/net/bpf_jit_core.c b/arch/riscv/net/bpf_jit_core.c
index 6de753c667f4..f8cd2f70a7fb 100644
--- a/arch/riscv/net/bpf_jit_core.c
+++ b/arch/riscv/net/bpf_jit_core.c
@@ -9,7 +9,7 @@
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/memory.h>
-#include <asm/patch.h>
+#include <asm/text-patching.h>
#include <asm/cfi.h>
#include "bpf_jit.h"
diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h
index cf1b5d6fb1a6..6f815d4ba0ca 100644
--- a/arch/s390/include/asm/hugetlb.h
+++ b/arch/s390/include/asm/hugetlb.h
@@ -12,39 +12,26 @@
#include <linux/pgtable.h>
#include <asm/page.h>
-#define hugetlb_free_pgd_range free_pgd_range
#define hugepages_supported() (MACHINE_HAS_EDAT1)
+#define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte, unsigned long sz);
void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte);
-pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
-pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
+#define __HAVE_ARCH_HUGE_PTEP_GET
+extern pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+#define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR
+extern pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
unsigned long addr, pte_t *ptep);
-/*
- * If the arch doesn't supply something else, assume that hugepage
- * size aligned regions are ok without further preparation.
- */
-static inline int prepare_hugepage_range(struct file *file,
- unsigned long addr, unsigned long len)
-{
- struct hstate *h = hstate_file(file);
-
- if (len & ~huge_page_mask(h))
- return -EINVAL;
- if (addr & ~huge_page_mask(h))
- return -EINVAL;
- return 0;
-}
-
static inline void arch_clear_hugetlb_flags(struct folio *folio)
{
clear_bit(PG_arch_1, &folio->flags);
}
#define arch_clear_hugetlb_flags arch_clear_hugetlb_flags
+#define __HAVE_ARCH_HUGE_PTE_CLEAR
static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, unsigned long sz)
{
@@ -54,12 +41,14 @@ static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
set_pte(ptep, __pte(_SEGMENT_ENTRY_EMPTY));
}
+#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep)
{
return huge_ptep_get_and_clear(vma->vm_mm, address, ptep);
}
+#define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS
static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep,
pte_t pte, int dirty)
@@ -72,6 +61,7 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
return changed;
}
+#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT
static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
@@ -79,69 +69,36 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
__set_huge_pte_at(mm, addr, ptep, pte_wrprotect(pte));
}
-static inline pte_t mk_huge_pte(struct page *page, pgprot_t pgprot)
-{
- return mk_pte(page, pgprot);
-}
-
+#define __HAVE_ARCH_HUGE_PTE_NONE
static inline int huge_pte_none(pte_t pte)
{
return pte_none(pte);
}
+#define __HAVE_ARCH_HUGE_PTE_NONE_MOSTLY
static inline int huge_pte_none_mostly(pte_t pte)
{
return huge_pte_none(pte);
}
-static inline int huge_pte_write(pte_t pte)
-{
- return pte_write(pte);
-}
-
-static inline int huge_pte_dirty(pte_t pte)
-{
- return pte_dirty(pte);
-}
-
-static inline pte_t huge_pte_mkwrite(pte_t pte)
-{
- return pte_mkwrite_novma(pte);
-}
-
-static inline pte_t huge_pte_mkdirty(pte_t pte)
-{
- return pte_mkdirty(pte);
-}
-
-static inline pte_t huge_pte_wrprotect(pte_t pte)
-{
- return pte_wrprotect(pte);
-}
-
-static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot)
-{
- return pte_modify(pte, newprot);
-}
-
+#define __HAVE_ARCH_HUGE_PTE_MKUFFD_WP
static inline pte_t huge_pte_mkuffd_wp(pte_t pte)
{
return pte;
}
+#define __HAVE_ARCH_HUGE_PTE_CLEAR_UFFD_WP
static inline pte_t huge_pte_clear_uffd_wp(pte_t pte)
{
return pte;
}
+#define __HAVE_ARCH_HUGE_PTE_UFFD_WP
static inline int huge_pte_uffd_wp(pte_t pte)
{
return 0;
}
-static inline bool gigantic_page_runtime_supported(void)
-{
- return true;
-}
+#include <asm-generic/hugetlb.h>
#endif /* _ASM_S390_HUGETLB_H */
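The __HAVE_ARCH_HUGE_* defines added above work because asm-generic/hugetlb.h only supplies a fallback when the corresponding guard is absent; roughly this pattern (a sketch of the generic header's convention, not a copy of it):

	/* Illustrative only: how the generic header defers to arch overrides. */
	#ifndef __HAVE_ARCH_HUGE_PTE_NONE
	static inline int huge_pte_none(pte_t pte)
	{
		return pte_none(pte);	/* generic fallback */
	}
	#endif
	/* s390 defines __HAVE_ARCH_HUGE_PTE_NONE, so its own version is used instead. */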
diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h
index b13a46e2e931..4f43cdd9835b 100644
--- a/arch/s390/include/asm/page.h
+++ b/arch/s390/include/asm/page.h
@@ -69,7 +69,7 @@ static inline void copy_page(void *to, void *from)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
#define vma_alloc_zeroed_movable_folio(vma, vaddr) \
- vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false)
+ vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr)
/*
* These are used to make use of C type-checking..
diff --git a/arch/s390/include/asm/set_memory.h b/arch/s390/include/asm/set_memory.h
index cb4cc0f59012..94092f4ae764 100644
--- a/arch/s390/include/asm/set_memory.h
+++ b/arch/s390/include/asm/set_memory.h
@@ -62,6 +62,7 @@ __SET_MEMORY_FUNC(set_memory_4k, SET_MEMORY_4K)
int set_direct_map_invalid_noflush(struct page *page);
int set_direct_map_default_noflush(struct page *page);
+int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid);
bool kernel_page_present(struct page *page);
#endif
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index ded0eff58a19..7c79cf1bc7d7 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -242,88 +242,3 @@ bool __init arch_hugetlb_valid_size(unsigned long size)
else
return false;
}
-
-static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
- unsigned long addr, unsigned long len,
- unsigned long pgoff, unsigned long flags)
-{
- struct hstate *h = hstate_file(file);
- struct vm_unmapped_area_info info = {};
-
- info.length = len;
- info.low_limit = current->mm->mmap_base;
- info.high_limit = TASK_SIZE;
- info.align_mask = PAGE_MASK & ~huge_page_mask(h);
- return vm_unmapped_area(&info);
-}
-
-static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
- unsigned long addr0, unsigned long len,
- unsigned long pgoff, unsigned long flags)
-{
- struct hstate *h = hstate_file(file);
- struct vm_unmapped_area_info info = {};
- unsigned long addr;
-
- info.flags = VM_UNMAPPED_AREA_TOPDOWN;
- info.length = len;
- info.low_limit = PAGE_SIZE;
- info.high_limit = current->mm->mmap_base;
- info.align_mask = PAGE_MASK & ~huge_page_mask(h);
- addr = vm_unmapped_area(&info);
-
- /*
- * A failed mmap() very likely causes application failure,
- * so fall back to the bottom-up function here. This scenario
- * can happen with large stack limits and large mmap()
- * allocations.
- */
- if (addr & ~PAGE_MASK) {
- VM_BUG_ON(addr != -ENOMEM);
- info.flags = 0;
- info.low_limit = TASK_UNMAPPED_BASE;
- info.high_limit = TASK_SIZE;
- addr = vm_unmapped_area(&info);
- }
-
- return addr;
-}
-
-unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
- unsigned long len, unsigned long pgoff, unsigned long flags)
-{
- struct hstate *h = hstate_file(file);
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
-
- if (len & ~huge_page_mask(h))
- return -EINVAL;
- if (len > TASK_SIZE - mmap_min_addr)
- return -ENOMEM;
-
- if (flags & MAP_FIXED) {
- if (prepare_hugepage_range(file, addr, len))
- return -EINVAL;
- goto check_asce_limit;
- }
-
- if (addr) {
- addr = ALIGN(addr, huge_page_size(h));
- vma = find_vma(mm, addr);
- if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
- (!vma || addr + len <= vm_start_gap(vma)))
- goto check_asce_limit;
- }
-
- if (!test_bit(MMF_TOPDOWN, &mm->flags))
- addr = hugetlb_get_unmapped_area_bottomup(file, addr, len,
- pgoff, flags);
- else
- addr = hugetlb_get_unmapped_area_topdown(file, addr, len,
- pgoff, flags);
- if (offset_in_page(addr))
- return addr;
-
-check_asce_limit:
- return check_asce_limit(mm, addr, len);
-}
diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c
index 96efa061ce01..33f3504be90b 100644
--- a/arch/s390/mm/mmap.c
+++ b/arch/s390/mm/mmap.c
@@ -17,6 +17,7 @@
#include <linux/random.h>
#include <linux/compat.h>
#include <linux/security.h>
+#include <linux/hugetlb.h>
#include <asm/elf.h>
static unsigned long stack_maxrandom_size(void)
@@ -73,6 +74,8 @@ static inline unsigned long mmap_base(unsigned long rnd,
static int get_align_mask(struct file *filp, unsigned long flags)
{
+ if (filp && is_file_hugepages(filp))
+ return huge_page_mask_align(filp);
if (!(current->flags & PF_RANDOMIZE))
return 0;
if (filp || (flags & MAP_SHARED))
@@ -106,7 +109,8 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
info.low_limit = mm->mmap_base;
info.high_limit = TASK_SIZE;
info.align_mask = get_align_mask(filp, flags);
- info.align_offset = pgoff << PAGE_SHIFT;
+ if (!(filp && is_file_hugepages(filp)))
+ info.align_offset = pgoff << PAGE_SHIFT;
addr = vm_unmapped_area(&info);
if (offset_in_page(addr))
return addr;
@@ -144,7 +148,8 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long ad
info.low_limit = PAGE_SIZE;
info.high_limit = mm->mmap_base;
info.align_mask = get_align_mask(filp, flags);
- info.align_offset = pgoff << PAGE_SHIFT;
+ if (!(filp && is_file_hugepages(filp)))
+ info.align_offset = pgoff << PAGE_SHIFT;
addr = vm_unmapped_area(&info);
/*
diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c
index 4a0f422cfeb6..8f56a21a077f 100644
--- a/arch/s390/mm/pageattr.c
+++ b/arch/s390/mm/pageattr.c
@@ -407,6 +407,18 @@ int set_direct_map_default_noflush(struct page *page)
return __set_memory((unsigned long)page_to_virt(page), 1, SET_MEMORY_DEF);
}
+int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid)
+{
+ unsigned long flags;
+
+ if (valid)
+ flags = SET_MEMORY_DEF;
+ else
+ flags = SET_MEMORY_INV;
+
+ return __set_memory((unsigned long)page_to_virt(page), nr, flags);
+}
+
bool kernel_page_present(struct page *page)
{
unsigned long addr;
diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild
index fc44d9c88b41..4d3f10ed8275 100644
--- a/arch/sh/include/asm/Kbuild
+++ b/arch/sh/include/asm/Kbuild
@@ -3,3 +3,4 @@ generated-y += syscall_table.h
generic-y += kvm_para.h
generic-y += mcs_spinlock.h
generic-y += parport.h
+generic-y += text-patching.h
diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h
index 75028bd568ba..4a92e6e4d627 100644
--- a/arch/sh/include/asm/hugetlb.h
+++ b/arch/sh/include/asm/hugetlb.h
@@ -5,21 +5,6 @@
#include <asm/cacheflush.h>
#include <asm/page.h>
-/*
- * If the arch doesn't supply something else, assume that hugepage
- * size aligned regions are ok without further preparation.
- */
-#define __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE
-static inline int prepare_hugepage_range(struct file *file,
- unsigned long addr, unsigned long len)
-{
- if (len & ~HPAGE_MASK)
- return -EINVAL;
- if (addr & ~HPAGE_MASK)
- return -EINVAL;
- return 0;
-}
-
#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild
index 43b0ae4c2c21..17ee8a273aa6 100644
--- a/arch/sparc/include/asm/Kbuild
+++ b/arch/sparc/include/asm/Kbuild
@@ -4,3 +4,4 @@ generated-y += syscall_table_64.h
generic-y += agp.h
generic-y += kvm_para.h
generic-y += mcs_spinlock.h
+generic-y += text-patching.h
diff --git a/arch/sparc/kernel/sys_sparc_32.c b/arch/sparc/kernel/sys_sparc_32.c
index 80822f922e76..fb31bc0c5b48 100644
--- a/arch/sparc/kernel/sys_sparc_32.c
+++ b/arch/sparc/kernel/sys_sparc_32.c
@@ -23,6 +23,7 @@
#include <linux/utsname.h>
#include <linux/smp.h>
#include <linux/ipc.h>
+#include <linux/hugetlb.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
@@ -42,12 +43,16 @@ SYSCALL_DEFINE0(getpagesize)
unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags)
{
struct vm_unmapped_area_info info = {};
+ bool file_hugepage = false;
+
+ if (filp && is_file_hugepages(filp))
+ file_hugepage = true;
if (flags & MAP_FIXED) {
/* We do not accept a shared mapping if it would violate
* cache aliasing constraints.
*/
- if ((flags & MAP_SHARED) &&
+ if (!file_hugepage && (flags & MAP_SHARED) &&
((addr - (pgoff << PAGE_SHIFT)) & (SHMLBA - 1)))
return -EINVAL;
return addr;
@@ -62,9 +67,13 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi
info.length = len;
info.low_limit = addr;
info.high_limit = TASK_SIZE;
- info.align_mask = (flags & MAP_SHARED) ?
- (PAGE_MASK & (SHMLBA - 1)) : 0;
- info.align_offset = pgoff << PAGE_SHIFT;
+ if (!file_hugepage) {
+ info.align_mask = (flags & MAP_SHARED) ?
+ (PAGE_MASK & (SHMLBA - 1)) : 0;
+ info.align_offset = pgoff << PAGE_SHIFT;
+ } else {
+ info.align_mask = huge_page_mask_align(filp);
+ }
return vm_unmapped_area(&info);
}
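huge_page_mask_align() is used here (and in the s390 and sparc64 hunks) so the generic search aligns hugetlbfs mappings to the huge page size rather than to the SHMLBA colour; a sketch of the intended equivalence, under the assumption that the helper expands to the same mask the removed arch code computed by hand:

	/* Illustrative only: the alignment mask a hugetlbfs mapping needs. */
	static unsigned long sketch_hugetlb_align_mask(struct file *file)
	{
		struct hstate *h = hstate_file(file);

		/* same value the removed helpers used for info.align_mask */
		return PAGE_MASK & ~huge_page_mask(h);
	}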
diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c
index acade309dc2f..c5a284df7b41 100644
--- a/arch/sparc/kernel/sys_sparc_64.c
+++ b/arch/sparc/kernel/sys_sparc_64.c
@@ -30,6 +30,7 @@
#include <linux/context_tracking.h>
#include <linux/timex.h>
#include <linux/uaccess.h>
+#include <linux/hugetlb.h>
#include <asm/utrap.h>
#include <asm/unistd.h>
@@ -87,6 +88,16 @@ static inline unsigned long COLOR_ALIGN(unsigned long addr,
return base + off;
}
+static unsigned long get_align_mask(struct file *filp, unsigned long flags)
+{
+ if (filp && is_file_hugepages(filp))
+ return huge_page_mask_align(filp);
+ if (filp || (flags & MAP_SHARED))
+ return PAGE_MASK & (SHMLBA - 1);
+
+ return 0;
+}
+
unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags)
{
struct mm_struct *mm = current->mm;
@@ -94,12 +105,16 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi
unsigned long task_size = TASK_SIZE;
int do_color_align;
struct vm_unmapped_area_info info = {};
+ bool file_hugepage = false;
+
+ if (filp && is_file_hugepages(filp))
+ file_hugepage = true;
if (flags & MAP_FIXED) {
/* We do not accept a shared mapping if it would violate
* cache aliasing constraints.
*/
- if ((flags & MAP_SHARED) &&
+ if (!file_hugepage && (flags & MAP_SHARED) &&
((addr - (pgoff << PAGE_SHIFT)) & (SHMLBA - 1)))
return -EINVAL;
return addr;
@@ -111,7 +126,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi
return -ENOMEM;
do_color_align = 0;
- if (filp || (flags & MAP_SHARED))
+ if ((filp || (flags & MAP_SHARED)) && !file_hugepage)
do_color_align = 1;
if (addr) {
@@ -129,8 +144,9 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi
info.length = len;
info.low_limit = TASK_UNMAPPED_BASE;
info.high_limit = min(task_size, VA_EXCLUDE_START);
- info.align_mask = do_color_align ? (PAGE_MASK & (SHMLBA - 1)) : 0;
- info.align_offset = pgoff << PAGE_SHIFT;
+ info.align_mask = get_align_mask(filp, flags);
+ if (!file_hugepage)
+ info.align_offset = pgoff << PAGE_SHIFT;
addr = vm_unmapped_area(&info);
if ((addr & ~PAGE_MASK) && task_size > VA_EXCLUDE_END) {
@@ -154,15 +170,19 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
unsigned long addr = addr0;
int do_color_align;
struct vm_unmapped_area_info info = {};
+ bool file_hugepage = false;
/* This should only ever run for 32-bit processes. */
BUG_ON(!test_thread_flag(TIF_32BIT));
+ if (filp && is_file_hugepages(filp))
+ file_hugepage = true;
+
if (flags & MAP_FIXED) {
/* We do not accept a shared mapping if it would violate
* cache aliasing constraints.
*/
- if ((flags & MAP_SHARED) &&
+ if (!file_hugepage && (flags & MAP_SHARED) &&
((addr - (pgoff << PAGE_SHIFT)) & (SHMLBA - 1)))
return -EINVAL;
return addr;
@@ -172,7 +192,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
return -ENOMEM;
do_color_align = 0;
- if (filp || (flags & MAP_SHARED))
+ if ((filp || (flags & MAP_SHARED)) && !file_hugepage)
do_color_align = 1;
/* requesting a specific address */
@@ -192,8 +212,9 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
info.length = len;
info.low_limit = PAGE_SIZE;
info.high_limit = mm->mmap_base;
- info.align_mask = do_color_align ? (PAGE_MASK & (SHMLBA - 1)) : 0;
- info.align_offset = pgoff << PAGE_SHIFT;
+ info.align_mask = get_align_mask(filp, flags);
+ if (!file_hugepage)
+ info.align_offset = pgoff << PAGE_SHIFT;
addr = vm_unmapped_area(&info);
/*
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c
index cc91ca7a1e18..eee601a0d2cf 100644
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -19,114 +19,6 @@
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
-/* Slightly simplified from the non-hugepage variant because by
- * definition we don't have to worry about any page coloring stuff
- */
-
-static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *filp,
- unsigned long addr,
- unsigned long len,
- unsigned long pgoff,
- unsigned long flags)
-{
- struct hstate *h = hstate_file(filp);
- unsigned long task_size = TASK_SIZE;
- struct vm_unmapped_area_info info = {};
-
- if (test_thread_flag(TIF_32BIT))
- task_size = STACK_TOP32;
-
- info.length = len;
- info.low_limit = TASK_UNMAPPED_BASE;
- info.high_limit = min(task_size, VA_EXCLUDE_START);
- info.align_mask = PAGE_MASK & ~huge_page_mask(h);
- addr = vm_unmapped_area(&info);
-
- if ((addr & ~PAGE_MASK) && task_size > VA_EXCLUDE_END) {
- VM_BUG_ON(addr != -ENOMEM);
- info.low_limit = VA_EXCLUDE_END;
- info.high_limit = task_size;
- addr = vm_unmapped_area(&info);
- }
-
- return addr;
-}
-
-static unsigned long
-hugetlb_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
- const unsigned long len,
- const unsigned long pgoff,
- const unsigned long flags)
-{
- struct hstate *h = hstate_file(filp);
- struct mm_struct *mm = current->mm;
- unsigned long addr = addr0;
- struct vm_unmapped_area_info info = {};
-
- /* This should only ever run for 32-bit processes. */
- BUG_ON(!test_thread_flag(TIF_32BIT));
-
- info.flags = VM_UNMAPPED_AREA_TOPDOWN;
- info.length = len;
- info.low_limit = PAGE_SIZE;
- info.high_limit = mm->mmap_base;
- info.align_mask = PAGE_MASK & ~huge_page_mask(h);
- addr = vm_unmapped_area(&info);
-
- /*
- * A failed mmap() very likely causes application failure,
- * so fall back to the bottom-up function here. This scenario
- * can happen with large stack limits and large mmap()
- * allocations.
- */
- if (addr & ~PAGE_MASK) {
- VM_BUG_ON(addr != -ENOMEM);
- info.flags = 0;
- info.low_limit = TASK_UNMAPPED_BASE;
- info.high_limit = STACK_TOP32;
- addr = vm_unmapped_area(&info);
- }
-
- return addr;
-}
-
-unsigned long
-hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
- unsigned long len, unsigned long pgoff, unsigned long flags)
-{
- struct hstate *h = hstate_file(file);
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- unsigned long task_size = TASK_SIZE;
-
- if (test_thread_flag(TIF_32BIT))
- task_size = STACK_TOP32;
-
- if (len & ~huge_page_mask(h))
- return -EINVAL;
- if (len > task_size)
- return -ENOMEM;
-
- if (flags & MAP_FIXED) {
- if (prepare_hugepage_range(file, addr, len))
- return -EINVAL;
- return addr;
- }
-
- if (addr) {
- addr = ALIGN(addr, huge_page_size(h));
- vma = find_vma(mm, addr);
- if (task_size - len >= addr &&
- (!vma || addr + len <= vm_start_gap(vma)))
- return addr;
- }
- if (!test_bit(MMF_TOPDOWN, &mm->flags))
- return hugetlb_get_unmapped_area_bottomup(file, addr, len,
- pgoff, flags);
- else
- return hugetlb_get_unmapped_area_topdown(file, addr, len,
- pgoff, flags);
-}
static pte_t sun4u_hugepage_shift_to_tte(pte_t entry, unsigned int shift)
{
diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c
index 8e594cda6d77..e8e8b54b3037 100644
--- a/arch/um/kernel/um_arch.c
+++ b/arch/um/kernel/um_arch.c
@@ -435,24 +435,25 @@ void __init arch_cpu_finalize_init(void)
os_check_bugs();
}
-void apply_seal_endbr(s32 *start, s32 *end)
+void apply_seal_endbr(s32 *start, s32 *end, struct module *mod)
{
}
-void apply_retpolines(s32 *start, s32 *end)
+void apply_retpolines(s32 *start, s32 *end, struct module *mod)
{
}
-void apply_returns(s32 *start, s32 *end)
+void apply_returns(s32 *start, s32 *end, struct module *mod)
{
}
void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
- s32 *start_cfi, s32 *end_cfi)
+ s32 *start_cfi, s32 *end_cfi, struct module *mod)
{
}
-void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
+void apply_alternatives(struct alt_instr *start, struct alt_instr *end,
+ struct module *mod)
{
}
@@ -468,6 +469,11 @@ void *text_poke(void *addr, const void *opcode, size_t len)
return memcpy(addr, opcode, len);
}
+void *text_poke_copy(void *addr, const void *opcode, size_t len)
+{
+ return text_poke(addr, opcode, len);
+}
+
void text_poke_sync(void)
{
}
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 948707a3615e..6c633d93c639 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -83,6 +83,7 @@ config X86
select ARCH_HAS_DMA_OPS if GART_IOMMU || XEN
select ARCH_HAS_EARLY_DEBUG if KGDB
select ARCH_HAS_ELF_RANDOMIZE
+ select ARCH_HAS_EXECMEM_ROX if X86_64
select ARCH_HAS_FAST_MULTIPLIER
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index bfc7cabf4017..39e6efc1a9ca 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -48,7 +48,8 @@ int __init init_vdso_image(const struct vdso_image *image)
apply_alternatives((struct alt_instr *)(image->data + image->alt),
(struct alt_instr *)(image->data + image->alt +
- image->alt_len));
+ image->alt_len),
+ NULL);
return 0;
}
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index ca9ae606aab9..dc03a647776d 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -96,16 +96,16 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
* instructions were patched in already:
*/
extern int alternatives_patched;
+struct module;
extern void alternative_instructions(void);
-extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
-extern void apply_retpolines(s32 *start, s32 *end);
-extern void apply_returns(s32 *start, s32 *end);
-extern void apply_seal_endbr(s32 *start, s32 *end);
+extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end,
+ struct module *mod);
+extern void apply_retpolines(s32 *start, s32 *end, struct module *mod);
+extern void apply_returns(s32 *start, s32 *end, struct module *mod);
+extern void apply_seal_endbr(s32 *start, s32 *end, struct module *mod);
extern void apply_fineibt(s32 *start_retpoline, s32 *end_retpoine,
- s32 *start_cfi, s32 *end_cfi);
-
-struct module;
+ s32 *start_cfi, s32 *end_cfi, struct module *mod);
struct callthunk_sites {
s32 *call_start, *call_end;
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index 1b93ff80b43b..c9fe207916f4 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -35,7 +35,7 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
}
#define vma_alloc_zeroed_movable_folio(vma, vaddr) \
- vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false)
+ vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr)
#ifndef __pa
#define __pa(x) __phys_addr((unsigned long)(x))
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index f3d257c45225..d63576608ce7 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -17,7 +17,7 @@ extern unsigned long phys_base;
extern unsigned long page_offset_base;
extern unsigned long vmalloc_base;
extern unsigned long vmemmap_base;
-extern unsigned long physmem_end;
+extern unsigned long direct_map_physmem_end;
static __always_inline unsigned long __phys_addr_nodebug(unsigned long x)
{
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index c55a79d5feae..e525cd85f999 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -234,9 +234,10 @@ do { \
*/
#define percpu_add_op(size, qual, var, val) \
do { \
- const int pao_ID__ = (__builtin_constant_p(val) && \
- ((val) == 1 || (val) == -1)) ? \
- (int)(val) : 0; \
+ const int pao_ID__ = \
+ (__builtin_constant_p(val) && \
+ ((val) == 1 || \
+ (val) == (typeof(val))-1)) ? (int)(val) : 0; \
\
if (0) { \
typeof(var) pao_tmp__; \
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 4c2d080d26b4..593f10aabd45 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1775,6 +1775,43 @@ bool arch_is_platform_page(u64 paddr);
#define arch_is_platform_page arch_is_platform_page
#endif
+/*
+ * Use set_p*_safe(), and elide TLB flushing, when confident that *no*
+ * TLB flush will be required as a result of the "set". For example, use
+ * in scenarios where it is known ahead of time that the routine is
+ * setting non-present entries, or re-setting an existing entry to the
+ * same value. Otherwise, use the typical "set" helpers and flush the
+ * TLB.
+ */
+#define set_pte_safe(ptep, pte) \
+({ \
+ WARN_ON_ONCE(pte_present(*ptep) && !pte_same(*ptep, pte)); \
+ set_pte(ptep, pte); \
+})
+
+#define set_pmd_safe(pmdp, pmd) \
+({ \
+ WARN_ON_ONCE(pmd_present(*pmdp) && !pmd_same(*pmdp, pmd)); \
+ set_pmd(pmdp, pmd); \
+})
+
+#define set_pud_safe(pudp, pud) \
+({ \
+ WARN_ON_ONCE(pud_present(*pudp) && !pud_same(*pudp, pud)); \
+ set_pud(pudp, pud); \
+})
+
+#define set_p4d_safe(p4dp, p4d) \
+({ \
+ WARN_ON_ONCE(p4d_present(*p4dp) && !p4d_same(*p4dp, p4d)); \
+ set_p4d(p4dp, p4d); \
+})
+
+#define set_pgd_safe(pgdp, pgd) \
+({ \
+ WARN_ON_ONCE(pgd_present(*pgdp) && !pgd_same(*pgdp, pgd)); \
+ set_pgd(pgdp, pgd); \
+})
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_PGTABLE_H */
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index a98e53491a4e..ec68f8369bdc 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -141,7 +141,7 @@ extern unsigned int ptrs_per_p4d;
#endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */
#ifdef CONFIG_RANDOMIZE_MEMORY
-# define PHYSMEM_END physmem_end
+# define DIRECT_MAP_PHYSMEM_END direct_map_physmem_end
#endif
/*
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index 4b2abce2e3e7..cc62ef70ccc0 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -89,6 +89,7 @@ int set_pages_rw(struct page *page, int numpages);
int set_direct_map_invalid_noflush(struct page *page);
int set_direct_map_default_noflush(struct page *page);
+int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid);
bool kernel_page_present(struct page *page);
extern int kernel_set_to_readonly;
diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
index 6259f1937fe7..ab9e143ec9fe 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -35,6 +35,7 @@ extern void *text_poke(void *addr, const void *opcode, size_t len);
extern void text_poke_sync(void);
extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len);
extern void *text_poke_copy(void *addr, const void *opcode, size_t len);
+#define text_poke_copy text_poke_copy
extern void *text_poke_copy_locked(void *addr, const void *opcode, size_t len, bool core_ok);
extern void *text_poke_set(void *addr, int c, size_t len);
extern int poke_int3_handler(struct pt_regs *regs);
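The `#define text_poke_copy text_poke_copy` line is the usual kernel idiom for letting generic code detect, at preprocessing time, that the arch provides its own implementation; a sketch of how a consumer might key off it (the memcpy fallback is an assumption for illustration, not code from this patch):

	/* Illustrative only: generic fallback when the arch does not provide one. */
	#ifndef text_poke_copy
	static inline void *text_poke_copy(void *addr, const void *opcode, size_t len)
	{
		return memcpy(addr, opcode, len);	/* assumes text is directly writable */
	}
	#endif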
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index d17518ca19b8..243843e44e89 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -392,8 +392,10 @@ EXPORT_SYMBOL(BUG_func);
* Rewrite the "call BUG_func" replacement to point to the target of the
* indirect pv_ops call "call *disp(%ip)".
*/
-static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a)
+static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a,
+ struct module *mod)
{
+ u8 *wr_instr = module_writable_address(mod, instr);
void *target, *bug = &BUG_func;
s32 disp;
@@ -403,14 +405,14 @@ static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a)
}
if (a->instrlen != 6 ||
- instr[0] != CALL_RIP_REL_OPCODE ||
- instr[1] != CALL_RIP_REL_MODRM) {
+ wr_instr[0] != CALL_RIP_REL_OPCODE ||
+ wr_instr[1] != CALL_RIP_REL_MODRM) {
pr_err("ALT_FLAG_DIRECT_CALL set for unrecognized indirect call\n");
BUG();
}
/* Skip CALL_RIP_REL_OPCODE and CALL_RIP_REL_MODRM */
- disp = *(s32 *)(instr + 2);
+ disp = *(s32 *)(wr_instr + 2);
#ifdef CONFIG_X86_64
/* ff 15 00 00 00 00 call *0x0(%rip) */
/* target address is stored at "next instruction + disp". */
@@ -448,7 +450,8 @@ static inline u8 * instr_va(struct alt_instr *i)
* to refetch changed I$ lines.
*/
void __init_or_module noinline apply_alternatives(struct alt_instr *start,
- struct alt_instr *end)
+ struct alt_instr *end,
+ struct module *mod)
{
u8 insn_buff[MAX_PATCH_LEN];
u8 *instr, *replacement;
@@ -477,6 +480,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
*/
for (a = start; a < end; a++) {
int insn_buff_sz = 0;
+ u8 *wr_instr, *wr_replacement;
/*
* In case of nested ALTERNATIVE()s the outer alternative might
@@ -490,7 +494,11 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
}
instr = instr_va(a);
+ wr_instr = module_writable_address(mod, instr);
+
replacement = (u8 *)&a->repl_offset + a->repl_offset;
+ wr_replacement = module_writable_address(mod, replacement);
+
BUG_ON(a->instrlen > sizeof(insn_buff));
BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
@@ -501,9 +509,9 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
* patch if feature is *NOT* present.
*/
if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) {
- memcpy(insn_buff, instr, a->instrlen);
+ memcpy(insn_buff, wr_instr, a->instrlen);
optimize_nops(instr, insn_buff, a->instrlen);
- text_poke_early(instr, insn_buff, a->instrlen);
+ text_poke_early(wr_instr, insn_buff, a->instrlen);
continue;
}
@@ -513,11 +521,12 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
instr, instr, a->instrlen,
replacement, a->replacementlen, a->flags);
- memcpy(insn_buff, replacement, a->replacementlen);
+ memcpy(insn_buff, wr_replacement, a->replacementlen);
insn_buff_sz = a->replacementlen;
if (a->flags & ALT_FLAG_DIRECT_CALL) {
- insn_buff_sz = alt_replace_call(instr, insn_buff, a);
+ insn_buff_sz = alt_replace_call(instr, insn_buff, a,
+ mod);
if (insn_buff_sz < 0)
continue;
}
@@ -527,11 +536,11 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen);
- DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr);
+ DUMP_BYTES(ALT, wr_instr, a->instrlen, "%px: old_insn: ", instr);
DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
- text_poke_early(instr, insn_buff, insn_buff_sz);
+ text_poke_early(wr_instr, insn_buff, insn_buff_sz);
}
kasan_enable_current();
@@ -722,18 +731,20 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
/*
* Generated by 'objtool --retpoline'.
*/
-void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
+void __init_or_module noinline apply_retpolines(s32 *start, s32 *end,
+ struct module *mod)
{
s32 *s;
for (s = start; s < end; s++) {
void *addr = (void *)s + *s;
+ void *wr_addr = module_writable_address(mod, addr);
struct insn insn;
int len, ret;
u8 bytes[16];
u8 op1, op2;
- ret = insn_decode_kernel(&insn, addr);
+ ret = insn_decode_kernel(&insn, wr_addr);
if (WARN_ON_ONCE(ret < 0))
continue;
@@ -761,9 +772,9 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
len = patch_retpoline(addr, &insn, bytes);
if (len == insn.length) {
optimize_nops(addr, bytes, len);
- DUMP_BYTES(RETPOLINE, ((u8*)addr), len, "%px: orig: ", addr);
+ DUMP_BYTES(RETPOLINE, ((u8*)wr_addr), len, "%px: orig: ", addr);
DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr);
- text_poke_early(addr, bytes, len);
+ text_poke_early(wr_addr, bytes, len);
}
}
}
@@ -799,7 +810,8 @@ static int patch_return(void *addr, struct insn *insn, u8 *bytes)
return i;
}
-void __init_or_module noinline apply_returns(s32 *start, s32 *end)
+void __init_or_module noinline apply_returns(s32 *start, s32 *end,
+ struct module *mod)
{
s32 *s;
@@ -808,12 +820,13 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end)
for (s = start; s < end; s++) {
void *dest = NULL, *addr = (void *)s + *s;
+ void *wr_addr = module_writable_address(mod, addr);
struct insn insn;
int len, ret;
u8 bytes[16];
u8 op;
- ret = insn_decode_kernel(&insn, addr);
+ ret = insn_decode_kernel(&insn, wr_addr);
if (WARN_ON_ONCE(ret < 0))
continue;
@@ -833,32 +846,35 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end)
len = patch_return(addr, &insn, bytes);
if (len == insn.length) {
- DUMP_BYTES(RET, ((u8*)addr), len, "%px: orig: ", addr);
+ DUMP_BYTES(RET, ((u8*)wr_addr), len, "%px: orig: ", addr);
DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr);
- text_poke_early(addr, bytes, len);
+ text_poke_early(wr_addr, bytes, len);
}
}
}
#else
-void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
+void __init_or_module noinline apply_returns(s32 *start, s32 *end,
+ struct module *mod) { }
#endif /* CONFIG_MITIGATION_RETHUNK */
#else /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */
-void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { }
-void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
+void __init_or_module noinline apply_retpolines(s32 *start, s32 *end,
+ struct module *mod) { }
+void __init_or_module noinline apply_returns(s32 *start, s32 *end,
+ struct module *mod) { }
#endif /* CONFIG_MITIGATION_RETPOLINE && CONFIG_OBJTOOL */
#ifdef CONFIG_X86_KERNEL_IBT
-static void poison_cfi(void *addr);
+static void poison_cfi(void *addr, void *wr_addr);
-static void __init_or_module poison_endbr(void *addr, bool warn)
+static void __init_or_module poison_endbr(void *addr, void *wr_addr, bool warn)
{
u32 endbr, poison = gen_endbr_poison();
- if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr)))
+ if (WARN_ON_ONCE(get_kernel_nofault(endbr, wr_addr)))
return;
if (!is_endbr(endbr)) {
@@ -873,7 +889,7 @@ static void __init_or_module poison_endbr(void *addr, bool warn)
*/
DUMP_BYTES(ENDBR, ((u8*)addr), 4, "%px: orig: ", addr);
DUMP_BYTES(ENDBR, ((u8*)&poison), 4, "%px: repl: ", addr);
- text_poke_early(addr, &poison, 4);
+ text_poke_early(wr_addr, &poison, 4);
}
/*
@@ -882,22 +898,23 @@ static void __init_or_module poison_endbr(void *addr, bool warn)
* Seal the functions for indirect calls by clobbering the ENDBR instructions
* and the kCFI hash value.
*/
-void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end)
+void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end, struct module *mod)
{
s32 *s;
for (s = start; s < end; s++) {
void *addr = (void *)s + *s;
+ void *wr_addr = module_writable_address(mod, addr);
- poison_endbr(addr, true);
+ poison_endbr(addr, wr_addr, true);
if (IS_ENABLED(CONFIG_FINEIBT))
- poison_cfi(addr - 16);
+ poison_cfi(addr - 16, wr_addr - 16);
}
}
#else
-void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { }
+void __init_or_module apply_seal_endbr(s32 *start, s32 *end, struct module *mod) { }
#endif /* CONFIG_X86_KERNEL_IBT */
@@ -1119,7 +1136,7 @@ static u32 decode_caller_hash(void *addr)
}
/* .retpoline_sites */
-static int cfi_disable_callers(s32 *start, s32 *end)
+static int cfi_disable_callers(s32 *start, s32 *end, struct module *mod)
{
/*
* Disable kCFI by patching in a JMP.d8, this leaves the hash immediate
@@ -1131,20 +1148,23 @@ static int cfi_disable_callers(s32 *start, s32 *end)
for (s = start; s < end; s++) {
void *addr = (void *)s + *s;
+ void *wr_addr;
u32 hash;
addr -= fineibt_caller_size;
- hash = decode_caller_hash(addr);
+ wr_addr = module_writable_address(mod, addr);
+ hash = decode_caller_hash(wr_addr);
+
if (!hash) /* nocfi callers */
continue;
- text_poke_early(addr, jmp, 2);
+ text_poke_early(wr_addr, jmp, 2);
}
return 0;
}
-static int cfi_enable_callers(s32 *start, s32 *end)
+static int cfi_enable_callers(s32 *start, s32 *end, struct module *mod)
{
/*
* Re-enable kCFI, undo what cfi_disable_callers() did.
@@ -1154,106 +1174,115 @@ static int cfi_enable_callers(s32 *start, s32 *end)
for (s = start; s < end; s++) {
void *addr = (void *)s + *s;
+ void *wr_addr;
u32 hash;
addr -= fineibt_caller_size;
- hash = decode_caller_hash(addr);
+ wr_addr = module_writable_address(mod, addr);
+ hash = decode_caller_hash(wr_addr);
if (!hash) /* nocfi callers */
continue;
- text_poke_early(addr, mov, 2);
+ text_poke_early(wr_addr, mov, 2);
}
return 0;
}
/* .cfi_sites */
-static int cfi_rand_preamble(s32 *start, s32 *end)
+static int cfi_rand_preamble(s32 *start, s32 *end, struct module *mod)
{
s32 *s;
for (s = start; s < end; s++) {
void *addr = (void *)s + *s;
+ void *wr_addr = module_writable_address(mod, addr);
u32 hash;
- hash = decode_preamble_hash(addr);
+ hash = decode_preamble_hash(wr_addr);
if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
addr, addr, 5, addr))
return -EINVAL;
hash = cfi_rehash(hash);
- text_poke_early(addr + 1, &hash, 4);
+ text_poke_early(wr_addr + 1, &hash, 4);
}
return 0;
}
-static int cfi_rewrite_preamble(s32 *start, s32 *end)
+static int cfi_rewrite_preamble(s32 *start, s32 *end, struct module *mod)
{
s32 *s;
for (s = start; s < end; s++) {
void *addr = (void *)s + *s;
+ void *wr_addr = module_writable_address(mod, addr);
u32 hash;
- hash = decode_preamble_hash(addr);
+ hash = decode_preamble_hash(wr_addr);
if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
addr, addr, 5, addr))
return -EINVAL;
- text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size);
- WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678);
- text_poke_early(addr + fineibt_preamble_hash, &hash, 4);
+ text_poke_early(wr_addr, fineibt_preamble_start, fineibt_preamble_size);
+ WARN_ON(*(u32 *)(wr_addr + fineibt_preamble_hash) != 0x12345678);
+ text_poke_early(wr_addr + fineibt_preamble_hash, &hash, 4);
}
return 0;
}
-static void cfi_rewrite_endbr(s32 *start, s32 *end)
+static void cfi_rewrite_endbr(s32 *start, s32 *end, struct module *mod)
{
s32 *s;
for (s = start; s < end; s++) {
void *addr = (void *)s + *s;
+ void *wr_addr = module_writable_address(mod, addr);
- poison_endbr(addr+16, false);
+ poison_endbr(addr + 16, wr_addr + 16, false);
}
}
/* .retpoline_sites */
-static int cfi_rand_callers(s32 *start, s32 *end)
+static int cfi_rand_callers(s32 *start, s32 *end, struct module *mod)
{
s32 *s;
for (s = start; s < end; s++) {
void *addr = (void *)s + *s;
+ void *wr_addr;
u32 hash;
addr -= fineibt_caller_size;
- hash = decode_caller_hash(addr);
+ wr_addr = module_writable_address(mod, addr);
+ hash = decode_caller_hash(wr_addr);
if (hash) {
hash = -cfi_rehash(hash);
- text_poke_early(addr + 2, &hash, 4);
+ text_poke_early(wr_addr + 2, &hash, 4);
}
}
return 0;
}
-static int cfi_rewrite_callers(s32 *start, s32 *end)
+static int cfi_rewrite_callers(s32 *start, s32 *end, struct module *mod)
{
s32 *s;
for (s = start; s < end; s++) {
void *addr = (void *)s + *s;
+ void *wr_addr;
u32 hash;
addr -= fineibt_caller_size;
- hash = decode_caller_hash(addr);
+ wr_addr = module_writable_address(mod, addr);
+ hash = decode_caller_hash(wr_addr);
if (hash) {
- text_poke_early(addr, fineibt_caller_start, fineibt_caller_size);
- WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678);
- text_poke_early(addr + fineibt_caller_hash, &hash, 4);
+ text_poke_early(wr_addr, fineibt_caller_start, fineibt_caller_size);
+ WARN_ON(*(u32 *)(wr_addr + fineibt_caller_hash) != 0x12345678);
+ text_poke_early(wr_addr + fineibt_caller_hash, &hash, 4);
}
/* rely on apply_retpolines() */
}
@@ -1262,8 +1291,9 @@ static int cfi_rewrite_callers(s32 *start, s32 *end)
}
static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
- s32 *start_cfi, s32 *end_cfi, bool builtin)
+ s32 *start_cfi, s32 *end_cfi, struct module *mod)
{
+ bool builtin = !mod;
int ret;
if (WARN_ONCE(fineibt_preamble_size != 16,
@@ -1281,7 +1311,7 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
* rewrite them. This disables all CFI. If this succeeds but any of the
* later stages fails, we're without CFI.
*/
- ret = cfi_disable_callers(start_retpoline, end_retpoline);
+ ret = cfi_disable_callers(start_retpoline, end_retpoline, mod);
if (ret)
goto err;
@@ -1292,11 +1322,11 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
cfi_bpf_subprog_hash = cfi_rehash(cfi_bpf_subprog_hash);
}
- ret = cfi_rand_preamble(start_cfi, end_cfi);
+ ret = cfi_rand_preamble(start_cfi, end_cfi, mod);
if (ret)
goto err;
- ret = cfi_rand_callers(start_retpoline, end_retpoline);
+ ret = cfi_rand_callers(start_retpoline, end_retpoline, mod);
if (ret)
goto err;
}
@@ -1308,7 +1338,7 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
return;
case CFI_KCFI:
- ret = cfi_enable_callers(start_retpoline, end_retpoline);
+ ret = cfi_enable_callers(start_retpoline, end_retpoline, mod);
if (ret)
goto err;
@@ -1318,17 +1348,17 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
case CFI_FINEIBT:
/* place the FineIBT preamble at func()-16 */
- ret = cfi_rewrite_preamble(start_cfi, end_cfi);
+ ret = cfi_rewrite_preamble(start_cfi, end_cfi, mod);
if (ret)
goto err;
/* rewrite the callers to target func()-16 */
- ret = cfi_rewrite_callers(start_retpoline, end_retpoline);
+ ret = cfi_rewrite_callers(start_retpoline, end_retpoline, mod);
if (ret)
goto err;
/* now that nobody targets func()+0, remove ENDBR there */
- cfi_rewrite_endbr(start_cfi, end_cfi);
+ cfi_rewrite_endbr(start_cfi, end_cfi, mod);
if (builtin)
pr_info("Using FineIBT CFI\n");
@@ -1347,7 +1377,7 @@ static inline void poison_hash(void *addr)
*(u32 *)addr = 0;
}
-static void poison_cfi(void *addr)
+static void poison_cfi(void *addr, void *wr_addr)
{
switch (cfi_mode) {
case CFI_FINEIBT:
@@ -1359,8 +1389,8 @@ static void poison_cfi(void *addr)
* ud2
* 1: nop
*/
- poison_endbr(addr, false);
- poison_hash(addr + fineibt_preamble_hash);
+ poison_endbr(addr, wr_addr, false);
+ poison_hash(wr_addr + fineibt_preamble_hash);
break;
case CFI_KCFI:
@@ -1369,7 +1399,7 @@ static void poison_cfi(void *addr)
* movl $0, %eax
* .skip 11, 0x90
*/
- poison_hash(addr + 1);
+ poison_hash(wr_addr + 1);
break;
default:
@@ -1380,22 +1410,21 @@ static void poison_cfi(void *addr)
#else
static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
- s32 *start_cfi, s32 *end_cfi, bool builtin)
+ s32 *start_cfi, s32 *end_cfi, struct module *mod)
{
}
#ifdef CONFIG_X86_KERNEL_IBT
-static void poison_cfi(void *addr) { }
+static void poison_cfi(void *addr, void *wr_addr) { }
#endif
#endif
void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
- s32 *start_cfi, s32 *end_cfi)
+ s32 *start_cfi, s32 *end_cfi, struct module *mod)
{
return __apply_fineibt(start_retpoline, end_retpoline,
- start_cfi, end_cfi,
- /* .builtin = */ false);
+ start_cfi, end_cfi, mod);
}
#ifdef CONFIG_SMP
@@ -1692,16 +1721,16 @@ void __init alternative_instructions(void)
paravirt_set_cap();
__apply_fineibt(__retpoline_sites, __retpoline_sites_end,
- __cfi_sites, __cfi_sites_end, true);
+ __cfi_sites, __cfi_sites_end, NULL);
/*
* Rewrite the retpolines, must be done before alternatives since
* those can rewrite the retpoline thunks.
*/
- apply_retpolines(__retpoline_sites, __retpoline_sites_end);
- apply_returns(__return_sites, __return_sites_end);
+ apply_retpolines(__retpoline_sites, __retpoline_sites_end, NULL);
+ apply_returns(__return_sites, __return_sites_end, NULL);
- apply_alternatives(__alt_instructions, __alt_instructions_end);
+ apply_alternatives(__alt_instructions, __alt_instructions_end, NULL);
/*
* Now all calls are established. Apply the call thunks if
@@ -1712,7 +1741,7 @@ void __init alternative_instructions(void)
/*
* Seal all functions that do not have their address taken.
*/
- apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end);
+ apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end, NULL);
#ifdef CONFIG_SMP
/* Patch to UP if other cpus not imminent. */
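The hunks above all follow one pattern: addresses that are decoded, dumped, or reported keep using the final virtual address, while every write is routed through module_writable_address(), which the NULL callers in alternative_instructions() imply returns the address unchanged for core kernel text. A minimal sketch of that pattern, with a made-up helper name:

static void __init_or_module patch_text_site(struct module *mod, void *addr,
					     const void *insn, size_t len)
{
	/* Writes go through the writable alias of (possibly ROX) module text. */
	void *wr_addr = module_writable_address(mod, addr);

	text_poke_early(wr_addr, insn, len);

	/* Diagnostics keep reporting the final virtual address. */
	pr_debug("patched %zu bytes at %px\n", len, addr);
}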
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index adb09f78edb2..4dd0ad6c94d6 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -118,10 +118,13 @@ ftrace_modify_code_direct(unsigned long ip, const char *old_code,
return ret;
/* replace the text with the new text */
- if (ftrace_poke_late)
+ if (ftrace_poke_late) {
text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL);
- else
- text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE);
+ } else {
+ mutex_lock(&text_mutex);
+ text_poke((void *)ip, new_code, MCOUNT_INSN_SIZE);
+ mutex_unlock(&text_mutex);
+ }
return 0;
}
@@ -318,7 +321,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
unsigned const char op_ref[] = { 0x48, 0x8b, 0x15 };
unsigned const char retq[] = { RET_INSN_OPCODE, INT3_INSN_OPCODE };
union ftrace_op_code_union op_ptr;
- int ret;
+ void *ret;
if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
start_offset = (unsigned long)ftrace_regs_caller;
@@ -349,15 +352,15 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
npages = DIV_ROUND_UP(*tramp_size, PAGE_SIZE);
/* Copy ftrace_caller onto the trampoline memory */
- ret = copy_from_kernel_nofault(trampoline, (void *)start_offset, size);
- if (WARN_ON(ret < 0))
+ ret = text_poke_copy(trampoline, (void *)start_offset, size);
+ if (WARN_ON(!ret))
goto fail;
ip = trampoline + size;
if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
__text_gen_insn(ip, JMP32_INSN_OPCODE, ip, x86_return_thunk, JMP32_INSN_SIZE);
else
- memcpy(ip, retq, sizeof(retq));
+ text_poke_copy(ip, retq, sizeof(retq));
/* No need to test direct calls on created trampolines */
if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
@@ -365,8 +368,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
ip = trampoline + (jmp_offset - start_offset);
if (WARN_ON(*(char *)ip != 0x75))
goto fail;
- ret = copy_from_kernel_nofault(ip, x86_nops[2], 2);
- if (ret < 0)
+ if (!text_poke_copy(ip, x86_nops[2], 2))
goto fail;
}
@@ -379,7 +381,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
*/
ptr = (unsigned long *)(trampoline + size + RET_SIZE);
- *ptr = (unsigned long)ops;
+ text_poke_copy(ptr, &ops, sizeof(unsigned long));
op_offset -= start_offset;
memcpy(&op_ptr, trampoline + op_offset, OP_REF_SIZE);
@@ -395,7 +397,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
op_ptr.offset = offset;
/* put in the new offset to the ftrace_ops */
- memcpy(trampoline + op_offset, &op_ptr, OP_REF_SIZE);
+ text_poke_copy(trampoline + op_offset, &op_ptr, OP_REF_SIZE);
/* put in the call to the function */
mutex_lock(&text_mutex);
@@ -405,9 +407,9 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
* the depth accounting before the call already.
*/
dest = ftrace_ops_get_func(ops);
- memcpy(trampoline + call_offset,
- text_gen_insn(CALL_INSN_OPCODE, trampoline + call_offset, dest),
- CALL_INSN_SIZE);
+ text_poke_copy_locked(trampoline + call_offset,
+ text_gen_insn(CALL_INSN_OPCODE, trampoline + call_offset, dest),
+ CALL_INSN_SIZE, false);
mutex_unlock(&text_mutex);
/* ALLOC_TRAMP flags lets us know we created it */
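The trampoline setup above switches from memcpy()/copy_from_kernel_nofault() to text_poke_copy(), so the error convention changes from a 0/-errno return to a pointer that is NULL on failure, which is why ret becomes a void *. A small sketch of the adjusted error handling; mapping failure to -EFAULT is an assumption, not something this hunk specifies:

static int copy_into_trampoline(void *tramp, const void *src, size_t len)
{
	/* text_poke_copy() returns NULL on failure rather than a -errno value. */
	if (!text_poke_copy(tramp, src, len))
		return -EFAULT;		/* assumed errno for a failed copy */
	return 0;
}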
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 837450b6e882..8984abd91c00 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -146,18 +146,21 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs,
}
if (apply) {
- if (memcmp(loc, &zero, size)) {
+ void *wr_loc = module_writable_address(me, loc);
+
+ if (memcmp(wr_loc, &zero, size)) {
pr_err("x86/modules: Invalid relocation target, existing value is nonzero for type %d, loc %p, val %Lx\n",
(int)ELF64_R_TYPE(rel[i].r_info), loc, val);
return -ENOEXEC;
}
- write(loc, &val, size);
+ write(wr_loc, &val, size);
} else {
if (memcmp(loc, &val, size)) {
pr_warn("x86/modules: Invalid relocation target, existing value does not match expected value for type %d, loc %p, val %Lx\n",
(int)ELF64_R_TYPE(rel[i].r_info), loc, val);
return -ENOEXEC;
}
+ /* FIXME: needs care for ROX module allocations */
write(loc, &zero, size);
}
}
@@ -224,7 +227,7 @@ int module_finalize(const Elf_Ehdr *hdr,
const Elf_Shdr *sechdrs,
struct module *me)
{
- const Elf_Shdr *s, *alt = NULL, *locks = NULL,
+ const Elf_Shdr *s, *alt = NULL,
*orc = NULL, *orc_ip = NULL,
*retpolines = NULL, *returns = NULL, *ibt_endbr = NULL,
*calls = NULL, *cfi = NULL;
@@ -233,8 +236,6 @@ int module_finalize(const Elf_Ehdr *hdr,
for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
if (!strcmp(".altinstructions", secstrings + s->sh_name))
alt = s;
- if (!strcmp(".smp_locks", secstrings + s->sh_name))
- locks = s;
if (!strcmp(".orc_unwind", secstrings + s->sh_name))
orc = s;
if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name))
@@ -265,20 +266,20 @@ int module_finalize(const Elf_Ehdr *hdr,
csize = cfi->sh_size;
}
- apply_fineibt(rseg, rseg + rsize, cseg, cseg + csize);
+ apply_fineibt(rseg, rseg + rsize, cseg, cseg + csize, me);
}
if (retpolines) {
void *rseg = (void *)retpolines->sh_addr;
- apply_retpolines(rseg, rseg + retpolines->sh_size);
+ apply_retpolines(rseg, rseg + retpolines->sh_size, me);
}
if (returns) {
void *rseg = (void *)returns->sh_addr;
- apply_returns(rseg, rseg + returns->sh_size);
+ apply_returns(rseg, rseg + returns->sh_size, me);
}
if (alt) {
/* patch .altinstructions */
void *aseg = (void *)alt->sh_addr;
- apply_alternatives(aseg, aseg + alt->sh_size);
+ apply_alternatives(aseg, aseg + alt->sh_size, me);
}
if (calls || alt) {
struct callthunk_sites cs = {};
@@ -297,8 +298,28 @@ int module_finalize(const Elf_Ehdr *hdr,
}
if (ibt_endbr) {
void *iseg = (void *)ibt_endbr->sh_addr;
- apply_seal_endbr(iseg, iseg + ibt_endbr->sh_size);
+ apply_seal_endbr(iseg, iseg + ibt_endbr->sh_size, me);
}
+
+ if (orc && orc_ip)
+ unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size,
+ (void *)orc->sh_addr, orc->sh_size);
+
+ return 0;
+}
+
+int module_post_finalize(const Elf_Ehdr *hdr,
+ const Elf_Shdr *sechdrs,
+ struct module *me)
+{
+ const Elf_Shdr *s, *locks = NULL;
+ char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
+
+ for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
+ if (!strcmp(".smp_locks", secstrings + s->sh_name))
+ locks = s;
+ }
+
if (locks) {
void *lseg = (void *)locks->sh_addr;
void *text = me->mem[MOD_TEXT].base;
@@ -308,10 +329,6 @@ int module_finalize(const Elf_Ehdr *hdr,
text, text_end);
}
- if (orc && orc_ip)
- unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size,
- (void *)orc->sh_addr, orc->sh_size);
-
return 0;
}
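module_finalize() now ends after registering the ORC tables, and the .smp_locks patching moves to the new module_post_finalize(). A sketch of the assumed call order; the generic-loader call site is not part of this diff:

/* Assumed ordering in the module loader; the real call site is elsewhere. */
static int finalize_arch_sections(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs,
				  struct module *me)
{
	int err;

	err = module_finalize(hdr, sechdrs, me);	/* alternatives, returns, ORC */
	if (err)
		return err;

	return module_post_finalize(hdr, sechdrs, me);	/* .smp_locks */
}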
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 87f8c9a71c49..776ae6fa7f2d 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -18,6 +18,7 @@
#include <linux/random.h>
#include <linux/uaccess.h>
#include <linux/elf.h>
+#include <linux/hugetlb.h>
#include <asm/elf.h>
#include <asm/ia32.h>
@@ -25,8 +26,10 @@
/*
* Align a virtual address to avoid aliasing in the I$ on AMD F15h.
*/
-static unsigned long get_align_mask(void)
+static unsigned long get_align_mask(struct file *filp)
{
+ if (filp && is_file_hugepages(filp))
+ return huge_page_mask_align(filp);
/* handle 32- and 64-bit case with a single conditional */
if (va_align.flags < 0 || !(va_align.flags & (2 - mmap_is_ia32())))
return 0;
@@ -49,7 +52,7 @@ static unsigned long get_align_mask(void)
*/
static unsigned long get_align_bits(void)
{
- return va_align.bits & get_align_mask();
+ return va_align.bits & get_align_mask(NULL);
}
static int __init control_va_addr_alignment(char *str)
@@ -148,12 +151,15 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len,
info.length = len;
info.low_limit = begin;
info.high_limit = end;
- info.align_offset = pgoff << PAGE_SHIFT;
- info.start_gap = stack_guard_placement(vm_flags);
+ if (!(filp && is_file_hugepages(filp))) {
+ info.align_offset = pgoff << PAGE_SHIFT;
+ info.start_gap = stack_guard_placement(vm_flags);
+ }
if (filp) {
- info.align_mask = get_align_mask();
+ info.align_mask = get_align_mask(filp);
info.align_offset += get_align_bits();
}
+
return vm_unmapped_area(&info);
}
@@ -199,7 +205,10 @@ get_unmapped_area:
info.low_limit = PAGE_SIZE;
info.high_limit = get_mmap_base(0);
- info.start_gap = stack_guard_placement(vm_flags);
+ if (!(filp && is_file_hugepages(filp))) {
+ info.start_gap = stack_guard_placement(vm_flags);
+ info.align_offset = pgoff << PAGE_SHIFT;
+ }
/*
* If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
@@ -211,9 +220,8 @@ get_unmapped_area:
if (addr > DEFAULT_MAP_WINDOW && !in_32bit_syscall())
info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW;
- info.align_offset = pgoff << PAGE_SHIFT;
if (filp) {
- info.align_mask = get_align_mask();
+ info.align_mask = get_align_mask(filp);
info.align_offset += get_align_bits();
}
addr = vm_unmapped_area(&info);
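get_align_mask() now takes the file so that hugetlbfs mappings get a huge-page-sized alignment mask from the generic get_unmapped_area path. huge_page_mask_align() is not defined in this diff; judging by the "PAGE_MASK & ~huge_page_mask(h)" expression it replaces in arch/x86/mm/hugetlbpage.c below, it is assumed to be equivalent to:

/* Assumed equivalent of huge_page_mask_align(), mirroring the removed
 * alignment expression in arch/x86/mm/hugetlbpage.c. */
static inline unsigned long huge_page_mask_align_sketch(struct file *file)
{
	return PAGE_MASK & ~huge_page_mask(hstate_file(file));
}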
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 807a5859a3c4..58f7f2bd535d 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -19,107 +19,6 @@
#include <asm/tlbflush.h>
#include <asm/elf.h>
-#ifdef CONFIG_HUGETLB_PAGE
-static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
- unsigned long addr, unsigned long len,
- unsigned long pgoff, unsigned long flags)
-{
- struct hstate *h = hstate_file(file);
- struct vm_unmapped_area_info info = {};
-
- info.length = len;
- info.low_limit = get_mmap_base(1);
-
- /*
- * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
- * in the full address space.
- */
- info.high_limit = in_32bit_syscall() ?
- task_size_32bit() : task_size_64bit(addr > DEFAULT_MAP_WINDOW);
-
- info.align_mask = PAGE_MASK & ~huge_page_mask(h);
- return vm_unmapped_area(&info);
-}
-
-static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
- unsigned long addr, unsigned long len,
- unsigned long pgoff, unsigned long flags)
-{
- struct hstate *h = hstate_file(file);
- struct vm_unmapped_area_info info = {};
-
- info.flags = VM_UNMAPPED_AREA_TOPDOWN;
- info.length = len;
- info.low_limit = PAGE_SIZE;
- info.high_limit = get_mmap_base(0);
-
- /*
- * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
- * in the full address space.
- */
- if (addr > DEFAULT_MAP_WINDOW && !in_32bit_syscall())
- info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW;
-
- info.align_mask = PAGE_MASK & ~huge_page_mask(h);
- addr = vm_unmapped_area(&info);
-
- /*
- * A failed mmap() very likely causes application failure,
- * so fall back to the bottom-up function here. This scenario
- * can happen with large stack limits and large mmap()
- * allocations.
- */
- if (addr & ~PAGE_MASK) {
- VM_BUG_ON(addr != -ENOMEM);
- info.flags = 0;
- info.low_limit = TASK_UNMAPPED_BASE;
- info.high_limit = TASK_SIZE_LOW;
- addr = vm_unmapped_area(&info);
- }
-
- return addr;
-}
-
-unsigned long
-hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
- unsigned long len, unsigned long pgoff, unsigned long flags)
-{
- struct hstate *h = hstate_file(file);
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
-
- if (len & ~huge_page_mask(h))
- return -EINVAL;
-
- if (len > TASK_SIZE)
- return -ENOMEM;
-
- /* No address checking. See comment at mmap_address_hint_valid() */
- if (flags & MAP_FIXED) {
- if (prepare_hugepage_range(file, addr, len))
- return -EINVAL;
- return addr;
- }
-
- if (addr) {
- addr &= huge_page_mask(h);
- if (!mmap_address_hint_valid(addr, len))
- goto get_unmapped_area;
-
- vma = find_vma(mm, addr);
- if (!vma || addr + len <= vm_start_gap(vma))
- return addr;
- }
-
-get_unmapped_area:
- if (!test_bit(MMF_TOPDOWN, &mm->flags))
- return hugetlb_get_unmapped_area_bottomup(file, addr, len,
- pgoff, flags);
- else
- return hugetlb_get_unmapped_area_topdown(file, addr, len,
- pgoff, flags);
-}
-#endif /* CONFIG_HUGETLB_PAGE */
#ifdef CONFIG_X86_64
bool __init arch_hugetlb_valid_size(unsigned long size)
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 101725c149c4..c6d29f283001 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -1058,18 +1058,53 @@ unsigned long arch_max_swapfile_size(void)
#ifdef CONFIG_EXECMEM
static struct execmem_info execmem_info __ro_after_init;
+#ifdef CONFIG_ARCH_HAS_EXECMEM_ROX
+void execmem_fill_trapping_insns(void *ptr, size_t size, bool writeable)
+{
+ /* fill memory with INT3 instructions */
+ if (writeable)
+ memset(ptr, INT3_INSN_OPCODE, size);
+ else
+ text_poke_set(ptr, INT3_INSN_OPCODE, size);
+}
+#endif
+
struct execmem_info __init *execmem_arch_setup(void)
{
unsigned long start, offset = 0;
+ enum execmem_range_flags flags;
+ pgprot_t pgprot;
if (kaslr_enabled())
offset = get_random_u32_inclusive(1, 1024) * PAGE_SIZE;
start = MODULES_VADDR + offset;
+ if (IS_ENABLED(CONFIG_ARCH_HAS_EXECMEM_ROX)) {
+ pgprot = PAGE_KERNEL_ROX;
+ flags = EXECMEM_KASAN_SHADOW | EXECMEM_ROX_CACHE;
+ } else {
+ pgprot = PAGE_KERNEL;
+ flags = EXECMEM_KASAN_SHADOW;
+ }
+
execmem_info = (struct execmem_info){
.ranges = {
- [EXECMEM_DEFAULT] = {
+ [EXECMEM_MODULE_TEXT] = {
+ .flags = flags,
+ .start = start,
+ .end = MODULES_END,
+ .pgprot = pgprot,
+ .alignment = MODULE_ALIGN,
+ },
+ [EXECMEM_KPROBES ... EXECMEM_BPF] = {
+ .flags = EXECMEM_KASAN_SHADOW,
+ .start = start,
+ .end = MODULES_END,
+ .pgprot = PAGE_KERNEL,
+ .alignment = MODULE_ALIGN,
+ },
+ [EXECMEM_MODULE_DATA] = {
.flags = EXECMEM_KASAN_SHADOW,
.start = start,
.end = MODULES_END,
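With CONFIG_ARCH_HAS_EXECMEM_ROX the module-text range is mapped PAGE_KERNEL_ROX and cached, so scrubbing it cannot use plain stores; execmem_fill_trapping_insns() above hides that distinction behind the "writeable" flag. A hypothetical caller (the execmem core call site is not shown in this diff):

/* Hypothetical caller: poison a region that is going back into the ROX
 * cache. writeable == false routes the fill through text_poke_set(). */
static void scrub_rox_range(void *ptr, size_t size)
{
	execmem_fill_trapping_insns(ptr, size, false);
}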
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index ff253648706f..01ea7c6df303 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -961,7 +961,7 @@ int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
unsigned long end = ((start_pfn + nr_pages) << PAGE_SHIFT) - 1;
int ret;
- if (WARN_ON_ONCE(end > PHYSMEM_END))
+ if (WARN_ON_ONCE(end > DIRECT_MAP_PHYSMEM_END))
return -ERANGE;
ret = __add_pages(nid, start_pfn, nr_pages, params);
@@ -985,22 +985,32 @@ int arch_add_memory(int nid, u64 start, u64 size,
return add_pages(nid, start_pfn, nr_pages, params);
}
-static void __meminit free_pagetable(struct page *page, int order)
+static void free_reserved_pages(struct page *page, unsigned long nr_pages)
{
- unsigned long magic;
- unsigned int nr_pages = 1 << order;
+ while (nr_pages--)
+ free_reserved_page(page++);
+}
+static void __meminit free_pagetable(struct page *page, int order)
+{
/* bootmem page has reserved flag */
if (PageReserved(page)) {
- magic = page->index;
- if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
+ unsigned long nr_pages = 1 << order;
+#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
+ enum bootmem_type type = bootmem_type(page);
+
+ if (type == SECTION_INFO || type == MIX_SECTION_INFO) {
while (nr_pages--)
put_page_bootmem(page++);
- } else
- while (nr_pages--)
- free_reserved_page(page++);
- } else
+ } else {
+ free_reserved_pages(page, nr_pages);
+ }
+#else
+ free_reserved_pages(page, nr_pages);
+#endif
+ } else {
free_pages((unsigned long)page_address(page), order);
+ }
}
static void __meminit free_hugepage_table(struct page *page,
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index e17e6e27b7ec..11a93542d198 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -52,7 +52,7 @@ static __initdata struct kaslr_memory_region {
} kaslr_regions[] = {
{
.base = &page_offset_base,
- .end = &physmem_end,
+ .end = &direct_map_physmem_end,
},
{
.base = &vmalloc_base,
@@ -62,8 +62,12 @@ static __initdata struct kaslr_memory_region {
},
};
-/* The end of the possible address space for physical memory */
-unsigned long physmem_end __ro_after_init;
+/*
+ * The end of the physical address space that can be mapped directly by the
+ * kernel. This starts out at ((1<<MAX_PHYSMEM_BITS) - 1), but KASLR may reduce
+ * that in order to increase the available entropy for mapping other regions.
+ */
+unsigned long direct_map_physmem_end __ro_after_init;
/* Get size in bytes used by the memory region */
static inline unsigned long get_padding(struct kaslr_memory_region *region)
@@ -94,7 +98,7 @@ void __init kernel_randomize_memory(void)
BUILD_BUG_ON(vaddr_end > __START_KERNEL_map);
/* Preset the end of the possible address space for physical memory */
- physmem_end = ((1ULL << MAX_PHYSMEM_BITS) - 1);
+ direct_map_physmem_end = ((1ULL << MAX_PHYSMEM_BITS) - 1);
if (!kaslr_memory_enabled())
return;
@@ -145,7 +149,7 @@ void __init kernel_randomize_memory(void)
vaddr += get_padding(&kaslr_regions[i]);
/*
* KASLR trims the maximum possible size of the
- * direct-map. Update the physmem_end boundary.
+ * direct-map. Update the direct_map_physmem_end boundary.
* No rounding required as the region starts
* PUD aligned and size is in units of TB.
*/
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 44f7b2ea6a07..069e421c2247 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -2444,6 +2444,14 @@ int set_direct_map_default_noflush(struct page *page)
return __set_pages_p(page, 1);
}
+int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid)
+{
+ if (valid)
+ return __set_pages_p(page, nr);
+
+ return __set_pages_np(page, nr);
+}
+
#ifdef CONFIG_DEBUG_PAGEALLOC
void __kernel_map_pages(struct page *page, int numpages, int enable)
{
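set_direct_map_valid_noflush() batches what set_direct_map_default_noflush() and set_direct_map_invalid_noflush() did one page at a time. A sketch of a caller; leaving the TLB flush to the caller is an assumption suggested by the "_noflush" suffix rather than something stated in the hunk:

/* Drop @nr pages from the direct map; the caller is assumed to flush the
 * TLB afterwards, e.g. via flush_tlb_kernel_range(). */
static int direct_map_set_invalid(struct page *page, unsigned int nr)
{
	return set_direct_map_valid_noflush(page, nr, false);
}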
diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild
index fa07c686cbcc..cc5dba738389 100644
--- a/arch/xtensa/include/asm/Kbuild
+++ b/arch/xtensa/include/asm/Kbuild
@@ -8,3 +8,4 @@ generic-y += parport.h
generic-y += qrwlock.h
generic-y += qspinlock.h
generic-y += user.h
+generic-y += text-patching.h
diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h
index 1ff0c858544f..99d4ccee7f6e 100644
--- a/arch/xtensa/include/uapi/asm/mman.h
+++ b/arch/xtensa/include/uapi/asm/mman.h
@@ -113,6 +113,9 @@
#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */
+#define MADV_GUARD_INSTALL 102 /* fatal signal on access to range */
+#define MADV_GUARD_REMOVE 103 /* unguard range */
+
/* compatibility flags */
#define MAP_FILE 0
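Based only on the comments above ("fatal signal on access to range", "unguard range"), an illustrative userspace use of the new advice values would look like:

/* Illustrative only: guard a range so any access raises a fatal signal,
 * then unguard it again. */
madvise(addr, len, MADV_GUARD_INSTALL);
/* ... any access to [addr, addr + len) now faults ... */
madvise(addr, len, MADV_GUARD_REMOVE);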
diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c
index b3acbc4174fb..a738e7745865 100644
--- a/drivers/android/binder_alloc.c
+++ b/drivers/android/binder_alloc.c
@@ -1047,7 +1047,7 @@ void binder_alloc_vma_close(struct binder_alloc *alloc)
/**
* binder_alloc_free_page() - shrinker callback to free pages
* @item: item to free
- * @lock: lock protecting the item
+ * @lru: list_lru instance of the item
* @cb_arg: callback argument
*
* Called from list_lru_walk() in binder_shrink_scan() to free
@@ -1055,9 +1055,8 @@ void binder_alloc_vma_close(struct binder_alloc *alloc)
*/
enum lru_status binder_alloc_free_page(struct list_head *item,
struct list_lru_one *lru,
- spinlock_t *lock,
void *cb_arg)
- __must_hold(lock)
+ __must_hold(&lru->lock)
{
struct binder_lru_page *page = container_of(item, typeof(*page), lru);
struct binder_alloc *alloc = page->alloc;
@@ -1092,7 +1091,7 @@ enum lru_status binder_alloc_free_page(struct list_head *item,
list_lru_isolate(lru, item);
spin_unlock(&alloc->lock);
- spin_unlock(lock);
+ spin_unlock(&lru->lock);
if (vma) {
trace_binder_unmap_user_start(alloc, index);
@@ -1106,7 +1105,6 @@ enum lru_status binder_alloc_free_page(struct list_head *item,
mmput_async(mm);
__free_page(page_to_free);
- spin_lock(lock);
return LRU_REMOVED_RETRY;
err_invalid_vma:
diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h
index 70387234477e..c02c8ebcb466 100644
--- a/drivers/android/binder_alloc.h
+++ b/drivers/android/binder_alloc.h
@@ -118,7 +118,7 @@ static inline void binder_selftest_alloc(struct binder_alloc *alloc) {}
#endif
enum lru_status binder_alloc_free_page(struct list_head *item,
struct list_lru_one *lru,
- spinlock_t *lock, void *cb_arg);
+ void *cb_arg);
struct binder_buffer *binder_alloc_new_buf(struct binder_alloc *alloc,
size_t data_size,
size_t offsets_size,
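The binder changes above are the same tree-wide list_lru interface change repeated in the fs/ hunks below: isolate callbacks lose the spinlock_t *lock argument, the per-node lock now lives in struct list_lru_one, and a callback that drops it (LRU_REMOVED_RETRY, LRU_RETRY) no longer re-acquires it before returning. A minimal sketch of the new callback shape, with a made-up name:

static enum lru_status demo_lru_isolate(struct list_head *item,
					struct list_lru_one *lru, void *arg)
	__must_hold(&lru->lock)
{
	struct list_head *freeable = arg;

	/* &lru->lock is held by the walker; move the item to the dispose list. */
	list_lru_isolate_move(lru, item, freeable);
	return LRU_REMOVED;
}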
diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig
index 6aea609b795c..402b7b175863 100644
--- a/drivers/block/zram/Kconfig
+++ b/drivers/block/zram/Kconfig
@@ -94,6 +94,7 @@ endchoice
config ZRAM_DEF_COMP
string
+ depends on ZRAM
default "lzo-rle" if ZRAM_DEF_COMP_LZORLE
default "lzo" if ZRAM_DEF_COMP_LZO
default "lz4" if ZRAM_DEF_COMP_LZ4
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index ad9c9bc3ccfc..3dee026988dc 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -178,11 +178,105 @@ static inline u32 zram_get_priority(struct zram *zram, u32 index)
static void zram_accessed(struct zram *zram, u32 index)
{
zram_clear_flag(zram, index, ZRAM_IDLE);
+ zram_clear_flag(zram, index, ZRAM_PP_SLOT);
#ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME
zram->table[index].ac_time = ktime_get_boottime();
#endif
}
+#if defined CONFIG_ZRAM_WRITEBACK || defined CONFIG_ZRAM_MULTI_COMP
+struct zram_pp_slot {
+ unsigned long index;
+ struct list_head entry;
+};
+
+/*
+ * A post-processing bucket is, essentially, a size class: it defines
+ * the range (in bytes) of pp-slot sizes that fall into a particular bucket.
+ */
+#define PP_BUCKET_SIZE_RANGE 64
+#define NUM_PP_BUCKETS ((PAGE_SIZE / PP_BUCKET_SIZE_RANGE) + 1)
+
+struct zram_pp_ctl {
+ struct list_head pp_buckets[NUM_PP_BUCKETS];
+};
+
+static struct zram_pp_ctl *init_pp_ctl(void)
+{
+ struct zram_pp_ctl *ctl;
+ u32 idx;
+
+ ctl = kmalloc(sizeof(*ctl), GFP_KERNEL);
+ if (!ctl)
+ return NULL;
+
+ for (idx = 0; idx < NUM_PP_BUCKETS; idx++)
+ INIT_LIST_HEAD(&ctl->pp_buckets[idx]);
+ return ctl;
+}
+
+static void release_pp_slot(struct zram *zram, struct zram_pp_slot *pps)
+{
+ list_del_init(&pps->entry);
+
+ zram_slot_lock(zram, pps->index);
+ zram_clear_flag(zram, pps->index, ZRAM_PP_SLOT);
+ zram_slot_unlock(zram, pps->index);
+
+ kfree(pps);
+}
+
+static void release_pp_ctl(struct zram *zram, struct zram_pp_ctl *ctl)
+{
+ u32 idx;
+
+ if (!ctl)
+ return;
+
+ for (idx = 0; idx < NUM_PP_BUCKETS; idx++) {
+ while (!list_empty(&ctl->pp_buckets[idx])) {
+ struct zram_pp_slot *pps;
+
+ pps = list_first_entry(&ctl->pp_buckets[idx],
+ struct zram_pp_slot,
+ entry);
+ release_pp_slot(zram, pps);
+ }
+ }
+
+ kfree(ctl);
+}
+
+static void place_pp_slot(struct zram *zram, struct zram_pp_ctl *ctl,
+ struct zram_pp_slot *pps)
+{
+ u32 idx;
+
+ idx = zram_get_obj_size(zram, pps->index) / PP_BUCKET_SIZE_RANGE;
+ list_add(&pps->entry, &ctl->pp_buckets[idx]);
+
+ zram_set_flag(zram, pps->index, ZRAM_PP_SLOT);
+}
+
+static struct zram_pp_slot *select_pp_slot(struct zram_pp_ctl *ctl)
+{
+ struct zram_pp_slot *pps = NULL;
+ s32 idx = NUM_PP_BUCKETS - 1;
+
+ /* The higher the bucket id, the better its slots are as post-processing candidates */
+ while (idx >= 0) {
+ pps = list_first_entry_or_null(&ctl->pp_buckets[idx],
+ struct zram_pp_slot,
+ entry);
+ if (pps)
+ break;
+
+ idx--;
+ }
+ return pps;
+}
+#endif
+
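/*
 * Worked example of the bucket math above, assuming 4 KiB pages: with
 * PP_BUCKET_SIZE_RANGE of 64 bytes, a slot whose compressed object is
 * 1000 bytes lands in bucket 1000 / 64 = 15, an incompressible
 * PAGE_SIZE object lands in bucket 4096 / 64 = 64, and NUM_PP_BUCKETS
 * is therefore (4096 / 64) + 1 = 65. select_pp_slot() walks from the
 * highest bucket down, so the largest (least compressible) slots are
 * post-processed first.
 */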
static inline void update_used_max(struct zram *zram,
const unsigned long pages)
{
@@ -296,19 +390,28 @@ static void mark_idle(struct zram *zram, ktime_t cutoff)
for (index = 0; index < nr_pages; index++) {
/*
- * Do not mark ZRAM_UNDER_WB slot as ZRAM_IDLE to close race.
- * See the comment in writeback_store.
+ * Do not mark ZRAM_SAME slots as ZRAM_IDLE, because no
+ * post-processing (recompress, writeback) happens to the
+ * ZRAM_SAME slot.
+ *
+ * And ZRAM_WB slots simply cannot be ZRAM_IDLE.
*/
zram_slot_lock(zram, index);
- if (zram_allocated(zram, index) &&
- !zram_test_flag(zram, index, ZRAM_UNDER_WB)) {
+ if (!zram_allocated(zram, index) ||
+ zram_test_flag(zram, index, ZRAM_WB) ||
+ zram_test_flag(zram, index, ZRAM_SAME)) {
+ zram_slot_unlock(zram, index);
+ continue;
+ }
+
#ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME
- is_idle = !cutoff || ktime_after(cutoff,
- zram->table[index].ac_time);
+ is_idle = !cutoff ||
+ ktime_after(cutoff, zram->table[index].ac_time);
#endif
- if (is_idle)
- zram_set_flag(zram, index, ZRAM_IDLE);
- }
+ if (is_idle)
+ zram_set_flag(zram, index, ZRAM_IDLE);
+ else
+ zram_clear_flag(zram, index, ZRAM_IDLE);
zram_slot_unlock(zram, index);
}
}
@@ -587,11 +690,57 @@ static void read_from_bdev_async(struct zram *zram, struct page *page,
#define IDLE_WRITEBACK (1<<1)
#define INCOMPRESSIBLE_WRITEBACK (1<<2)
+static int scan_slots_for_writeback(struct zram *zram, u32 mode,
+ unsigned long nr_pages,
+ unsigned long index,
+ struct zram_pp_ctl *ctl)
+{
+ struct zram_pp_slot *pps = NULL;
+
+ for (; nr_pages != 0; index++, nr_pages--) {
+ if (!pps)
+ pps = kmalloc(sizeof(*pps), GFP_KERNEL);
+ if (!pps)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&pps->entry);
+
+ zram_slot_lock(zram, index);
+ if (!zram_allocated(zram, index))
+ goto next;
+
+ if (zram_test_flag(zram, index, ZRAM_WB) ||
+ zram_test_flag(zram, index, ZRAM_SAME))
+ goto next;
+
+ if (mode & IDLE_WRITEBACK &&
+ !zram_test_flag(zram, index, ZRAM_IDLE))
+ goto next;
+ if (mode & HUGE_WRITEBACK &&
+ !zram_test_flag(zram, index, ZRAM_HUGE))
+ goto next;
+ if (mode & INCOMPRESSIBLE_WRITEBACK &&
+ !zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE))
+ goto next;
+
+ pps->index = index;
+ place_pp_slot(zram, ctl, pps);
+ pps = NULL;
+next:
+ zram_slot_unlock(zram, index);
+ }
+
+ kfree(pps);
+ return 0;
+}
+
static ssize_t writeback_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t len)
{
struct zram *zram = dev_to_zram(dev);
unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
+ struct zram_pp_ctl *ctl = NULL;
+ struct zram_pp_slot *pps;
unsigned long index = 0;
struct bio bio;
struct bio_vec bio_vec;
@@ -626,6 +775,12 @@ static ssize_t writeback_store(struct device *dev,
goto release_init_lock;
}
+ /* Do not permit concurrent post-processing actions. */
+ if (atomic_xchg(&zram->pp_in_progress, 1)) {
+ up_read(&zram->init_lock);
+ return -EAGAIN;
+ }
+
if (!zram->backing_dev) {
ret = -ENODEV;
goto release_init_lock;
@@ -637,7 +792,15 @@ static ssize_t writeback_store(struct device *dev,
goto release_init_lock;
}
- for (; nr_pages != 0; index++, nr_pages--) {
+ ctl = init_pp_ctl();
+ if (!ctl) {
+ ret = -ENOMEM;
+ goto release_init_lock;
+ }
+
+ scan_slots_for_writeback(zram, mode, nr_pages, index, ctl);
+
+ while ((pps = select_pp_slot(ctl))) {
spin_lock(&zram->wb_limit_lock);
if (zram->wb_limit_enable && !zram->bd_wb_limit) {
spin_unlock(&zram->wb_limit_lock);
@@ -654,38 +817,20 @@ static ssize_t writeback_store(struct device *dev,
}
}
+ index = pps->index;
zram_slot_lock(zram, index);
- if (!zram_allocated(zram, index))
- goto next;
-
- if (zram_test_flag(zram, index, ZRAM_WB) ||
- zram_test_flag(zram, index, ZRAM_SAME) ||
- zram_test_flag(zram, index, ZRAM_UNDER_WB))
- goto next;
-
- if (mode & IDLE_WRITEBACK &&
- !zram_test_flag(zram, index, ZRAM_IDLE))
- goto next;
- if (mode & HUGE_WRITEBACK &&
- !zram_test_flag(zram, index, ZRAM_HUGE))
- goto next;
- if (mode & INCOMPRESSIBLE_WRITEBACK &&
- !zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE))
- goto next;
-
/*
- * Clearing ZRAM_UNDER_WB is duty of caller.
- * IOW, zram_free_page never clear it.
+ * scan_slots() sets ZRAM_PP_SLOT and releases the slot lock, so
+ * slots can change in the meantime. If a slot is accessed or
+ * freed, it loses the ZRAM_PP_SLOT flag and hence we don't
+ * post-process it.
*/
- zram_set_flag(zram, index, ZRAM_UNDER_WB);
- /* Need for hugepage writeback racing */
- zram_set_flag(zram, index, ZRAM_IDLE);
+ if (!zram_test_flag(zram, index, ZRAM_PP_SLOT))
+ goto next;
zram_slot_unlock(zram, index);
+
if (zram_read_page(zram, page, index, NULL)) {
- zram_slot_lock(zram, index);
- zram_clear_flag(zram, index, ZRAM_UNDER_WB);
- zram_clear_flag(zram, index, ZRAM_IDLE);
- zram_slot_unlock(zram, index);
+ release_pp_slot(zram, pps);
continue;
}
@@ -700,10 +845,7 @@ static ssize_t writeback_store(struct device *dev,
*/
err = submit_bio_wait(&bio);
if (err) {
- zram_slot_lock(zram, index);
- zram_clear_flag(zram, index, ZRAM_UNDER_WB);
- zram_clear_flag(zram, index, ZRAM_IDLE);
- zram_slot_unlock(zram, index);
+ release_pp_slot(zram, pps);
/*
* BIO errors are not fatal, we continue and simply
* attempt to writeback the remaining objects (pages).
@@ -717,25 +859,19 @@ static ssize_t writeback_store(struct device *dev,
}
atomic64_inc(&zram->stats.bd_writes);
+ zram_slot_lock(zram, index);
/*
- * We released zram_slot_lock so need to check if the slot was
- * changed. If there is freeing for the slot, we can catch it
- * easily by zram_allocated.
- * A subtle case is the slot is freed/reallocated/marked as
- * ZRAM_IDLE again. To close the race, idle_store doesn't
- * mark ZRAM_IDLE once it found the slot was ZRAM_UNDER_WB.
- * Thus, we could close the race by checking ZRAM_IDLE bit.
+ * Same as above: we release the slot lock during writeback, so
+ * the slot can change under us, either via slot_free() alone or
+ * via slot_free() followed by reallocation (zram_write_page()).
+ * In both cases the slot loses its ZRAM_PP_SLOT flag, and no
+ * concurrent post-processing can set ZRAM_PP_SLOT on such slots
+ * until the current post-processing run finishes.
*/
- zram_slot_lock(zram, index);
- if (!zram_allocated(zram, index) ||
- !zram_test_flag(zram, index, ZRAM_IDLE)) {
- zram_clear_flag(zram, index, ZRAM_UNDER_WB);
- zram_clear_flag(zram, index, ZRAM_IDLE);
+ if (!zram_test_flag(zram, index, ZRAM_PP_SLOT))
goto next;
- }
zram_free_page(zram, index);
- zram_clear_flag(zram, index, ZRAM_UNDER_WB);
zram_set_flag(zram, index, ZRAM_WB);
zram_set_element(zram, index, blk_idx);
blk_idx = 0;
@@ -746,12 +882,15 @@ static ssize_t writeback_store(struct device *dev,
spin_unlock(&zram->wb_limit_lock);
next:
zram_slot_unlock(zram, index);
+ release_pp_slot(zram, pps);
}
if (blk_idx)
free_block_bdev(zram, blk_idx);
__free_page(page);
release_init_lock:
+ release_pp_ctl(zram, ctl);
+ atomic_set(&zram->pp_in_progress, 0);
up_read(&zram->init_lock);
return ret;
@@ -1342,19 +1481,17 @@ static void zram_free_page(struct zram *zram, size_t index)
#ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME
zram->table[index].ac_time = 0;
#endif
- if (zram_test_flag(zram, index, ZRAM_IDLE))
- zram_clear_flag(zram, index, ZRAM_IDLE);
+
+ zram_clear_flag(zram, index, ZRAM_IDLE);
+ zram_clear_flag(zram, index, ZRAM_INCOMPRESSIBLE);
+ zram_clear_flag(zram, index, ZRAM_PP_SLOT);
+ zram_set_priority(zram, index, 0);
if (zram_test_flag(zram, index, ZRAM_HUGE)) {
zram_clear_flag(zram, index, ZRAM_HUGE);
atomic64_dec(&zram->stats.huge_pages);
}
- if (zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE))
- zram_clear_flag(zram, index, ZRAM_INCOMPRESSIBLE);
-
- zram_set_priority(zram, index, 0);
-
if (zram_test_flag(zram, index, ZRAM_WB)) {
zram_clear_flag(zram, index, ZRAM_WB);
free_block_bdev(zram, zram_get_element(zram, index));
@@ -1378,13 +1515,11 @@ static void zram_free_page(struct zram *zram, size_t index)
zs_free(zram->mem_pool, handle);
atomic64_sub(zram_get_obj_size(zram, index),
- &zram->stats.compr_data_size);
+ &zram->stats.compr_data_size);
out:
atomic64_dec(&zram->stats.pages_stored);
zram_set_handle(zram, index, 0);
zram_set_obj_size(zram, index, 0);
- WARN_ON_ONCE(zram->table[index].flags &
- ~(1UL << ZRAM_UNDER_WB));
}
/*
@@ -1648,6 +1783,52 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
}
#ifdef CONFIG_ZRAM_MULTI_COMP
+#define RECOMPRESS_IDLE (1 << 0)
+#define RECOMPRESS_HUGE (1 << 1)
+
+static int scan_slots_for_recompress(struct zram *zram, u32 mode,
+ struct zram_pp_ctl *ctl)
+{
+ unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
+ struct zram_pp_slot *pps = NULL;
+ unsigned long index;
+
+ for (index = 0; index < nr_pages; index++) {
+ if (!pps)
+ pps = kmalloc(sizeof(*pps), GFP_KERNEL);
+ if (!pps)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&pps->entry);
+
+ zram_slot_lock(zram, index);
+ if (!zram_allocated(zram, index))
+ goto next;
+
+ if (mode & RECOMPRESS_IDLE &&
+ !zram_test_flag(zram, index, ZRAM_IDLE))
+ goto next;
+
+ if (mode & RECOMPRESS_HUGE &&
+ !zram_test_flag(zram, index, ZRAM_HUGE))
+ goto next;
+
+ if (zram_test_flag(zram, index, ZRAM_WB) ||
+ zram_test_flag(zram, index, ZRAM_SAME) ||
+ zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE))
+ goto next;
+
+ pps->index = index;
+ place_pp_slot(zram, ctl, pps);
+ pps = NULL;
+next:
+ zram_slot_unlock(zram, index);
+ }
+
+ kfree(pps);
+ return 0;
+}
+
/*
* This function will decompress (unless it's ZRAM_HUGE) the page and then
* attempt to compress it using provided compression algorithm priority
@@ -1655,7 +1836,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
*
* Corresponding ZRAM slot should be locked.
*/
-static int zram_recompress(struct zram *zram, u32 index, struct page *page,
+static int recompress_slot(struct zram *zram, u32 index, struct page *page,
u64 *num_recomp_pages, u32 threshold, u32 prio,
u32 prio_max)
{
@@ -1685,6 +1866,13 @@ static int zram_recompress(struct zram *zram, u32 index, struct page *page,
if (ret)
return ret;
+ /*
+ * We touched this entry, so mark it as non-IDLE. This makes sure that
+ * we don't preserve the IDLE flag and don't incorrectly pick this entry
+ * for a different post-processing type (e.g. writeback).
+ */
+ zram_clear_flag(zram, index, ZRAM_IDLE);
+
class_index_old = zs_lookup_class_index(zram->mem_pool, comp_len_old);
/*
* Iterate the secondary comp algorithms list (in order of priority)
@@ -1798,20 +1986,17 @@ static int zram_recompress(struct zram *zram, u32 index, struct page *page,
return 0;
}
-#define RECOMPRESS_IDLE (1 << 0)
-#define RECOMPRESS_HUGE (1 << 1)
-
static ssize_t recompress_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t len)
{
u32 prio = ZRAM_SECONDARY_COMP, prio_max = ZRAM_MAX_COMPS;
struct zram *zram = dev_to_zram(dev);
- unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
char *args, *param, *val, *algo = NULL;
u64 num_recomp_pages = ULLONG_MAX;
+ struct zram_pp_ctl *ctl = NULL;
+ struct zram_pp_slot *pps;
u32 mode = 0, threshold = 0;
- unsigned long index;
struct page *page;
ssize_t ret;
@@ -1881,6 +2066,12 @@ static ssize_t recompress_store(struct device *dev,
goto release_init_lock;
}
+ /* Do not permit concurrent post-processing actions. */
+ if (atomic_xchg(&zram->pp_in_progress, 1)) {
+ up_read(&zram->init_lock);
+ return -EAGAIN;
+ }
+
if (algo) {
bool found = false;
@@ -1907,36 +2098,32 @@ static ssize_t recompress_store(struct device *dev,
goto release_init_lock;
}
+ ctl = init_pp_ctl();
+ if (!ctl) {
+ ret = -ENOMEM;
+ goto release_init_lock;
+ }
+
+ scan_slots_for_recompress(zram, mode, ctl);
+
ret = len;
- for (index = 0; index < nr_pages; index++) {
+ while ((pps = select_pp_slot(ctl))) {
int err = 0;
if (!num_recomp_pages)
break;
- zram_slot_lock(zram, index);
-
- if (!zram_allocated(zram, index))
- goto next;
-
- if (mode & RECOMPRESS_IDLE &&
- !zram_test_flag(zram, index, ZRAM_IDLE))
- goto next;
-
- if (mode & RECOMPRESS_HUGE &&
- !zram_test_flag(zram, index, ZRAM_HUGE))
- goto next;
-
- if (zram_test_flag(zram, index, ZRAM_WB) ||
- zram_test_flag(zram, index, ZRAM_UNDER_WB) ||
- zram_test_flag(zram, index, ZRAM_SAME) ||
- zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE))
+ zram_slot_lock(zram, pps->index);
+ if (!zram_test_flag(zram, pps->index, ZRAM_PP_SLOT))
goto next;
- err = zram_recompress(zram, index, page, &num_recomp_pages,
- threshold, prio, prio_max);
+ err = recompress_slot(zram, pps->index, page,
+ &num_recomp_pages, threshold,
+ prio, prio_max);
next:
- zram_slot_unlock(zram, index);
+ zram_slot_unlock(zram, pps->index);
+ release_pp_slot(zram, pps);
+
if (err) {
ret = err;
break;
@@ -1948,6 +2135,8 @@ next:
__free_page(page);
release_init_lock:
+ release_pp_ctl(zram, ctl);
+ atomic_set(&zram->pp_in_progress, 0);
up_read(&zram->init_lock);
return ret;
}
@@ -2105,7 +2294,7 @@ static void zram_destroy_comps(struct zram *zram)
{
u32 prio;
- for (prio = 0; prio < ZRAM_MAX_COMPS; prio++) {
+ for (prio = ZRAM_PRIMARY_COMP; prio < ZRAM_MAX_COMPS; prio++) {
struct zcomp *comp = zram->comps[prio];
zram->comps[prio] = NULL;
@@ -2144,6 +2333,7 @@ static void zram_reset_device(struct zram *zram)
zram->disksize = 0;
zram_destroy_comps(zram);
memset(&zram->stats, 0, sizeof(zram->stats));
+ atomic_set(&zram->pp_in_progress, 0);
reset_bdev(zram);
comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, default_compressor);
@@ -2176,7 +2366,7 @@ static ssize_t disksize_store(struct device *dev,
goto out_unlock;
}
- for (prio = 0; prio < ZRAM_MAX_COMPS; prio++) {
+ for (prio = ZRAM_PRIMARY_COMP; prio < ZRAM_MAX_COMPS; prio++) {
if (!zram->comp_algs[prio])
continue;
@@ -2381,6 +2571,9 @@ static int zram_add(void)
zram->disk->fops = &zram_devops;
zram->disk->private_data = zram;
snprintf(zram->disk->disk_name, 16, "zram%d", device_id);
+ atomic_set(&zram->pp_in_progress, 0);
+ zram_comp_params_reset(zram);
+ comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, default_compressor);
/* Actual capacity set using sysfs (/sys/block/zram<id>/disksize */
set_capacity(zram->disk, 0);
@@ -2388,9 +2581,6 @@ static int zram_add(void)
if (ret)
goto out_cleanup_disk;
- zram_comp_params_reset(zram);
- comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, default_compressor);
-
zram_debugfs_register(zram);
pr_info("Added device: %s\n", zram->disk->disk_name);
return device_id;
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index cfc8c059db63..134be414e210 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -47,7 +47,7 @@
enum zram_pageflags {
ZRAM_SAME = ZRAM_FLAG_SHIFT, /* Page consists the same element */
ZRAM_WB, /* page is stored on backing_device */
- ZRAM_UNDER_WB, /* page is under writeback */
+ ZRAM_PP_SLOT, /* Selected for post-processing */
ZRAM_HUGE, /* Incompressible page */
ZRAM_IDLE, /* not accessed page since last idle marking */
ZRAM_INCOMPRESSIBLE, /* none of the algorithms could compress it */
@@ -139,5 +139,6 @@ struct zram {
#ifdef CONFIG_ZRAM_MEMORY_TRACKING
struct dentry *debugfs_dir;
#endif
+ atomic_t pp_in_progress;
};
#endif
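The new pp_in_progress counter is what makes writeback_store() and recompress_store() mutually exclusive; both use the same open-coded pattern, sketched here in isolation:

/* Only one post-processing run (writeback or recompress) at a time. */
if (atomic_xchg(&zram->pp_in_progress, 1))
	return -EAGAIN;		/* another run already owns post-processing */

/* ... scan_slots_for_*(), then the select_pp_slot() loop ... */

atomic_set(&zram->pp_in_progress, 0);	/* released on every exit path */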
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index a28197b88c92..18f839721813 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -6,6 +6,7 @@
*/
#include <linux/iova.h>
+#include <linux/kmemleak.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/smp.h>
@@ -673,6 +674,11 @@ static struct iova_magazine *iova_depot_pop(struct iova_rcache *rcache)
{
struct iova_magazine *mag = rcache->depot;
+ /*
+ * As the mag->next pointer is moved to rcache->depot and reset via
+ * the mag->size assignment, mark it as a transient false positive.
+ */
+ kmemleak_transient_leak(mag->next);
rcache->depot = mag->next;
mag->size = IOVA_MAG_SIZE;
rcache->depot_size--;
diff --git a/fs/buffer.c b/fs/buffer.c
index b158cb7a5038..cc8452f60251 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -736,15 +736,12 @@ bool block_dirty_folio(struct address_space *mapping, struct folio *folio)
* Lock out page's memcg migration to keep PageDirty
* synchronized with per-memcg dirty page counters.
*/
- folio_memcg_lock(folio);
newly_dirty = !folio_test_set_dirty(folio);
spin_unlock(&mapping->i_private_lock);
if (newly_dirty)
__folio_mark_dirty(folio, mapping, 1);
- folio_memcg_unlock(folio);
-
if (newly_dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
@@ -1193,13 +1190,11 @@ void mark_buffer_dirty(struct buffer_head *bh)
struct folio *folio = bh->b_folio;
struct address_space *mapping = NULL;
- folio_memcg_lock(folio);
if (!folio_test_set_dirty(folio)) {
mapping = folio->mapping;
if (mapping)
__folio_mark_dirty(folio, mapping, 0);
}
- folio_memcg_unlock(folio);
if (mapping)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}
diff --git a/fs/dcache.c b/fs/dcache.c
index 0099077a2982..b4d5e9e1e43d 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1099,7 +1099,7 @@ void shrink_dentry_list(struct list_head *list)
}
static enum lru_status dentry_lru_isolate(struct list_head *item,
- struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
+ struct list_lru_one *lru, void *arg)
{
struct list_head *freeable = arg;
struct dentry *dentry = container_of(item, struct dentry, d_lru);
@@ -1180,7 +1180,7 @@ long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc)
}
static enum lru_status dentry_lru_isolate_shrink(struct list_head *item,
- struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
+ struct list_lru_one *lru, void *arg)
{
struct list_head *freeable = arg;
struct dentry *dentry = container_of(item, struct dentry, d_lru);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 2e6bc77f4f81..72b48f6f5561 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -149,7 +149,7 @@ static void gfs2_qd_list_dispose(struct list_head *list)
static enum lru_status gfs2_qd_isolate(struct list_head *item,
- struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
+ struct list_lru_one *lru, void *arg)
{
struct list_head *dispose = arg;
struct gfs2_quota_data *qd =
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 1bbf783b244a..a4441fb77f7c 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -174,103 +174,29 @@ out:
* Called under mmap_write_lock(mm).
*/
-static unsigned long
-hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr,
- unsigned long len, unsigned long pgoff, unsigned long flags)
-{
- struct hstate *h = hstate_file(file);
- struct vm_unmapped_area_info info = {};
-
- info.length = len;
- info.low_limit = current->mm->mmap_base;
- info.high_limit = arch_get_mmap_end(addr, len, flags);
- info.align_mask = PAGE_MASK & ~huge_page_mask(h);
- return vm_unmapped_area(&info);
-}
-
-static unsigned long
-hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr,
- unsigned long len, unsigned long pgoff, unsigned long flags)
-{
- struct hstate *h = hstate_file(file);
- struct vm_unmapped_area_info info = {};
-
- info.flags = VM_UNMAPPED_AREA_TOPDOWN;
- info.length = len;
- info.low_limit = PAGE_SIZE;
- info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
- info.align_mask = PAGE_MASK & ~huge_page_mask(h);
- addr = vm_unmapped_area(&info);
-
- /*
- * A failed mmap() very likely causes application failure,
- * so fall back to the bottom-up function here. This scenario
- * can happen with large stack limits and large mmap()
- * allocations.
- */
- if (unlikely(offset_in_page(addr))) {
- VM_BUG_ON(addr != -ENOMEM);
- info.flags = 0;
- info.low_limit = current->mm->mmap_base;
- info.high_limit = arch_get_mmap_end(addr, len, flags);
- addr = vm_unmapped_area(&info);
- }
-
- return addr;
-}
-
unsigned long
-generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
- unsigned long len, unsigned long pgoff,
- unsigned long flags)
+hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma, *prev;
+ unsigned long addr0 = 0;
struct hstate *h = hstate_file(file);
- const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);
if (len & ~huge_page_mask(h))
return -EINVAL;
- if (len > mmap_end - mmap_min_addr)
- return -ENOMEM;
-
if (flags & MAP_FIXED) {
+ if (addr & ~huge_page_mask(h))
+ return -EINVAL;
if (prepare_hugepage_range(file, addr, len))
return -EINVAL;
- return addr;
- }
-
- if (addr) {
- addr = ALIGN(addr, huge_page_size(h));
- vma = find_vma_prev(mm, addr, &prev);
- if (mmap_end - len >= addr && addr >= mmap_min_addr &&
- (!vma || addr + len <= vm_start_gap(vma)) &&
- (!prev || addr >= vm_end_gap(prev)))
- return addr;
}
+ if (addr)
+ addr0 = ALIGN(addr, huge_page_size(h));
- /*
- * Use MMF_TOPDOWN flag as a hint to use topdown routine.
- * If architectures have special needs, they should define their own
- * version of hugetlb_get_unmapped_area.
- */
- if (test_bit(MMF_TOPDOWN, &mm->flags))
- return hugetlb_get_unmapped_area_topdown(file, addr, len,
- pgoff, flags);
- return hugetlb_get_unmapped_area_bottomup(file, addr, len,
- pgoff, flags);
+ return mm_get_unmapped_area_vmflags(current->mm, file, addr0, len, pgoff,
+ flags, 0);
}
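The rewritten helper above now rejects a MAP_FIXED hint that is not aligned to the huge page size before calling prepare_hugepage_range(). A minimal user-space sketch of what that check means for callers; the 2 MB huge page size and the chosen hint address are illustrative assumptions, not taken from the patch:

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t huge = 2UL << 20;			/* assume 2 MB huge pages */
	void *hint = (void *)(0x40000000UL + 4096);	/* page aligned, not huge-page aligned */
	void *p = mmap(hint, huge, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED,
		       -1, 0);

	if (p == MAP_FAILED)
		perror("mmap");		/* expected: EINVAL from the alignment check */
	else
		munmap(p, huge);
	return 0;
}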
-#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
-static unsigned long
-hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
- unsigned long len, unsigned long pgoff,
- unsigned long flags)
-{
- return generic_hugetlb_get_unmapped_area(file, addr, len, pgoff, flags);
-}
-#endif
-
/*
* Someone wants to read @bytes from a HWPOISON hugetlb @page from @offset.
* Returns the maximum number of bytes one can read without touching the 1st raw
diff --git a/fs/inode.c b/fs/inode.c
index b13b778257ae..6b4c77268fc0 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -952,7 +952,7 @@ again:
* with this flag set because they are the inodes that are out of order.
*/
static enum lru_status inode_lru_isolate(struct list_head *item,
- struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
+ struct list_lru_one *lru, void *arg)
{
struct list_head *freeable = arg;
struct inode *inode = container_of(item, struct inode, i_lru);
@@ -994,7 +994,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) {
inode_pin_lru_isolating(inode);
spin_unlock(&inode->i_lock);
- spin_unlock(lru_lock);
+ spin_unlock(&lru->lock);
if (remove_inode_buffers(inode)) {
unsigned long reap;
reap = invalidate_mapping_pages(&inode->i_data, 0, -1);
@@ -1005,7 +1005,6 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
mm_account_reclaimed_pages(reap);
}
inode_unpin_lru_isolating(inode);
- spin_lock(lru_lock);
return LRU_RETRY;
}
diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c
index b6e3d8f77b91..37d79400e5f4 100644
--- a/fs/nfs/nfs42xattr.c
+++ b/fs/nfs/nfs42xattr.c
@@ -802,7 +802,7 @@ static struct shrinker *nfs4_xattr_large_entry_shrinker;
static enum lru_status
cache_lru_isolate(struct list_head *item,
- struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
+ struct list_lru_one *lru, void *arg)
{
struct list_head *dispose = arg;
struct inode *inode;
@@ -867,7 +867,7 @@ nfs4_xattr_cache_count(struct shrinker *shrink, struct shrink_control *sc)
static enum lru_status
entry_lru_isolate(struct list_head *item,
- struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
+ struct list_lru_one *lru, void *arg)
{
struct list_head *dispose = arg;
struct nfs4_xattr_bucket *bucket;
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 2e6783f63712..09c444eb944f 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -487,7 +487,6 @@ void nfsd_file_net_dispose(struct nfsd_net *nn)
* nfsd_file_lru_cb - Examine an entry on the LRU list
* @item: LRU entry to examine
* @lru: controlling LRU
- * @lock: LRU list lock (unused)
* @arg: dispose list
*
* Return values:
@@ -497,9 +496,7 @@ void nfsd_file_net_dispose(struct nfsd_net *nn)
*/
static enum lru_status
nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
- spinlock_t *lock, void *arg)
- __releases(lock)
- __acquires(lock)
+ void *arg)
{
struct list_head *head = arg;
struct nfsd_file *nf = list_entry(item, struct nfsd_file, nf_lru);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 3e31a4805427..0edf14a9840e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -831,19 +831,21 @@ static const struct file_operations proc_single_file_operations = {
struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode)
{
struct task_struct *task = get_proc_task(inode);
- struct mm_struct *mm = ERR_PTR(-ESRCH);
+ struct mm_struct *mm;
- if (task) {
- mm = mm_access(task, mode | PTRACE_MODE_FSCREDS);
- put_task_struct(task);
+ if (!task)
+ return ERR_PTR(-ESRCH);
- if (!IS_ERR_OR_NULL(mm)) {
- /* ensure this mm_struct can't be freed */
- mmgrab(mm);
- /* but do not pin its memory */
- mmput(mm);
- }
- }
+ mm = mm_access(task, mode | PTRACE_MODE_FSCREDS);
+ put_task_struct(task);
+
+ if (IS_ERR(mm))
+ return mm == ERR_PTR(-ESRCH) ? NULL : mm;
+
+ /* ensure this mm_struct can't be freed */
+ mmgrab(mm);
+ /* but do not pin its memory */
+ mmput(mm);
return mm;
}
@@ -2207,7 +2209,7 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
goto out_notask;
mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
- if (IS_ERR_OR_NULL(mm))
+ if (IS_ERR(mm))
goto out;
if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 245171d9164b..8ba9b1472390 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -91,7 +91,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
#ifdef CONFIG_ZSWAP
show_val_kb(m, "Zswap: ", zswap_total_pages());
seq_printf(m, "Zswapped: %8lu kB\n",
- (unsigned long)atomic_read(&zswap_stored_pages) <<
+ (unsigned long)atomic_long_read(&zswap_stored_pages) <<
(PAGE_SHIFT - 10));
#endif
show_val_kb(m, "Dirty: ",
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index e8196f5778e2..aa63b8efd782 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1857,7 +1857,6 @@ static enum lru_status
xfs_buftarg_drain_rele(
struct list_head *item,
struct list_lru_one *lru,
- spinlock_t *lru_lock,
void *arg)
{
@@ -1956,7 +1955,6 @@ static enum lru_status
xfs_buftarg_isolate(
struct list_head *item,
struct list_lru_one *lru,
- spinlock_t *lru_lock,
void *arg)
{
struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index b928b036990b..61ee110b47d7 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -441,9 +441,8 @@ static enum lru_status
xfs_qm_dquot_isolate(
struct list_head *item,
struct list_lru_one *lru,
- spinlock_t *lru_lock,
void *arg)
- __releases(lru_lock) __acquires(lru_lock)
+ __releases(&lru->lock) __acquires(&lru->lock)
{
struct xfs_dquot *dqp = container_of(item,
struct xfs_dquot, q_lru);
@@ -489,7 +488,7 @@ xfs_qm_dquot_isolate(
trace_xfs_dqreclaim_dirty(dqp);
/* we have to drop the LRU lock to flush the dquot */
- spin_unlock(lru_lock);
+ spin_unlock(&lru->lock);
error = xfs_qm_dqflush(dqp, &bp);
if (error)
@@ -525,7 +524,6 @@ out_unlock_dirty:
trace_xfs_dqreclaim_busy(dqp);
XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses);
xfs_dqunlock(dqp);
- spin_lock(lru_lock);
return LRU_RETRY;
}
diff --git a/include/asm-generic/codetag.lds.h b/include/asm-generic/codetag.lds.h
index 64f536b80380..372c320c5043 100644
--- a/include/asm-generic/codetag.lds.h
+++ b/include/asm-generic/codetag.lds.h
@@ -11,4 +11,23 @@
#define CODETAG_SECTIONS() \
SECTION_WITH_BOUNDARIES(alloc_tags)
+/*
+ * Module codetags that are not used after module unload and therefore have
+ * the same lifespan as the module; they can be safely unloaded with the module.
+ */
+#define MOD_CODETAG_SECTIONS()
+
+#define MOD_SEPARATE_CODETAG_SECTION(_name) \
+ .codetag.##_name : { \
+ SECTION_WITH_BOUNDARIES(_name) \
+ }
+
+/*
+ * For codetags that might be used after module unload and therefore might stay
+ * in memory longer. Each such codetag type has its own section so that we can
+ * unload them individually once unused.
+ */
+#define MOD_SEPARATE_CODETAG_SECTIONS() \
+ MOD_SEPARATE_CODETAG_SECTION(alloc_tags)
+
#endif /* __ASM_GENERIC_CODETAG_LDS_H */
diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h
index 594d5905f615..f42133dae68e 100644
--- a/include/asm-generic/hugetlb.h
+++ b/include/asm-generic/hugetlb.h
@@ -42,20 +42,26 @@ static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot)
return pte_modify(pte, newprot);
}
+#ifndef __HAVE_ARCH_HUGE_PTE_MKUFFD_WP
static inline pte_t huge_pte_mkuffd_wp(pte_t pte)
{
return huge_pte_wrprotect(pte_mkuffd_wp(pte));
}
+#endif
+#ifndef __HAVE_ARCH_HUGE_PTE_CLEAR_UFFD_WP
static inline pte_t huge_pte_clear_uffd_wp(pte_t pte)
{
return pte_clear_uffd_wp(pte);
}
+#endif
+#ifndef __HAVE_ARCH_HUGE_PTE_UFFD_WP
static inline int huge_pte_uffd_wp(pte_t pte)
{
return pte_uffd_wp(pte);
}
+#endif
#ifndef __HAVE_ARCH_HUGE_PTE_CLEAR
static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
@@ -106,22 +112,17 @@ static inline int huge_pte_none(pte_t pte)
#endif
/* Please refer to comments above pte_none_mostly() for the usage */
+#ifndef __HAVE_ARCH_HUGE_PTE_NONE_MOSTLY
static inline int huge_pte_none_mostly(pte_t pte)
{
return huge_pte_none(pte) || is_pte_marker(pte);
}
+#endif
#ifndef __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE
static inline int prepare_hugepage_range(struct file *file,
unsigned long addr, unsigned long len)
{
- struct hstate *h = hstate_file(file);
-
- if (len & ~huge_page_mask(h))
- return -EINVAL;
- if (addr & ~huge_page_mask(h))
- return -EINVAL;
-
return 0;
}
#endif
diff --git a/include/asm-generic/text-patching.h b/include/asm-generic/text-patching.h
new file mode 100644
index 000000000000..2245c641b741
--- /dev/null
+++ b/include/asm-generic/text-patching.h
@@ -0,0 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_GENERIC_TEXT_PATCHING_H
+#define _ASM_GENERIC_TEXT_PATCHING_H
+
+#endif /* _ASM_GENERIC_TEXT_PATCHING_H */
diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
index 941deffc590d..7c0786bdf9af 100644
--- a/include/linux/alloc_tag.h
+++ b/include/linux/alloc_tag.h
@@ -30,6 +30,21 @@ struct alloc_tag {
struct alloc_tag_counters __percpu *counters;
} __aligned(8);
+struct alloc_tag_kernel_section {
+ struct alloc_tag *first_tag;
+ unsigned long count;
+};
+
+struct alloc_tag_module_section {
+ union {
+ unsigned long start_addr;
+ struct alloc_tag *first_tag;
+ };
+ unsigned long end_addr;
+ /* used size */
+ unsigned long size;
+};
+
#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
#define CODETAG_EMPTY ((void *)1)
@@ -54,6 +69,8 @@ static inline void set_codetag_empty(union codetag_ref *ref) {}
#ifdef CONFIG_MEM_ALLOC_PROFILING
+#define ALLOC_TAG_SECTION_NAME "alloc_tags"
+
struct codetag_bytes {
struct codetag *ct;
s64 bytes;
@@ -76,7 +93,7 @@ DECLARE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag);
#define DEFINE_ALLOC_TAG(_alloc_tag) \
static struct alloc_tag _alloc_tag __used __aligned(8) \
- __section("alloc_tags") = { \
+ __section(ALLOC_TAG_SECTION_NAME) = { \
.ct = CODE_TAG_INIT, \
.counters = &_shared_alloc_tag };
@@ -85,7 +102,7 @@ DECLARE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag);
#define DEFINE_ALLOC_TAG(_alloc_tag) \
static DEFINE_PER_CPU(struct alloc_tag_counters, _alloc_tag_cntr); \
static struct alloc_tag _alloc_tag __used __aligned(8) \
- __section("alloc_tags") = { \
+ __section(ALLOC_TAG_SECTION_NAME) = { \
.ct = CODE_TAG_INIT, \
.counters = &_alloc_tag_cntr };
diff --git a/include/linux/bootmem_info.h b/include/linux/bootmem_info.h
index cffa38a73618..d8a8d245824a 100644
--- a/include/linux/bootmem_info.h
+++ b/include/linux/bootmem_info.h
@@ -6,11 +6,10 @@
#include <linux/kmemleak.h>
/*
- * Types for free bootmem stored in page->lru.next. These have to be in
- * some random range in unsigned long space for debugging purposes.
+ * Types for free bootmem stored in the low bits of page->private.
*/
-enum {
- MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE = 12,
+enum bootmem_type {
+ MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE = 1,
SECTION_INFO = MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE,
MIX_SECTION_INFO,
NODE_INFO,
@@ -21,9 +20,19 @@ enum {
void __init register_page_bootmem_info_node(struct pglist_data *pgdat);
void get_page_bootmem(unsigned long info, struct page *page,
- unsigned long type);
+ enum bootmem_type type);
void put_page_bootmem(struct page *page);
+static inline enum bootmem_type bootmem_type(const struct page *page)
+{
+ return (unsigned long)page->private & 0xf;
+}
+
+static inline unsigned long bootmem_info(const struct page *page)
+{
+ return (unsigned long)page->private >> 4;
+}
+
/*
* Any memory allocated via the memblock allocator and not via the
* buddy will be marked reserved already in the memmap. For those
@@ -31,7 +40,7 @@ void put_page_bootmem(struct page *page);
*/
static inline void free_bootmem_page(struct page *page)
{
- unsigned long magic = page->index;
+ enum bootmem_type type = bootmem_type(page);
/*
* The reserve_bootmem_region sets the reserved flag on bootmem
@@ -39,7 +48,7 @@ static inline void free_bootmem_page(struct page *page)
*/
VM_BUG_ON_PAGE(page_ref_count(page) != 2, page);
- if (magic == SECTION_INFO || magic == MIX_SECTION_INFO)
+ if (type == SECTION_INFO || type == MIX_SECTION_INFO)
put_page_bootmem(page);
else
VM_BUG_ON_PAGE(1, page);
@@ -53,8 +62,18 @@ static inline void put_page_bootmem(struct page *page)
{
}
+static inline enum bootmem_type bootmem_type(const struct page *page)
+{
+ return SECTION_INFO;
+}
+
+static inline unsigned long bootmem_info(const struct page *page)
+{
+ return 0;
+}
+
static inline void get_page_bootmem(unsigned long info, struct page *page,
- unsigned long type)
+ enum bootmem_type type)
{
}
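The new accessors above take the bootmem type from the low four bits of page->private and the payload from the remaining bits. A short sketch of the packing this implies on the storing side; the helper name and the shift-and-or encoding are inferred from bootmem_type() and bootmem_info(), not copied from the patch:

/* Hypothetical packing helper matching the accessors above. */
static inline unsigned long pack_bootmem_private(enum bootmem_type type,
						 unsigned long info)
{
	/* low 4 bits carry the type, the remaining bits carry the info payload */
	return (info << 4) | (unsigned long)type;
}

With this encoding, bootmem_type() and bootmem_info() round-trip the values that get_page_bootmem() records for a section's memmap pages.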
diff --git a/include/linux/codetag.h b/include/linux/codetag.h
index c2a579ccd455..d14dbd26b370 100644
--- a/include/linux/codetag.h
+++ b/include/linux/codetag.h
@@ -13,6 +13,9 @@ struct codetag_module;
struct seq_buf;
struct module;
+#define CODETAG_SECTION_START_PREFIX "__start_"
+#define CODETAG_SECTION_STOP_PREFIX "__stop_"
+
/*
* An instance of this structure is created in a special ELF section at every
* code location being tagged. At runtime, the special section is treated as
@@ -35,8 +38,15 @@ struct codetag_type_desc {
size_t tag_size;
void (*module_load)(struct codetag_type *cttype,
struct codetag_module *cmod);
- bool (*module_unload)(struct codetag_type *cttype,
+ void (*module_unload)(struct codetag_type *cttype,
struct codetag_module *cmod);
+#ifdef CONFIG_MODULES
+ void (*module_replaced)(struct module *mod, struct module *new_mod);
+ bool (*needs_section_mem)(struct module *mod, unsigned long size);
+ void *(*alloc_section_mem)(struct module *mod, unsigned long size,
+ unsigned int prepend, unsigned long align);
+ void (*free_section_mem)(struct module *mod, bool used);
+#endif
};
struct codetag_iterator {
@@ -71,11 +81,31 @@ struct codetag_type *
codetag_register_type(const struct codetag_type_desc *desc);
#if defined(CONFIG_CODE_TAGGING) && defined(CONFIG_MODULES)
+
+bool codetag_needs_module_section(struct module *mod, const char *name,
+ unsigned long size);
+void *codetag_alloc_module_section(struct module *mod, const char *name,
+ unsigned long size, unsigned int prepend,
+ unsigned long align);
+void codetag_free_module_sections(struct module *mod);
+void codetag_module_replaced(struct module *mod, struct module *new_mod);
void codetag_load_module(struct module *mod);
-bool codetag_unload_module(struct module *mod);
-#else
+void codetag_unload_module(struct module *mod);
+
+#else /* defined(CONFIG_CODE_TAGGING) && defined(CONFIG_MODULES) */
+
+static inline bool
+codetag_needs_module_section(struct module *mod, const char *name,
+ unsigned long size) { return false; }
+static inline void *
+codetag_alloc_module_section(struct module *mod, const char *name,
+ unsigned long size, unsigned int prepend,
+ unsigned long align) { return NULL; }
+static inline void codetag_free_module_sections(struct module *mod) {}
+static inline void codetag_module_replaced(struct module *mod, struct module *new_mod) {}
static inline void codetag_load_module(struct module *mod) {}
-static inline bool codetag_unload_module(struct module *mod) { return true; }
-#endif
+static inline void codetag_unload_module(struct module *mod) {}
+
+#endif /* defined(CONFIG_CODE_TAGGING) && defined(CONFIG_MODULES) */
#endif /* _LINUX_CODETAG_H */
diff --git a/include/linux/execmem.h b/include/linux/execmem.h
index 32cef1144117..64130ae19690 100644
--- a/include/linux/execmem.h
+++ b/include/linux/execmem.h
@@ -46,11 +46,27 @@ enum execmem_type {
/**
* enum execmem_range_flags - options for executable memory allocations
* @EXECMEM_KASAN_SHADOW: allocate kasan shadow
+ * @EXECMEM_ROX_CACHE: allocations should use ROX cache of huge pages
*/
enum execmem_range_flags {
EXECMEM_KASAN_SHADOW = (1 << 0),
+ EXECMEM_ROX_CACHE = (1 << 1),
};
+#ifdef CONFIG_ARCH_HAS_EXECMEM_ROX
+/**
+ * execmem_fill_trapping_insns - set memory to contain instructions that
+ * will trap
+ * @ptr: pointer to memory to fill
+ * @size: size of the range to fill
+ * @writable: true if the memory pointed to by @ptr is writable, false if ROX
+ *
+ * A hook for architectures to fill execmem ranges with invalid instructions.
+ * Architectures that use EXECMEM_ROX_CACHE must implement this.
+ */
+void execmem_fill_trapping_insns(void *ptr, size_t size, bool writable);
+#endif
+
/**
* struct execmem_range - definition of an address space suitable for code and
* related data allocations
@@ -123,6 +139,39 @@ void *execmem_alloc(enum execmem_type type, size_t size);
*/
void execmem_free(void *ptr);
+#ifdef CONFIG_MMU
+/**
+ * execmem_vmap - create virtual mapping for EXECMEM_MODULE_DATA memory
+ * @size: size of the virtual mapping in bytes
+ *
+ * Maps a virtually contiguous area in the range suitable for EXECMEM_MODULE_DATA.
+ *
+ * Return: the area descriptor on success or %NULL on failure.
+ */
+struct vm_struct *execmem_vmap(size_t size);
+#endif
+
+/**
+ * execmem_update_copy - copy an update to executable memory
+ * @dst: destination address to update
+ * @src: source address containing the data
+ * @size: how many bytes of memory should be copied
+ *
+ * Copy @size bytes from @src to @dst using text poking if the memory at
+ * @dst is read-only.
+ *
+ * Return: a pointer to @dst or NULL on error
+ */
+void *execmem_update_copy(void *dst, const void *src, size_t size);
+
+/**
+ * execmem_is_rox - check if execmem is read-only
+ * @type: the execmem type to check
+ *
+ * Return: %true if the @type is read-only, %false if it's writable
+ */
+bool execmem_is_rox(enum execmem_type type);
+
#if defined(CONFIG_EXECMEM) && !defined(CONFIG_ARCH_WANTS_EXECMEM_LATE)
void execmem_init(void);
#else
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index a0a6d25f883f..b0fe9f62d15b 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -306,7 +306,7 @@ struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order);
struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
struct mempolicy *mpol, pgoff_t ilx, int nid);
struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma,
- unsigned long addr, bool hugepage);
+ unsigned long addr);
#else
static inline struct page *alloc_pages_noprof(gfp_t gfp_mask, unsigned int order)
{
@@ -326,7 +326,7 @@ static inline struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int orde
{
return folio_alloc_noprof(gfp, order);
}
-#define vma_alloc_folio_noprof(gfp, order, vma, addr, hugepage) \
+#define vma_alloc_folio_noprof(gfp, order, vma, addr) \
folio_alloc_noprof(gfp, order)
#endif
@@ -341,7 +341,7 @@ static inline struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int orde
static inline struct page *alloc_page_vma_noprof(gfp_t gfp,
struct vm_area_struct *vma, unsigned long addr)
{
- struct folio *folio = vma_alloc_folio_noprof(gfp, 0, vma, addr, false);
+ struct folio *folio = vma_alloc_folio_noprof(gfp, 0, vma, addr);
return &folio->page;
}
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 930a591b9b61..6e452bd8e7e3 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -224,13 +224,7 @@ static inline
struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma,
unsigned long vaddr)
{
- struct folio *folio;
-
- folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vaddr, false);
- if (folio)
- clear_user_highpage(&folio->page, vaddr);
-
- return folio;
+ return vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr);
}
#endif
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index ef5b80e48599..b94c2e8ee918 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -2,7 +2,6 @@
#ifndef _LINUX_HUGE_MM_H
#define _LINUX_HUGE_MM_H
-#include <linux/sched/coredump.h>
#include <linux/mm_types.h>
#include <linux/fs.h> /* only for vma_is_dax() */
@@ -120,6 +119,8 @@ enum mthp_stat_item {
MTHP_STAT_ANON_FAULT_ALLOC,
MTHP_STAT_ANON_FAULT_FALLBACK,
MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
+ MTHP_STAT_ZSWPOUT,
+ MTHP_STAT_SWPIN,
MTHP_STAT_SWPOUT,
MTHP_STAT_SWPOUT_FALLBACK,
MTHP_STAT_SHMEM_ALLOC,
@@ -253,19 +254,6 @@ static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma,
return orders;
}
-static inline bool file_thp_enabled(struct vm_area_struct *vma)
-{
- struct inode *inode;
-
- if (!vma->vm_file)
- return false;
-
- inode = vma->vm_file->f_inode;
-
- return (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS)) &&
- !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode);
-}
-
unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
unsigned long vm_flags,
unsigned long tva_flags,
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index e4697539b665..ae4fe8615bb6 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -546,16 +546,10 @@ static inline struct hstate *hstate_inode(struct inode *i)
}
#endif /* !CONFIG_HUGETLBFS */
-#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
-unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
- unsigned long len, unsigned long pgoff,
- unsigned long flags);
-#endif /* HAVE_ARCH_HUGETLB_UNMAPPED_AREA */
-
unsigned long
-generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
- unsigned long len, unsigned long pgoff,
- unsigned long flags);
+hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags);
/*
* hugetlb page specific state flags. These flags are located in page.private
@@ -1035,9 +1029,19 @@ void hugetlb_unregister_node(struct node *node);
*/
bool is_raw_hwpoison_page_in_hugepage(struct page *page);
+static inline unsigned long huge_page_mask_align(struct file *file)
+{
+ return PAGE_MASK & ~huge_page_mask(hstate_file(file));
+}
+
#else /* CONFIG_HUGETLB_PAGE */
struct hstate {};
+static inline unsigned long huge_page_mask_align(struct file *file)
+{
+ return 0;
+}
+
static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio)
{
return NULL;
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 00a3bf7c0d8f..6bbfc8aa42e8 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -29,6 +29,9 @@ typedef unsigned int __bitwise kasan_vmalloc_flags_t;
#define KASAN_VMALLOC_VM_ALLOC ((__force kasan_vmalloc_flags_t)0x02u)
#define KASAN_VMALLOC_PROT_NORMAL ((__force kasan_vmalloc_flags_t)0x04u)
+#define KASAN_VMALLOC_PAGE_RANGE 0x1 /* Apply existing page range */
+#define KASAN_VMALLOC_TLB_FLUSH 0x2 /* TLB flush */
+
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
#include <linux/pgtable.h>
@@ -564,7 +567,8 @@ void kasan_populate_early_vm_area_shadow(void *start, unsigned long size);
int kasan_populate_vmalloc(unsigned long addr, unsigned long size);
void kasan_release_vmalloc(unsigned long start, unsigned long end,
unsigned long free_region_start,
- unsigned long free_region_end);
+ unsigned long free_region_end,
+ unsigned long flags);
#else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
@@ -579,7 +583,8 @@ static inline int kasan_populate_vmalloc(unsigned long start,
static inline void kasan_release_vmalloc(unsigned long start,
unsigned long end,
unsigned long free_region_start,
- unsigned long free_region_end) { }
+ unsigned long free_region_end,
+ unsigned long flags) { }
#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
@@ -614,7 +619,8 @@ static inline int kasan_populate_vmalloc(unsigned long start,
static inline void kasan_release_vmalloc(unsigned long start,
unsigned long end,
unsigned long free_region_start,
- unsigned long free_region_end) { }
+ unsigned long free_region_end,
+ unsigned long flags) { }
static inline void *kasan_unpoison_vmalloc(const void *start,
unsigned long size,
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index 30baae91b225..1f46046080f5 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -2,8 +2,6 @@
#ifndef _LINUX_KHUGEPAGED_H
#define _LINUX_KHUGEPAGED_H
-#include <linux/sched/coredump.h> /* MMF_VM_HUGEPAGE */
-
extern unsigned int khugepaged_max_ptes_none __read_mostly;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern struct attribute_group khugepaged_attr_group;
diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h
index 6a3cd1bf4680..93a73c076d16 100644
--- a/include/linux/kmemleak.h
+++ b/include/linux/kmemleak.h
@@ -26,6 +26,7 @@ extern void kmemleak_free_part(const void *ptr, size_t size) __ref;
extern void kmemleak_free_percpu(const void __percpu *ptr) __ref;
extern void kmemleak_update_trace(const void *ptr) __ref;
extern void kmemleak_not_leak(const void *ptr) __ref;
+extern void kmemleak_transient_leak(const void *ptr) __ref;
extern void kmemleak_ignore(const void *ptr) __ref;
extern void kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) __ref;
extern void kmemleak_no_scan(const void *ptr) __ref;
@@ -93,6 +94,9 @@ static inline void kmemleak_update_trace(const void *ptr)
static inline void kmemleak_not_leak(const void *ptr)
{
}
+static inline void kmemleak_transient_leak(const void *ptr)
+{
+}
static inline void kmemleak_ignore(const void *ptr)
{
}
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index ec9c05044d4f..6a53ac4885bb 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -13,7 +13,6 @@
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/sched.h>
-#include <linux/sched/coredump.h>
#ifdef CONFIG_KSM
int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
@@ -91,7 +90,7 @@ struct folio *ksm_might_need_to_copy(struct folio *folio,
void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc);
void folio_migrate_ksm(struct folio *newfolio, struct folio *folio);
-void collect_procs_ksm(struct folio *folio, struct page *page,
+void collect_procs_ksm(const struct folio *folio, const struct page *page,
struct list_head *to_kill, int force_early);
long ksm_process_profit(struct mm_struct *);
@@ -123,8 +122,9 @@ static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte)
{
}
-static inline void collect_procs_ksm(struct folio *folio, struct page *page,
- struct list_head *to_kill, int force_early)
+static inline void collect_procs_ksm(const struct folio *folio,
+ const struct page *page, struct list_head *to_kill,
+ int force_early)
{
}
diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index 5099a8ccd5f4..05c166811f6b 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -32,6 +32,8 @@ struct list_lru_one {
struct list_head list;
/* may become negative during memcg reparenting */
long nr_items;
+ /* protects all fields above */
+ spinlock_t lock;
};
struct list_lru_memcg {
@@ -41,11 +43,9 @@ struct list_lru_memcg {
};
struct list_lru_node {
- /* protects all lists on the node, including per cgroup */
- spinlock_t lock;
/* global list, used for the root cgroup in cgroup aware lrus */
struct list_lru_one lru;
- long nr_items;
+ atomic_long_t nr_items;
} ____cacheline_aligned_in_smp;
struct list_lru {
@@ -56,16 +56,28 @@ struct list_lru {
bool memcg_aware;
struct xarray xa;
#endif
+#ifdef CONFIG_LOCKDEP
+ struct lock_class_key *key;
+#endif
};
void list_lru_destroy(struct list_lru *lru);
int __list_lru_init(struct list_lru *lru, bool memcg_aware,
- struct lock_class_key *key, struct shrinker *shrinker);
+ struct shrinker *shrinker);
#define list_lru_init(lru) \
- __list_lru_init((lru), false, NULL, NULL)
+ __list_lru_init((lru), false, NULL)
#define list_lru_init_memcg(lru, shrinker) \
- __list_lru_init((lru), true, NULL, shrinker)
+ __list_lru_init((lru), true, shrinker)
+
+static inline int list_lru_init_memcg_key(struct list_lru *lru, struct shrinker *shrinker,
+ struct lock_class_key *key)
+{
+#ifdef CONFIG_LOCKDEP
+ lru->key = key;
+#endif
+ return list_lru_init_memcg(lru, shrinker);
+}
int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
gfp_t gfp);
@@ -172,7 +184,7 @@ void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
struct list_head *head);
typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item,
- struct list_lru_one *list, spinlock_t *lock, void *cb_arg);
+ struct list_lru_one *list, void *cb_arg);
/**
* list_lru_walk_one: walk a @lru, isolating and disposing freeable items.
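With the spinlock folded into struct list_lru_one, isolate callbacks now take the lock from the @list argument, and, as the inode and XFS dquot hunks above show, a callback that drops the lock returns LRU_RETRY without re-acquiring it. A minimal callback against the new signature; the blocking-work predicate and the dispose list are illustrative:

/* Sketch of an isolate callback for the lock-in-lru scheme. */
static enum lru_status demo_lru_isolate(struct list_head *item,
		struct list_lru_one *lru, void *arg)
{
	struct list_head *dispose = arg;

	if (demo_item_needs_blocking_work(item)) {	/* hypothetical predicate */
		spin_unlock(&lru->lock);		/* the lock now lives in *lru */
		/* ... sleepable work ... */
		return LRU_RETRY;			/* the walker re-takes &lru->lock */
	}

	list_lru_isolate_move(lru, item, dispose);
	return LRU_REMOVED;
}

Callers that previously passed a lock_class_key to __list_lru_init() switch to list_lru_init_memcg_key(), which stashes the key in the lru before the regular init.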
diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index c2c11004085e..cbbcd18d4186 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -224,7 +224,7 @@ typedef struct { /* nothing */ } lockdep_map_p;
* (set at tree creation time) and dynamic information set under the spinlock.
*
* Another use of flags are to indicate global states of the tree. This is the
- * case with the MAPLE_USE_RCU flag, which indicates the tree is currently in
+ * case with the MT_FLAGS_USE_RCU flag, which indicates the tree is currently in
* RCU mode. This mode was added to allow the tree to reuse nodes instead of
* re-allocating and RCU freeing nodes when there is a single user.
*/
@@ -592,6 +592,20 @@ static __always_inline void mas_reset(struct ma_state *mas)
#define mas_for_each(__mas, __entry, __max) \
while (((__entry) = mas_find((__mas), (__max))) != NULL)
+/**
+ * mas_for_each_rev() - Iterate over a range of the maple tree in reverse order.
+ * @__mas: Maple Tree operation state (maple_state)
+ * @__entry: Entry retrieved from the tree
+ * @__min: minimum index to retrieve from the tree
+ *
+ * When returned, mas->index and mas->last will hold the entire range for the
+ * entry.
+ *
+ * Note: may return the zero entry.
+ */
+#define mas_for_each_rev(__mas, __entry, __min) \
+ while (((__entry) = mas_find_rev((__mas), (__min))) != NULL)
+
#ifdef CONFIG_DEBUG_MAPLE_TREE
enum mt_dump_format {
mt_dump_dec,
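A short usage sketch for the new reverse iterator; the tree name, locking choice, and index bounds are illustrative assumptions:

/* Walk entries from index 200 down to 0 (sketch). */
MA_STATE(mas, &demo_tree, 200, 200);
void *entry;

rcu_read_lock();
mas_for_each_rev(&mas, entry, 0) {
	/* mas.index and mas.last hold the full range of this entry */
	pr_info("entry %p spans [%lu, %lu]\n", entry, mas.index, mas.last);
}
rcu_read_unlock();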
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index e1b41554a5fb..5502aa8e138e 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -299,25 +299,10 @@ struct mem_cgroup {
/* For oom notifier event fd */
struct list_head oom_notify;
- /*
- * Should we move charges of a task when a task is moved into this
- * mem_cgroup ? And what type of charges should we move ?
- */
- unsigned long move_charge_at_immigrate;
- /* taken only while moving_account > 0 */
- spinlock_t move_lock;
- unsigned long move_lock_flags;
-
/* Legacy tcp memory accounting */
bool tcpmem_active;
int tcpmem_pressure;
- /*
- * set > 0 if pages under this cgroup are moving to other cgroup.
- */
- atomic_t moving_account;
- struct task_struct *move_lock_task;
-
/* List of events which userspace want to receive */
struct list_head event_list;
spinlock_t event_list_lock;
@@ -433,9 +418,7 @@ static inline struct obj_cgroup *__folio_objcg(struct folio *folio)
*
* - the folio lock
* - LRU isolation
- * - folio_memcg_lock()
* - exclusive reference
- * - mem_cgroup_trylock_pages()
*
* For a kmem folio a caller should hold an rcu read lock to protect memcg
* associated with a kmem folio from being released.
@@ -460,35 +443,6 @@ static inline bool folio_memcg_charged(struct folio *folio)
return __folio_memcg(folio) != NULL;
}
-/**
- * folio_memcg_rcu - Locklessly get the memory cgroup associated with a folio.
- * @folio: Pointer to the folio.
- *
- * This function assumes that the folio is known to have a
- * proper memory cgroup pointer. It's not safe to call this function
- * against some type of folios, e.g. slab folios or ex-slab folios.
- *
- * Return: A pointer to the memory cgroup associated with the folio,
- * or NULL.
- */
-static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
-{
- unsigned long memcg_data = READ_ONCE(folio->memcg_data);
-
- VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
-
- if (memcg_data & MEMCG_DATA_KMEM) {
- struct obj_cgroup *objcg;
-
- objcg = (void *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
- return obj_cgroup_memcg(objcg);
- }
-
- WARN_ON_ONCE(!rcu_read_lock_held());
-
- return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
-}
-
/*
* folio_memcg_check - Get the memory cgroup associated with a folio.
* @folio: Pointer to the folio.
@@ -504,9 +458,7 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
*
* - the folio lock
* - LRU isolation
- * - lock_folio_memcg()
* - exclusive reference
- * - mem_cgroup_trylock_pages()
*
* For a kmem folio a caller should hold an rcu read lock to protect memcg
* associated with a kmem folio from being released.
@@ -1103,10 +1055,9 @@ static inline struct mem_cgroup *folio_memcg(struct folio *folio)
return NULL;
}
-static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
+static inline bool folio_memcg_charged(struct folio *folio)
{
- WARN_ON_ONCE(!rcu_read_lock_held());
- return NULL;
+ return false;
}
static inline struct mem_cgroup *folio_memcg_check(struct folio *folio)
@@ -1282,6 +1233,10 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css)
return NULL;
}
+static inline void obj_cgroup_get(struct obj_cgroup *objcg)
+{
+}
+
static inline void obj_cgroup_put(struct obj_cgroup *objcg)
{
}
@@ -1874,26 +1829,6 @@ static inline bool task_in_memcg_oom(struct task_struct *p)
return p->memcg_in_oom;
}
-void folio_memcg_lock(struct folio *folio);
-void folio_memcg_unlock(struct folio *folio);
-
-/* try to stablize folio_memcg() for all the pages in a memcg */
-static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
-{
- rcu_read_lock();
-
- if (mem_cgroup_disabled() || !atomic_read(&memcg->moving_account))
- return true;
-
- rcu_read_unlock();
- return false;
-}
-
-static inline void mem_cgroup_unlock_pages(void)
-{
- rcu_read_unlock();
-}
-
static inline void mem_cgroup_enter_user_fault(void)
{
WARN_ON(current->in_user_fault);
@@ -1915,26 +1850,6 @@ unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order,
return 0;
}
-static inline void folio_memcg_lock(struct folio *folio)
-{
-}
-
-static inline void folio_memcg_unlock(struct folio *folio)
-{
-}
-
-static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
-{
- /* to match folio_memcg_rcu() */
- rcu_read_lock();
- return true;
-}
-
-static inline void mem_cgroup_unlock_pages(void)
-{
- rcu_read_unlock();
-}
-
static inline bool task_in_memcg_oom(struct task_struct *p)
{
return false;
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 1add16f21612..ce9885e0178a 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -47,7 +47,7 @@ struct mempolicy {
atomic_t refcnt;
unsigned short mode; /* See MPOL_* above */
unsigned short flags; /* See set_mempolicy() MPOL_F_* above */
- nodemask_t nodes; /* interleave/bind/perfer */
+ nodemask_t nodes; /* interleave/bind/preferred/etc */
int home_node; /* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */
union {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 673771f34674..2bbf73eb53e7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -97,11 +97,11 @@ extern const int mmap_rnd_compat_bits_max;
extern int mmap_rnd_compat_bits __read_mostly;
#endif
-#ifndef PHYSMEM_END
+#ifndef DIRECT_MAP_PHYSMEM_END
# ifdef MAX_PHYSMEM_BITS
-# define PHYSMEM_END ((1ULL << MAX_PHYSMEM_BITS) - 1)
+# define DIRECT_MAP_PHYSMEM_END ((1ULL << MAX_PHYSMEM_BITS) - 1)
# else
-# define PHYSMEM_END (((phys_addr_t)-1)&~(1ULL<<63))
+# define DIRECT_MAP_PHYSMEM_END (((phys_addr_t)-1)&~(1ULL<<63))
# endif
#endif
@@ -1298,8 +1298,6 @@ static inline struct folio *virt_to_folio(const void *x)
void __folio_put(struct folio *folio);
-void put_pages_list(struct list_head *pages);
-
void split_page(struct page *page, unsigned int order);
void folio_copy(struct folio *dst, struct folio *src);
int folio_mc_copy(struct folio *dst, struct folio *src);
@@ -1907,7 +1905,7 @@ static inline unsigned long page_to_section(const struct page *page)
*
* Return: The Page Frame Number of the first page in the folio.
*/
-static inline unsigned long folio_pfn(struct folio *folio)
+static inline unsigned long folio_pfn(const struct folio *folio)
{
return page_to_pfn(&folio->page);
}
@@ -3028,8 +3026,11 @@ static inline pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
return pte;
}
-pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
- unsigned long addr, spinlock_t **ptlp);
+pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, spinlock_t **ptlp);
+pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, pmd_t *pmdvalp,
+ spinlock_t **ptlp);
#define pte_unmap_unlock(pte, ptl) do { \
spin_unlock(ptl); \
@@ -3831,9 +3832,6 @@ void *sparse_buffer_alloc(unsigned long size);
struct page * __populate_section_memmap(unsigned long pfn,
unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
struct dev_pagemap *pgmap);
-void pud_init(void *addr);
-void pmd_init(void *addr);
-void kernel_pte_init(void *addr);
pgd_t *vmemmap_pgd_populate(unsigned long addr, int node);
p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node);
pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node);
@@ -4176,63 +4174,6 @@ static inline int do_mseal(unsigned long start, size_t len_in, unsigned long fla
}
#endif
-#ifdef CONFIG_MEM_ALLOC_PROFILING
-static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order)
-{
- int i;
- struct alloc_tag *tag;
- unsigned int nr_pages = 1 << new_order;
-
- if (!mem_alloc_profiling_enabled())
- return;
-
- tag = pgalloc_tag_get(&folio->page);
- if (!tag)
- return;
-
- for (i = nr_pages; i < (1 << old_order); i += nr_pages) {
- union codetag_ref *ref = get_page_tag_ref(folio_page(folio, i));
-
- if (ref) {
- /* Set new reference to point to the original tag */
- alloc_tag_ref_set(ref, tag);
- put_page_tag_ref(ref);
- }
- }
-}
-
-static inline void pgalloc_tag_copy(struct folio *new, struct folio *old)
-{
- struct alloc_tag *tag;
- union codetag_ref *ref;
-
- tag = pgalloc_tag_get(&old->page);
- if (!tag)
- return;
-
- ref = get_page_tag_ref(&new->page);
- if (!ref)
- return;
-
- /* Clear the old ref to the original allocation tag. */
- clear_page_tag_ref(&old->page);
- /* Decrement the counters of the tag on get_new_folio. */
- alloc_tag_sub(ref, folio_nr_pages(new));
-
- __alloc_tag_ref_set(ref, tag);
-
- put_page_tag_ref(ref);
-}
-#else /* !CONFIG_MEM_ALLOC_PROFILING */
-static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order)
-{
-}
-
-static inline void pgalloc_tag_copy(struct folio *new, struct folio *old)
-{
-}
-#endif /* CONFIG_MEM_ALLOC_PROFILING */
-
int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status);
int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status);
int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status);
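pte_offset_map_nolock() is replaced by a read-only and a read-write variant; the rw variant additionally reports the pmd value through @pmdvalp so the caller can re-validate it after taking the PTL. A sketch of the read-only variant for a scan-only walk; the surrounding walk and error handling are assumed:

/* Scan the PTEs of one PMD without taking the PTL (sketch). */
static void demo_scan_pmd(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
{
	spinlock_t *ptl;	/* returned should the caller later decide to lock */
	pte_t *start_pte, *pte;
	int i;

	start_pte = pte = pte_offset_map_ro_nolock(mm, pmd, addr, &ptl);
	if (!pte)
		return;		/* no PTE table here, or the PMD changed under us */

	for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
		pte_t entry = ptep_get(pte);
		/* ... inspect entry, tolerating concurrent updates ... */
	}
	pte_unmap(start_pte);
}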
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index f4fe593c1400..1b6a917fffa4 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -155,6 +155,11 @@ static inline int folio_lru_refs(struct folio *folio)
return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + workingset;
}
+static inline void folio_clear_lru_refs(struct folio *folio)
+{
+ set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0);
+}
+
static inline int folio_lru_gen(struct folio *folio)
{
unsigned long flags = READ_ONCE(folio->flags);
@@ -222,6 +227,7 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio,
{
unsigned long seq;
unsigned long flags;
+ unsigned long mask;
int gen = folio_lru_gen(folio);
int type = folio_is_file_lru(folio);
int zone = folio_zonenum(folio);
@@ -257,7 +263,14 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio,
gen = lru_gen_from_seq(seq);
flags = (gen + 1UL) << LRU_GEN_PGOFF;
/* see the comment on MIN_NR_GENS about PG_active */
- set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags);
+ mask = LRU_GEN_MASK;
+ /*
+ * Don't clear PG_workingset here because it can affect PSI accounting
+ * if the activation is due to workingset refault.
+ */
+ if (folio_test_active(folio))
+ mask |= LRU_REFS_MASK | BIT(PG_referenced) | BIT(PG_active);
+ set_mask_bits(&folio->flags, mask, flags);
lru_gen_update_size(lruvec, folio, -1, gen);
/* for folio_rotate_reclaimable() */
@@ -291,6 +304,12 @@ static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio,
return true;
}
+static inline void folio_migrate_refs(struct folio *new, struct folio *old)
+{
+ unsigned long refs = READ_ONCE(old->flags) & LRU_REFS_MASK;
+
+ set_mask_bits(&new->flags, LRU_REFS_MASK, refs);
+}
#else /* !CONFIG_LRU_GEN */
static inline bool lru_gen_enabled(void)
@@ -313,6 +332,10 @@ static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio,
return false;
}
+static inline void folio_migrate_refs(struct folio *new, struct folio *old)
+{
+}
#endif /* CONFIG_LRU_GEN */
static __always_inline
@@ -521,7 +544,7 @@ static inline pte_marker copy_pte_marker(
{
pte_marker srcm = pte_marker_get(entry);
/* Always copy error entries. */
- pte_marker dstm = srcm & PTE_MARKER_POISONED;
+ pte_marker dstm = srcm & (PTE_MARKER_POISONED | PTE_MARKER_GUARD);
/* Only copy PTE markers if UFFD register matches. */
if ((srcm & PTE_MARKER_UFFD_WP) && userfaultfd_wp(dst_vma))
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index e85beea1206e..7361a8f3ab68 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1535,4 +1535,88 @@ enum {
/* See also internal only FOLL flags in mm/internal.h */
};
+/* mm flags */
+
+/*
+ * The first two bits represent core dump modes for set-user-ID,
+ * the modes are SUID_DUMP_* defined in linux/sched/coredump.h
+ */
+#define MMF_DUMPABLE_BITS 2
+#define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1)
+/* coredump filter bits */
+#define MMF_DUMP_ANON_PRIVATE 2
+#define MMF_DUMP_ANON_SHARED 3
+#define MMF_DUMP_MAPPED_PRIVATE 4
+#define MMF_DUMP_MAPPED_SHARED 5
+#define MMF_DUMP_ELF_HEADERS 6
+#define MMF_DUMP_HUGETLB_PRIVATE 7
+#define MMF_DUMP_HUGETLB_SHARED 8
+#define MMF_DUMP_DAX_PRIVATE 9
+#define MMF_DUMP_DAX_SHARED 10
+
+#define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS
+#define MMF_DUMP_FILTER_BITS 9
+#define MMF_DUMP_FILTER_MASK \
+ (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT)
+#define MMF_DUMP_FILTER_DEFAULT \
+ ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED) |\
+ (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF)
+
+#ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS
+# define MMF_DUMP_MASK_DEFAULT_ELF (1 << MMF_DUMP_ELF_HEADERS)
+#else
+# define MMF_DUMP_MASK_DEFAULT_ELF 0
+#endif
+ /* leave room for more dump flags */
+#define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */
+#define MMF_VM_HUGEPAGE 17 /* set when mm is available for khugepaged */
+
+/*
+ * This one-shot flag is dropped due to necessity of changing exe once again
+ * on NFS restore
+ */
+//#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */
+
+#define MMF_HAS_UPROBES 19 /* has uprobes */
+#define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */
+#define MMF_OOM_SKIP 21 /* mm is of no interest for the OOM killer */
+#define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */
+#define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */
+#define MMF_DISABLE_THP 24 /* disable THP for all VMAs */
+#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP)
+#define MMF_OOM_REAP_QUEUED 25 /* mm was queued for oom_reaper */
+#define MMF_MULTIPROCESS 26 /* mm is shared between processes */
+/*
+ * MMF_HAS_PINNED: Whether this mm has pinned any pages. This can be either
+ * replaced in the future by mm.pinned_vm when it becomes stable, or grow into
+ * a counter on its own. We're aggressive on this bit for now: even if the
+ * pinned pages were unpinned later on, we'll still keep this bit set for the
+ * lifecycle of this mm, just for simplicity.
+ */
+#define MMF_HAS_PINNED 27 /* FOLL_PIN has run, never cleared */
+
+#define MMF_HAS_MDWE 28
+#define MMF_HAS_MDWE_MASK (1 << MMF_HAS_MDWE)
+
+#define MMF_HAS_MDWE_NO_INHERIT 29
+
+#define MMF_VM_MERGE_ANY 30
+#define MMF_VM_MERGE_ANY_MASK (1 << MMF_VM_MERGE_ANY)
+
+#define MMF_TOPDOWN 31 /* mm searches top down by default */
+#define MMF_TOPDOWN_MASK (1 << MMF_TOPDOWN)
+
+#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
+ MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\
+ MMF_VM_MERGE_ANY_MASK | MMF_TOPDOWN_MASK)
+
+static inline unsigned long mmf_init_flags(unsigned long flags)
+{
+ if (flags & (1UL << MMF_HAS_MDWE_NO_INHERIT))
+ flags &= ~((1UL << MMF_HAS_MDWE) |
+ (1UL << MMF_HAS_MDWE_NO_INHERIT));
+ return flags & MMF_INIT_MASK;
+}
+
#endif /* _LINUX_MM_TYPES_H */
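The flag block above is self-contained enough to show the MDWE inheritance rule in mmf_init_flags() with concrete values; this is a worked illustration, not code from the patch:

/* Worked example of mmf_init_flags() (sketch). */
unsigned long parent_flags = (1UL << MMF_HAS_MDWE) |
			     (1UL << MMF_HAS_MDWE_NO_INHERIT) |
			     (1UL << MMF_TOPDOWN) |
			     (1UL << MMF_HAS_UPROBES);
unsigned long child_flags = mmf_init_flags(parent_flags);

/*
 * child_flags keeps MMF_TOPDOWN (it is part of MMF_INIT_MASK), drops both
 * MDWE bits because MMF_HAS_MDWE_NO_INHERIT was set, and drops
 * MMF_HAS_UPROBES because it is not in MMF_INIT_MASK.
 */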
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index d39ebb10caeb..e2dd57ca368b 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -606,6 +606,13 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
return 0;
}
+static inline int mmu_notifier_clear_young(struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ return 0;
+}
+
static inline int mmu_notifier_test_young(struct mm_struct *mm,
unsigned long address)
{
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 80bc5640bb60..b36124145a16 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -220,6 +220,9 @@ enum node_stat_item {
PGDEMOTE_KSWAPD,
PGDEMOTE_DIRECT,
PGDEMOTE_KHUGEPAGED,
+#ifdef CONFIG_HUGETLB_PAGE
+ NR_HUGETLB,
+#endif
NR_VM_NODE_STAT_ITEMS
};
@@ -403,6 +406,8 @@ enum {
NR_LRU_GEN_CAPS
};
+#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset))
+
#define MIN_LRU_BATCH BITS_PER_LONG
#define MAX_LRU_BATCH (MIN_LRU_BATCH * 64)
diff --git a/include/linux/module.h b/include/linux/module.h
index 88ecc5e9f523..2a9386cbdf85 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -367,6 +367,8 @@ enum mod_mem_type {
struct module_memory {
void *base;
+ void *rw_copy;
+ bool is_rox;
unsigned int size;
#ifdef CONFIG_MODULES_TREE_LOOKUP
@@ -767,6 +769,15 @@ static inline bool is_livepatch_module(struct module *mod)
void set_module_sig_enforced(void);
+void *__module_writable_address(struct module *mod, void *loc);
+
+static inline void *module_writable_address(struct module *mod, void *loc)
+{
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_EXECMEM_ROX) || !mod)
+ return loc;
+ return __module_writable_address(mod, loc);
+}
+
#else /* !CONFIG_MODULES... */
static inline struct module *__module_address(unsigned long addr)
@@ -874,6 +885,11 @@ static inline bool module_is_coming(struct module *mod)
{
return false;
}
+
+static inline void *module_writable_address(struct module *mod, void *loc)
+{
+ return loc;
+}
#endif /* CONFIG_MODULES */
#ifdef CONFIG_SYSFS
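module_writable_address() is the indirection that arch and module code are expected to use when module memory may sit in a ROX cache: fixups go to the writable copy instead of the final, read-only-executable address. A hedged sketch of a relocation write under that assumption:

/* Apply a 32-bit relocation to possibly-ROX module memory (sketch). */
static void demo_write_reloc(struct module *mod, u32 *loc, u32 val)
{
	u32 *wr_loc = module_writable_address(mod, loc);

	*wr_loc = val;	/* lands in the RW copy when the memory type is ROX */
}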
diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h
index e395461d59e5..1f5507ba5a12 100644
--- a/include/linux/moduleloader.h
+++ b/include/linux/moduleloader.h
@@ -108,6 +108,10 @@ int module_finalize(const Elf_Ehdr *hdr,
const Elf_Shdr *sechdrs,
struct module *mod);
+int module_post_finalize(const Elf_Ehdr *hdr,
+ const Elf_Shdr *sechdrs,
+ struct module *mod);
+
#ifdef CONFIG_MODULES
void flush_module_init_free_work(void);
#else
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 7d0c9c48a0c5..1e0fc6931ce9 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -7,7 +7,6 @@
#include <linux/types.h>
#include <linux/nodemask.h>
#include <uapi/linux/oom.h>
-#include <linux/sched/coredump.h> /* MMF_* */
#include <linux/mm.h> /* VM_FAULT* */
struct zonelist;
diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h
index 7d79818dc065..4f5c9e979bb9 100644
--- a/include/linux/page-flags-layout.h
+++ b/include/linux/page-flags-layout.h
@@ -111,5 +111,12 @@
ZONES_WIDTH - LRU_GEN_WIDTH - SECTIONS_WIDTH - \
NODES_WIDTH - KASAN_TAG_WIDTH - LAST_CPUPID_WIDTH)
+#define NR_NON_PAGEFLAG_BITS (SECTIONS_WIDTH + NODES_WIDTH + ZONES_WIDTH + \
+ LAST_CPUPID_SHIFT + KASAN_TAG_WIDTH + \
+ LRU_GEN_WIDTH + LRU_REFS_WIDTH)
+
+#define NR_UNUSED_PAGEFLAG_BITS (BITS_PER_LONG - \
+ (NR_NON_PAGEFLAG_BITS + NR_PAGEFLAGS))
+
#endif
#endif /* _LINUX_PAGE_FLAGS_LAYOUT */
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 908ee0aad554..2220bfec278e 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -689,6 +689,13 @@ static __always_inline bool folio_test_anon(const struct folio *folio)
return ((unsigned long)folio->mapping & PAGE_MAPPING_ANON) != 0;
}
+static __always_inline bool PageAnonNotKsm(const struct page *page)
+{
+ unsigned long flags = (unsigned long)page_folio(page)->mapping;
+
+ return (flags & PAGE_MAPPING_FLAGS) == PAGE_MAPPING_ANON;
+}
+
static __always_inline bool PageAnon(const struct page *page)
{
return folio_test_anon(page_folio(page));
@@ -718,13 +725,8 @@ static __always_inline bool folio_test_ksm(const struct folio *folio)
return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) ==
PAGE_MAPPING_KSM;
}
-
-static __always_inline bool PageKsm(const struct page *page)
-{
- return folio_test_ksm(page_folio(page));
-}
#else
-TESTPAGEFLAG_FALSE(Ksm, ksm)
+FOLIO_TEST_FLAG_FALSE(ksm)
#endif
u64 stable_page_flags(const struct page *page);
@@ -1137,14 +1139,14 @@ static __always_inline int PageAnonExclusive(const struct page *page)
static __always_inline void SetPageAnonExclusive(struct page *page)
{
- VM_BUG_ON_PGFLAGS(!PageAnon(page) || PageKsm(page), page);
+ VM_BUG_ON_PGFLAGS(!PageAnonNotKsm(page), page);
VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page);
set_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags);
}
static __always_inline void ClearPageAnonExclusive(struct page *page)
{
- VM_BUG_ON_PGFLAGS(!PageAnon(page) || PageKsm(page), page);
+ VM_BUG_ON_PGFLAGS(!PageAnonNotKsm(page), page);
VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page);
clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags);
}
diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index c16db0067090..73dc2c1841ec 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -3,10 +3,6 @@
#define __LINUX_PAGEISOLATION_H
#ifdef CONFIG_MEMORY_ISOLATION
-static inline bool has_isolate_pageblock(struct zone *zone)
-{
- return zone->nr_isolate_pageblock;
-}
static inline bool is_migrate_isolate_page(struct page *page)
{
return get_pageblock_migratetype(page) == MIGRATE_ISOLATE;
@@ -16,10 +12,6 @@ static inline bool is_migrate_isolate(int migratetype)
return migratetype == MIGRATE_ISOLATE;
}
#else
-static inline bool has_isolate_pageblock(struct zone *zone)
-{
- return false;
-}
static inline bool is_migrate_isolate_page(struct page *page)
{
return false;
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 68a5f1ff3301..bcf0865a38ae 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -1011,22 +1011,25 @@ static inline struct folio *read_mapping_folio(struct address_space *mapping,
return read_cache_folio(mapping, index, NULL, file);
}
-/*
- * Get the offset in PAGE_SIZE (even for hugetlb pages).
+/**
+ * page_pgoff - Calculate the logical page offset of this page.
+ * @folio: The folio containing this page.
+ * @page: The page which we need the offset of.
+ *
+ * For file pages, this is the offset from the beginning of the file
+ * in units of PAGE_SIZE. For anonymous pages, this is the offset from
+ * the beginning of the anon_vma in units of PAGE_SIZE. This will
+ * return nonsense for KSM pages.
+ *
+ * Context: Caller must have a reference on the folio or otherwise
+ * prevent it from being split or freed.
+ *
+ * Return: The offset in units of PAGE_SIZE.
*/
-static inline pgoff_t page_to_pgoff(struct page *page)
+static inline pgoff_t page_pgoff(const struct folio *folio,
+ const struct page *page)
{
- struct page *head;
-
- if (likely(!PageTransTail(page)))
- return page->index;
-
- head = compound_head(page);
- /*
- * We don't initialize ->index for tail pages: calculate based on
- * head page
- */
- return head->index + page - head;
+ return folio->index + folio_page_idx(folio, page);
}
/*
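page_pgoff() replaces page_to_pgoff(), taking the folio explicitly so tail pages no longer need a compound_head() lookup. A sketch of the usual rmap-style consumer; treating the page as known to be mapped inside @vma is an assumption left to the caller:

/* Translate a mapped page back to its user address in @vma (sketch). */
static unsigned long demo_page_address_in_vma(const struct folio *folio,
					      const struct page *page,
					      const struct vm_area_struct *vma)
{
	pgoff_t pgoff = page_pgoff(folio, page);

	/* assumes the page is mapped inside @vma */
	return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
}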
diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
index f5eb5a32aeed..9700a29f8afb 100644
--- a/include/linux/pagewalk.h
+++ b/include/linux/pagewalk.h
@@ -25,12 +25,15 @@ enum page_walk_lock {
* this handler is required to be able to handle
* pmd_trans_huge() pmds. They may simply choose to
* split_huge_page() instead of handling it explicitly.
- * @pte_entry: if set, called for each PTE (lowest-level) entry,
- * including empty ones
+ * @pte_entry: if set, called for each PTE (lowest-level) entry
+ * including empty ones, except if @install_pte is set.
+ * If @install_pte is set, @pte_entry is called only for
+ * existing PTEs.
* @pte_hole: if set, called for each hole at all levels,
* depth is -1 if not known, 0:PGD, 1:P4D, 2:PUD, 3:PMD.
* Any folded depths (where PTRS_PER_P?D is equal to 1)
- * are skipped.
+ * are skipped. If @install_pte is specified, this will
+ * not trigger for any populated ranges.
* @hugetlb_entry: if set, called for each hugetlb entry. This hook
* function is called with the vma lock held, in order to
* protect against a concurrent freeing of the pte_t* or
@@ -51,6 +54,13 @@ enum page_walk_lock {
* @pre_vma: if set, called before starting walk on a non-null vma.
* @post_vma: if set, called after a walk on a non-null vma, provided
* that @pre_vma and the vma walk succeeded.
+ * @install_pte: if set, missing page table entries are installed and
+ * thus all levels are always walked in the specified
+ * range. This callback is then invoked at the PTE level
+ * (having split any THP pages prior), providing the PTE to
+ * install. If allocations fail, the walk is aborted. This
+ * operation is only available for userland memory. Not
+ * usable for hugetlb ranges.
*
* p?d_entry callbacks are called even if those levels are folded on a
* particular architecture/configuration.
@@ -76,6 +86,8 @@ struct mm_walk_ops {
int (*pre_vma)(unsigned long start, unsigned long end,
struct mm_walk *walk);
void (*post_vma)(struct mm_walk *walk);
+ int (*install_pte)(unsigned long addr, unsigned long next,
+ pte_t *ptep, struct mm_walk *walk);
enum page_walk_lock walk_lock;
};
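When install_pte is set, the walk allocates any missing page-table levels and hands every empty PTE slot to that callback, while pte_entry only sees entries that already exist. A heavily hedged sketch of an ops table using it; writing a guard marker via set_pte_at() is an assumption about how such a callback fills the slot, suggested by the PTE_MARKER_GUARD hunk earlier in this series rather than by this header:

/* Sketch: install a guard marker in every previously-empty PTE of a range. */
static int demo_install_guard_pte(unsigned long addr, unsigned long next,
				  pte_t *ptep, struct mm_walk *walk)
{
	set_pte_at(walk->mm, addr, ptep, make_pte_marker(PTE_MARKER_GUARD));
	return 0;
}

static const struct mm_walk_ops demo_guard_ops = {
	.install_pte	= demo_install_guard_pte,
	.walk_lock	= PGWALK_WRLOCK,
};

A caller would then run walk_page_range(mm, start, end, &demo_guard_ops, NULL) over a userland range; a failed page-table allocation aborts the walk, as documented above.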
diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h
index 59a3deb792a8..0e43ab653ab6 100644
--- a/include/linux/pgalloc_tag.h
+++ b/include/linux/pgalloc_tag.h
@@ -12,45 +12,166 @@
#include <linux/page_ext.h>
extern struct page_ext_operations page_alloc_tagging_ops;
+extern unsigned long alloc_tag_ref_mask;
+extern int alloc_tag_ref_offs;
+extern struct alloc_tag_kernel_section kernel_tags;
-static inline union codetag_ref *codetag_ref_from_page_ext(struct page_ext *page_ext)
+DECLARE_STATIC_KEY_FALSE(mem_profiling_compressed);
+
+typedef u16 pgalloc_tag_idx;
+
+union pgtag_ref_handle {
+ union codetag_ref *ref; /* reference in page extension */
+ struct page *page; /* reference in page flags */
+};
+
+/* Reserved indexes */
+#define CODETAG_ID_NULL 0
+#define CODETAG_ID_EMPTY 1
+#define CODETAG_ID_FIRST 2
+
+#ifdef CONFIG_MODULES
+
+extern struct alloc_tag_module_section module_tags;
+
+static inline struct alloc_tag *module_idx_to_tag(pgalloc_tag_idx idx)
{
- return (union codetag_ref *)page_ext_data(page_ext, &page_alloc_tagging_ops);
+ return &module_tags.first_tag[idx - kernel_tags.count];
}
-static inline struct page_ext *page_ext_from_codetag_ref(union codetag_ref *ref)
+static inline pgalloc_tag_idx module_tag_to_idx(struct alloc_tag *tag)
{
- return (void *)ref - page_alloc_tagging_ops.offset;
+ return CODETAG_ID_FIRST + kernel_tags.count + (tag - module_tags.first_tag);
}
-/* Should be called only if mem_alloc_profiling_enabled() */
-static inline union codetag_ref *get_page_tag_ref(struct page *page)
+#else /* CONFIG_MODULES */
+
+static inline struct alloc_tag *module_idx_to_tag(pgalloc_tag_idx idx)
{
- if (page) {
- struct page_ext *page_ext = page_ext_get(page);
+ pr_warn("invalid page tag reference %lu\n", (unsigned long)idx);
+ return NULL;
+}
- if (page_ext)
- return codetag_ref_from_page_ext(page_ext);
+static inline pgalloc_tag_idx module_tag_to_idx(struct alloc_tag *tag)
+{
+ pr_warn("invalid page tag 0x%lx\n", (unsigned long)tag);
+ return CODETAG_ID_NULL;
+}
+
+#endif /* CONFIG_MODULES */
+
+static inline void idx_to_ref(pgalloc_tag_idx idx, union codetag_ref *ref)
+{
+ switch (idx) {
+ case (CODETAG_ID_NULL):
+ ref->ct = NULL;
+ break;
+ case (CODETAG_ID_EMPTY):
+ set_codetag_empty(ref);
+ break;
+ default:
+ idx -= CODETAG_ID_FIRST;
+ ref->ct = idx < kernel_tags.count ?
+ &kernel_tags.first_tag[idx].ct :
+ &module_idx_to_tag(idx)->ct;
+ break;
}
- return NULL;
}
-static inline void put_page_tag_ref(union codetag_ref *ref)
+static inline pgalloc_tag_idx ref_to_idx(union codetag_ref *ref)
+{
+ struct alloc_tag *tag;
+
+ if (!ref->ct)
+ return CODETAG_ID_NULL;
+
+ if (is_codetag_empty(ref))
+ return CODETAG_ID_EMPTY;
+
+ tag = ct_to_alloc_tag(ref->ct);
+ if (tag >= kernel_tags.first_tag && tag < kernel_tags.first_tag + kernel_tags.count)
+ return CODETAG_ID_FIRST + (tag - kernel_tags.first_tag);
+
+ return module_tag_to_idx(tag);
+}
+
+
+
+/* Should be called only if mem_alloc_profiling_enabled() */
+static inline bool get_page_tag_ref(struct page *page, union codetag_ref *ref,
+ union pgtag_ref_handle *handle)
{
- if (WARN_ON(!ref))
+ if (!page)
+ return false;
+
+ if (static_key_enabled(&mem_profiling_compressed)) {
+ pgalloc_tag_idx idx;
+
+ idx = (page->flags >> alloc_tag_ref_offs) & alloc_tag_ref_mask;
+ idx_to_ref(idx, ref);
+ handle->page = page;
+ } else {
+ struct page_ext *page_ext;
+ union codetag_ref *tmp;
+
+ page_ext = page_ext_get(page);
+ if (!page_ext)
+ return false;
+
+ tmp = (union codetag_ref *)page_ext_data(page_ext, &page_alloc_tagging_ops);
+ ref->ct = tmp->ct;
+ handle->ref = tmp;
+ }
+
+ return true;
+}
+
+static inline void put_page_tag_ref(union pgtag_ref_handle handle)
+{
+ if (WARN_ON(!handle.ref))
return;
- page_ext_put(page_ext_from_codetag_ref(ref));
+ if (!static_key_enabled(&mem_profiling_compressed))
+ page_ext_put((void *)handle.ref - page_alloc_tagging_ops.offset);
+}
+
+static inline void update_page_tag_ref(union pgtag_ref_handle handle, union codetag_ref *ref)
+{
+ if (static_key_enabled(&mem_profiling_compressed)) {
+ struct page *page = handle.page;
+ unsigned long old_flags;
+ unsigned long flags;
+ unsigned long idx;
+
+ if (WARN_ON(!page || !ref))
+ return;
+
+ idx = (unsigned long)ref_to_idx(ref);
+ idx = (idx & alloc_tag_ref_mask) << alloc_tag_ref_offs;
+ do {
+ old_flags = READ_ONCE(page->flags);
+ flags = old_flags;
+ flags &= ~(alloc_tag_ref_mask << alloc_tag_ref_offs);
+ flags |= idx;
+ } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags)));
+ } else {
+ if (WARN_ON(!handle.ref || !ref))
+ return;
+
+ handle.ref->ct = ref->ct;
+ }
}
static inline void clear_page_tag_ref(struct page *page)
{
if (mem_alloc_profiling_enabled()) {
- union codetag_ref *ref = get_page_tag_ref(page);
+ union pgtag_ref_handle handle;
+ union codetag_ref ref;
- if (ref) {
- set_codetag_empty(ref);
- put_page_tag_ref(ref);
+ if (get_page_tag_ref(page, &ref, &handle)) {
+ set_codetag_empty(&ref);
+ update_page_tag_ref(handle, &ref);
+ put_page_tag_ref(handle);
}
}
}
@@ -59,11 +180,13 @@ static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
unsigned int nr)
{
if (mem_alloc_profiling_enabled()) {
- union codetag_ref *ref = get_page_tag_ref(page);
+ union pgtag_ref_handle handle;
+ union codetag_ref ref;
- if (ref) {
- alloc_tag_add(ref, task->alloc_tag, PAGE_SIZE * nr);
- put_page_tag_ref(ref);
+ if (get_page_tag_ref(page, &ref, &handle)) {
+ alloc_tag_add(&ref, task->alloc_tag, PAGE_SIZE * nr);
+ update_page_tag_ref(handle, &ref);
+ put_page_tag_ref(handle);
}
}
}
@@ -71,11 +194,13 @@ static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
static inline void pgalloc_tag_sub(struct page *page, unsigned int nr)
{
if (mem_alloc_profiling_enabled()) {
- union codetag_ref *ref = get_page_tag_ref(page);
+ union pgtag_ref_handle handle;
+ union codetag_ref ref;
- if (ref) {
- alloc_tag_sub(ref, PAGE_SIZE * nr);
- put_page_tag_ref(ref);
+ if (get_page_tag_ref(page, &ref, &handle)) {
+ alloc_tag_sub(&ref, PAGE_SIZE * nr);
+ update_page_tag_ref(handle, &ref);
+ put_page_tag_ref(handle);
}
}
}
@@ -85,13 +210,14 @@ static inline struct alloc_tag *pgalloc_tag_get(struct page *page)
struct alloc_tag *tag = NULL;
if (mem_alloc_profiling_enabled()) {
- union codetag_ref *ref = get_page_tag_ref(page);
-
- alloc_tag_sub_check(ref);
- if (ref) {
- if (ref->ct)
- tag = ct_to_alloc_tag(ref->ct);
- put_page_tag_ref(ref);
+ union pgtag_ref_handle handle;
+ union codetag_ref ref;
+
+ if (get_page_tag_ref(page, &ref, &handle)) {
+ alloc_tag_sub_check(&ref);
+ if (ref.ct)
+ tag = ct_to_alloc_tag(ref.ct);
+ put_page_tag_ref(handle);
}
}
@@ -104,16 +230,22 @@ static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr)
this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr);
}
+void pgalloc_tag_split(struct folio *folio, int old_order, int new_order);
+void pgalloc_tag_copy(struct folio *new, struct folio *old);
+
+void __init alloc_tag_sec_init(void);
+
#else /* CONFIG_MEM_ALLOC_PROFILING */
-static inline union codetag_ref *get_page_tag_ref(struct page *page) { return NULL; }
-static inline void put_page_tag_ref(union codetag_ref *ref) {}
static inline void clear_page_tag_ref(struct page *page) {}
static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
unsigned int nr) {}
static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {}
static inline struct alloc_tag *pgalloc_tag_get(struct page *page) { return NULL; }
static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {}
+static inline void alloc_tag_sec_init(void) {}
+static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) {}
+static inline void pgalloc_tag_copy(struct folio *new, struct folio *old) {}
#endif /* CONFIG_MEM_ALLOC_PROFILING */
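
For reference, a minimal sketch of the handle-based access pattern introduced above; the function name and retagging use case are illustrative, not part of this patch. The same pattern covers both the compressed page-flag representation and the page_ext one:

/* Illustrative only -- not part of this patch. */
static inline void example_retag_page(struct page *page, struct alloc_tag *tag)
{
	union pgtag_ref_handle handle;
	union codetag_ref ref;

	if (!mem_alloc_profiling_enabled())
		return;

	if (get_page_tag_ref(page, &ref, &handle)) {
		alloc_tag_ref_set(&ref, tag);		/* point the ref at @tag */
		update_page_tag_ref(handle, &ref);	/* write it back */
		put_page_tag_ref(handle);
	}
}
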
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index e8b2ac6bd2ae..adef9d6e9b1b 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -90,6 +90,27 @@ static inline unsigned long pud_index(unsigned long address)
#define pgd_index(a) (((a) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
#endif
+#ifndef kernel_pte_init
+static inline void kernel_pte_init(void *addr)
+{
+}
+#define kernel_pte_init kernel_pte_init
+#endif
+
+#ifndef pmd_init
+static inline void pmd_init(void *addr)
+{
+}
+#define pmd_init pmd_init
+#endif
+
+#ifndef pud_init
+static inline void pud_init(void *addr)
+{
+}
+#define pud_init pud_init
+#endif
+
#ifndef pte_offset_kernel
static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)
{
@@ -1056,44 +1077,6 @@ static inline int pgd_same(pgd_t pgd_a, pgd_t pgd_b)
}
#endif
-/*
- * Use set_p*_safe(), and elide TLB flushing, when confident that *no*
- * TLB flush will be required as a result of the "set". For example, use
- * in scenarios where it is known ahead of time that the routine is
- * setting non-present entries, or re-setting an existing entry to the
- * same value. Otherwise, use the typical "set" helpers and flush the
- * TLB.
- */
-#define set_pte_safe(ptep, pte) \
-({ \
- WARN_ON_ONCE(pte_present(*ptep) && !pte_same(*ptep, pte)); \
- set_pte(ptep, pte); \
-})
-
-#define set_pmd_safe(pmdp, pmd) \
-({ \
- WARN_ON_ONCE(pmd_present(*pmdp) && !pmd_same(*pmdp, pmd)); \
- set_pmd(pmdp, pmd); \
-})
-
-#define set_pud_safe(pudp, pud) \
-({ \
- WARN_ON_ONCE(pud_present(*pudp) && !pud_same(*pudp, pud)); \
- set_pud(pudp, pud); \
-})
-
-#define set_p4d_safe(p4dp, p4d) \
-({ \
- WARN_ON_ONCE(p4d_present(*p4dp) && !p4d_same(*p4dp, p4d)); \
- set_p4d(p4dp, p4d); \
-})
-
-#define set_pgd_safe(pgdp, pgd) \
-({ \
- WARN_ON_ONCE(pgd_present(*pgdp) && !pgd_same(*pgdp, pgd)); \
- set_pgd(pgdp, pgd); \
-})
-
#ifndef __HAVE_ARCH_DO_SWAP_PAGE
static inline void arch_do_swap_page_nr(struct mm_struct *mm,
struct vm_area_struct *vma,
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index d5e93e44322e..683a04088f3f 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -171,7 +171,7 @@ static inline void anon_vma_merge(struct vm_area_struct *vma,
unlink_anon_vmas(next);
}
-struct anon_vma *folio_get_anon_vma(struct folio *folio);
+struct anon_vma *folio_get_anon_vma(const struct folio *folio);
/* RMAP flags, currently only relevant for some anon rmap operations. */
typedef int __bitwise rmap_t;
@@ -194,8 +194,8 @@ enum rmap_level {
RMAP_LEVEL_PMD,
};
-static inline void __folio_rmap_sanity_checks(struct folio *folio,
- struct page *page, int nr_pages, enum rmap_level level)
+static inline void __folio_rmap_sanity_checks(const struct folio *folio,
+ const struct page *page, int nr_pages, enum rmap_level level)
{
/* hugetlb folios are handled separately. */
VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
@@ -728,11 +728,8 @@ page_vma_mapped_walk_restart(struct page_vma_mapped_walk *pvmw)
}
bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw);
-
-/*
- * Used by swapoff to help locate where page is expected in vma.
- */
-unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);
+unsigned long page_address_in_vma(const struct folio *folio,
+ const struct page *, const struct vm_area_struct *);
/*
* Cleans the PTEs of shared mappings.
@@ -774,14 +771,14 @@ struct rmap_walk_control {
bool (*rmap_one)(struct folio *folio, struct vm_area_struct *vma,
unsigned long addr, void *arg);
int (*done)(struct folio *folio);
- struct anon_vma *(*anon_lock)(struct folio *folio,
+ struct anon_vma *(*anon_lock)(const struct folio *folio,
struct rmap_walk_control *rwc);
bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
};
void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc);
void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc);
-struct anon_vma *folio_lock_anon_vma_read(struct folio *folio,
+struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio,
struct rmap_walk_control *rwc);
#else /* !CONFIG_MMU */
diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index e62ff805cfc9..6eb65ceed213 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -8,12 +8,6 @@
#define SUID_DUMP_USER 1 /* Dump as user of process */
#define SUID_DUMP_ROOT 2 /* Dump as root */
-/* mm flags */
-
-/* for SUID_DUMP_* above */
-#define MMF_DUMPABLE_BITS 2
-#define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1)
-
extern void set_dumpable(struct mm_struct *mm, int value);
/*
* This returns the actual value of the suid_dumpable flag. For things
@@ -31,80 +25,4 @@ static inline int get_dumpable(struct mm_struct *mm)
return __get_dumpable(mm->flags);
}
-/* coredump filter bits */
-#define MMF_DUMP_ANON_PRIVATE 2
-#define MMF_DUMP_ANON_SHARED 3
-#define MMF_DUMP_MAPPED_PRIVATE 4
-#define MMF_DUMP_MAPPED_SHARED 5
-#define MMF_DUMP_ELF_HEADERS 6
-#define MMF_DUMP_HUGETLB_PRIVATE 7
-#define MMF_DUMP_HUGETLB_SHARED 8
-#define MMF_DUMP_DAX_PRIVATE 9
-#define MMF_DUMP_DAX_SHARED 10
-
-#define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS
-#define MMF_DUMP_FILTER_BITS 9
-#define MMF_DUMP_FILTER_MASK \
- (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT)
-#define MMF_DUMP_FILTER_DEFAULT \
- ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED) |\
- (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF)
-
-#ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS
-# define MMF_DUMP_MASK_DEFAULT_ELF (1 << MMF_DUMP_ELF_HEADERS)
-#else
-# define MMF_DUMP_MASK_DEFAULT_ELF 0
-#endif
- /* leave room for more dump flags */
-#define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */
-#define MMF_VM_HUGEPAGE 17 /* set when mm is available for
- khugepaged */
-/*
- * This one-shot flag is dropped due to necessity of changing exe once again
- * on NFS restore
- */
-//#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */
-
-#define MMF_HAS_UPROBES 19 /* has uprobes */
-#define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */
-#define MMF_OOM_SKIP 21 /* mm is of no interest for the OOM killer */
-#define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */
-#define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */
-#define MMF_DISABLE_THP 24 /* disable THP for all VMAs */
-#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP)
-#define MMF_OOM_REAP_QUEUED 25 /* mm was queued for oom_reaper */
-#define MMF_MULTIPROCESS 26 /* mm is shared between processes */
-/*
- * MMF_HAS_PINNED: Whether this mm has pinned any pages. This can be either
- * replaced in the future by mm.pinned_vm when it becomes stable, or grow into
- * a counter on its own. We're aggresive on this bit for now: even if the
- * pinned pages were unpinned later on, we'll still keep this bit set for the
- * lifecycle of this mm, just for simplicity.
- */
-#define MMF_HAS_PINNED 27 /* FOLL_PIN has run, never cleared */
-
-#define MMF_HAS_MDWE 28
-#define MMF_HAS_MDWE_MASK (1 << MMF_HAS_MDWE)
-
-
-#define MMF_HAS_MDWE_NO_INHERIT 29
-
-#define MMF_VM_MERGE_ANY 30
-#define MMF_VM_MERGE_ANY_MASK (1 << MMF_VM_MERGE_ANY)
-
-#define MMF_TOPDOWN 31 /* mm searches top down by default */
-#define MMF_TOPDOWN_MASK (1 << MMF_TOPDOWN)
-
-#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
- MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\
- MMF_VM_MERGE_ANY_MASK | MMF_TOPDOWN_MASK)
-
-static inline unsigned long mmf_init_flags(unsigned long flags)
-{
- if (flags & (1UL << MMF_HAS_MDWE_NO_INHERIT))
- flags &= ~((1UL << MMF_HAS_MDWE) |
- (1UL << MMF_HAS_MDWE_NO_INHERIT));
- return flags & MMF_INIT_MASK;
-}
-
#endif /* _LINUX_SCHED_COREDUMP_H */
diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h
index e7aec20fb44f..3030d9245f5a 100644
--- a/include/linux/set_memory.h
+++ b/include/linux/set_memory.h
@@ -34,6 +34,12 @@ static inline int set_direct_map_default_noflush(struct page *page)
return 0;
}
+static inline int set_direct_map_valid_noflush(struct page *page,
+ unsigned nr, bool valid)
+{
+ return 0;
+}
+
static inline bool kernel_page_present(struct page *page)
{
return true;
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 018da28c01e7..0b273a7b9f01 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -114,6 +114,7 @@ int shmem_unuse(unsigned int type);
unsigned long shmem_allowable_huge_orders(struct inode *inode,
struct vm_area_struct *vma, pgoff_t index,
loff_t write_end, bool shmem_huge_force);
+bool shmem_hpage_pmd_enabled(void);
#else
static inline unsigned long shmem_allowable_huge_orders(struct inode *inode,
struct vm_area_struct *vma, pgoff_t index,
@@ -121,6 +122,11 @@ static inline unsigned long shmem_allowable_huge_orders(struct inode *inode,
{
return 0;
}
+
+static inline bool shmem_hpage_pmd_enabled(void)
+{
+ return false;
+}
#endif
#ifdef CONFIG_SHMEM
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index cb468e418ea1..96f26e29fefe 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -426,9 +426,19 @@ typedef unsigned long pte_marker;
* "Poisoned" here is meant in the very general sense of "future accesses are
* invalid", instead of referring very specifically to hardware memory errors.
* This marker is meant to represent any of various different causes of this.
+ *
+ * Note that, when encountered by the faulting logic, PTEs with this marker will
+ * result in VM_FAULT_HWPOISON and thus trigger the hardware memory error
+ * handling logic regardless of the underlying cause.

*/
#define PTE_MARKER_POISONED BIT(1)
-#define PTE_MARKER_MASK (BIT(2) - 1)
+/*
+ * Indicates that, on fault, this PTE will cause a SIGSEGV signal to be
+ * sent. This means guard markers behave in effect as if the region were mapped
+ * PROT_NONE, rather than as if it were a memory hole or equivalent.
+ */
+#define PTE_MARKER_GUARD BIT(2)
+#define PTE_MARKER_MASK (BIT(3) - 1)
static inline swp_entry_t make_pte_marker_entry(pte_marker marker)
{
@@ -464,6 +474,18 @@ static inline int is_poisoned_swp_entry(swp_entry_t entry)
{
return is_pte_marker_entry(entry) &&
(pte_marker_get(entry) & PTE_MARKER_POISONED);
+
+}
+
+static inline swp_entry_t make_guard_swp_entry(void)
+{
+ return make_pte_marker_entry(PTE_MARKER_GUARD);
+}
+
+static inline int is_guard_swp_entry(swp_entry_t entry)
+{
+ return is_pte_marker_entry(entry) &&
+ (pte_marker_get(entry) & PTE_MARKER_GUARD);
}
/*
diff --git a/include/linux/text-patching.h b/include/linux/text-patching.h
new file mode 100644
index 000000000000..ad5877ab0855
--- /dev/null
+++ b/include/linux/text-patching.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_TEXT_PATCHING_H
+#define _LINUX_TEXT_PATCHING_H
+
+#include <asm/text-patching.h>
+
+#ifndef text_poke_copy
+static inline void *text_poke_copy(void *dst, const void *src, size_t len)
+{
+ return memcpy(dst, src, len);
+}
+#define text_poke_copy text_poke_copy
+#endif
+
+#endif /* _LINUX_TEXT_PATCHING_H */
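
A brief sketch of a generic caller of the new wrapper (illustrative; the helper name is an assumption). Architectures providing their own text_poke_copy() override the memcpy fallback above:

/* Illustrative only -- not part of this patch. */
#include <linux/text-patching.h>

static void example_copy_insns(void *dst, const void *insns, size_t len)
{
	/* Falls back to a plain memcpy() where no arch implementation exists. */
	text_poke_copy(dst, insns, len);
}
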
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index ad2ce7a6ab7a..31e9ffd936e3 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -134,12 +134,6 @@ extern void vm_unmap_ram(const void *mem, unsigned int count);
extern void *vm_map_ram(struct page **pages, unsigned int count, int node);
extern void vm_unmap_aliases(void);
-#ifdef CONFIG_MMU
-extern unsigned long vmalloc_nr_pages(void);
-#else
-static inline unsigned long vmalloc_nr_pages(void) { return 0; }
-#endif
-
extern void *vmalloc_noprof(unsigned long size) __alloc_size(1);
#define vmalloc(...) alloc_hooks(vmalloc_noprof(__VA_ARGS__))
@@ -208,6 +202,9 @@ extern int remap_vmalloc_range_partial(struct vm_area_struct *vma,
extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
unsigned long pgoff);
+int vmap_pages_range(unsigned long addr, unsigned long end, pgprot_t prot,
+ struct page **pages, unsigned int page_shift);
+
/*
* Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values
* and let generic vmalloc and ioremap code know when arch_sync_kernel_mappings()
@@ -266,12 +263,29 @@ static inline bool is_vm_area_hugepages(const void *addr)
#endif
}
+/* for /proc/kcore */
+long vread_iter(struct iov_iter *iter, const char *addr, size_t count);
+
+/*
+ * Internals. Don't use..
+ */
+__init void vm_area_add_early(struct vm_struct *vm);
+__init void vm_area_register_early(struct vm_struct *vm, size_t align);
+
+int register_vmap_purge_notifier(struct notifier_block *nb);
+int unregister_vmap_purge_notifier(struct notifier_block *nb);
+
#ifdef CONFIG_MMU
+#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START)
+
+unsigned long vmalloc_nr_pages(void);
+
int vm_area_map_pages(struct vm_struct *area, unsigned long start,
unsigned long end, struct page **pages);
void vm_area_unmap_pages(struct vm_struct *area, unsigned long start,
unsigned long end);
void vunmap_range(unsigned long addr, unsigned long end);
+
static inline void set_vm_flush_reset_perms(void *addr)
{
struct vm_struct *vm = find_vm_area(addr);
@@ -279,24 +293,14 @@ static inline void set_vm_flush_reset_perms(void *addr)
if (vm)
vm->flags |= VM_FLUSH_RESET_PERMS;
}
+#else /* !CONFIG_MMU */
+#define VMALLOC_TOTAL 0UL
-#else
-static inline void set_vm_flush_reset_perms(void *addr)
-{
-}
-#endif
-
-/* for /proc/kcore */
-extern long vread_iter(struct iov_iter *iter, const char *addr, size_t count);
-
-/*
- * Internals. Don't use..
- */
-extern __init void vm_area_add_early(struct vm_struct *vm);
-extern __init void vm_area_register_early(struct vm_struct *vm, size_t align);
+static inline unsigned long vmalloc_nr_pages(void) { return 0; }
+static inline void set_vm_flush_reset_perms(void *addr) {}
+#endif /* CONFIG_MMU */
-#ifdef CONFIG_SMP
-# ifdef CONFIG_MMU
+#if defined(CONFIG_MMU) && defined(CONFIG_SMP)
struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
const size_t *sizes, int nr_vms,
size_t align);
@@ -311,22 +315,9 @@ pcpu_get_vm_areas(const unsigned long *offsets,
return NULL;
}
-static inline void
-pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
-{
-}
-# endif
-#endif
-
-#ifdef CONFIG_MMU
-#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START)
-#else
-#define VMALLOC_TOTAL 0UL
+static inline void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) {}
#endif
-int register_vmap_purge_notifier(struct notifier_block *nb);
-int unregister_vmap_purge_notifier(struct notifier_block *nb);
-
#if defined(CONFIG_MMU) && defined(CONFIG_PRINTK)
bool vmalloc_dump_obj(void *object);
#else
diff --git a/include/linux/zswap.h b/include/linux/zswap.h
index 9cd1beef0654..d961ead91bf1 100644
--- a/include/linux/zswap.h
+++ b/include/linux/zswap.h
@@ -7,7 +7,7 @@
struct lruvec;
-extern atomic_t zswap_stored_pages;
+extern atomic_long_t zswap_stored_pages;
#ifdef CONFIG_ZSWAP
diff --git a/include/trace/events/memcg.h b/include/trace/events/memcg.h
new file mode 100644
index 000000000000..dfe2f51019b4
--- /dev/null
+++ b/include/trace/events/memcg.h
@@ -0,0 +1,106 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM memcg
+
+#if !defined(_TRACE_MEMCG_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_MEMCG_H
+
+#include <linux/memcontrol.h>
+#include <linux/tracepoint.h>
+
+
+DECLARE_EVENT_CLASS(memcg_rstat_stats,
+
+ TP_PROTO(struct mem_cgroup *memcg, int item, int val),
+
+ TP_ARGS(memcg, item, val),
+
+ TP_STRUCT__entry(
+ __field(u64, id)
+ __field(int, item)
+ __field(int, val)
+ ),
+
+ TP_fast_assign(
+ __entry->id = cgroup_id(memcg->css.cgroup);
+ __entry->item = item;
+ __entry->val = val;
+ ),
+
+ TP_printk("memcg_id=%llu item=%d val=%d",
+ __entry->id, __entry->item, __entry->val)
+);
+
+DEFINE_EVENT(memcg_rstat_stats, mod_memcg_state,
+
+ TP_PROTO(struct mem_cgroup *memcg, int item, int val),
+
+ TP_ARGS(memcg, item, val)
+);
+
+DEFINE_EVENT(memcg_rstat_stats, mod_memcg_lruvec_state,
+
+ TP_PROTO(struct mem_cgroup *memcg, int item, int val),
+
+ TP_ARGS(memcg, item, val)
+);
+
+DECLARE_EVENT_CLASS(memcg_rstat_events,
+
+ TP_PROTO(struct mem_cgroup *memcg, int item, unsigned long val),
+
+ TP_ARGS(memcg, item, val),
+
+ TP_STRUCT__entry(
+ __field(u64, id)
+ __field(int, item)
+ __field(unsigned long, val)
+ ),
+
+ TP_fast_assign(
+ __entry->id = cgroup_id(memcg->css.cgroup);
+ __entry->item = item;
+ __entry->val = val;
+ ),
+
+ TP_printk("memcg_id=%llu item=%d val=%lu",
+ __entry->id, __entry->item, __entry->val)
+);
+
+DEFINE_EVENT(memcg_rstat_events, count_memcg_events,
+
+ TP_PROTO(struct mem_cgroup *memcg, int item, unsigned long val),
+
+ TP_ARGS(memcg, item, val)
+);
+
+TRACE_EVENT(memcg_flush_stats,
+
+ TP_PROTO(struct mem_cgroup *memcg, s64 stats_updates,
+ bool force, bool needs_flush),
+
+ TP_ARGS(memcg, stats_updates, force, needs_flush),
+
+ TP_STRUCT__entry(
+ __field(u64, id)
+ __field(s64, stats_updates)
+ __field(bool, force)
+ __field(bool, needs_flush)
+ ),
+
+ TP_fast_assign(
+ __entry->id = cgroup_id(memcg->css.cgroup);
+ __entry->stats_updates = stats_updates;
+ __entry->force = force;
+ __entry->needs_flush = needs_flush;
+ ),
+
+ TP_printk("memcg_id=%llu stats_updates=%lld force=%d needs_flush=%d",
+ __entry->id, __entry->stats_updates,
+ __entry->force, __entry->needs_flush)
+);
+
+#endif /* _TRACE_MEMCG_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
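
The event classes above generate trace_mod_memcg_state(), trace_mod_memcg_lruvec_state(), trace_count_memcg_events() and trace_memcg_flush_stats(); a hedged sketch of a call site follows (illustrative -- real call sites live in mm/memcontrol.c, which includes this header with CREATE_TRACE_POINTS defined):

/* Illustrative only -- not part of this patch. */
static inline void example_account(struct mem_cgroup *memcg, int item, int val)
{
	trace_mod_memcg_state(memcg, item, val);
}
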
diff --git a/include/trace/events/mmap_lock.h b/include/trace/events/mmap_lock.h
index f2827f98a44f..bc2e3ad787b3 100644
--- a/include/trace/events/mmap_lock.h
+++ b/include/trace/events/mmap_lock.h
@@ -10,9 +10,6 @@
struct mm_struct;
-extern int trace_mmap_lock_reg(void);
-extern void trace_mmap_lock_unreg(void);
-
DECLARE_EVENT_CLASS(mmap_lock,
TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write),
@@ -40,16 +37,15 @@ DECLARE_EVENT_CLASS(mmap_lock,
);
#define DEFINE_MMAP_LOCK_EVENT(name) \
- DEFINE_EVENT_FN(mmap_lock, name, \
+ DEFINE_EVENT(mmap_lock, name, \
TP_PROTO(struct mm_struct *mm, const char *memcg_path, \
bool write), \
- TP_ARGS(mm, memcg_path, write), \
- trace_mmap_lock_reg, trace_mmap_lock_unreg)
+ TP_ARGS(mm, memcg_path, write))
DEFINE_MMAP_LOCK_EVENT(mmap_lock_start_locking);
DEFINE_MMAP_LOCK_EVENT(mmap_lock_released);
-TRACE_EVENT_FN(mmap_lock_acquire_returned,
+TRACE_EVENT(mmap_lock_acquire_returned,
TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write,
bool success),
@@ -76,9 +72,7 @@ TRACE_EVENT_FN(mmap_lock_acquire_returned,
__get_str(memcg_path),
__entry->write ? "true" : "false",
__entry->success ? "true" : "false"
- ),
-
- trace_mmap_lock_reg, trace_mmap_lock_unreg
+ )
);
#endif /* _TRACE_MMAP_LOCK_H */
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index 1a488c30afa5..490958fa10de 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -346,6 +346,51 @@ TRACE_EVENT(mm_vmscan_write_folio,
show_reclaim_flags(__entry->reclaim_flags))
);
+TRACE_EVENT(mm_vmscan_reclaim_pages,
+
+ TP_PROTO(int nid,
+ unsigned long nr_scanned, unsigned long nr_reclaimed,
+ struct reclaim_stat *stat),
+
+ TP_ARGS(nid, nr_scanned, nr_reclaimed, stat),
+
+ TP_STRUCT__entry(
+ __field(int, nid)
+ __field(unsigned long, nr_scanned)
+ __field(unsigned long, nr_reclaimed)
+ __field(unsigned long, nr_dirty)
+ __field(unsigned long, nr_writeback)
+ __field(unsigned long, nr_congested)
+ __field(unsigned long, nr_immediate)
+ __field(unsigned int, nr_activate0)
+ __field(unsigned int, nr_activate1)
+ __field(unsigned long, nr_ref_keep)
+ __field(unsigned long, nr_unmap_fail)
+ ),
+
+ TP_fast_assign(
+ __entry->nid = nid;
+ __entry->nr_scanned = nr_scanned;
+ __entry->nr_reclaimed = nr_reclaimed;
+ __entry->nr_dirty = stat->nr_dirty;
+ __entry->nr_writeback = stat->nr_writeback;
+ __entry->nr_congested = stat->nr_congested;
+ __entry->nr_immediate = stat->nr_immediate;
+ __entry->nr_activate0 = stat->nr_activate[0];
+ __entry->nr_activate1 = stat->nr_activate[1];
+ __entry->nr_ref_keep = stat->nr_ref_keep;
+ __entry->nr_unmap_fail = stat->nr_unmap_fail;
+ ),
+
+ TP_printk("nid=%d nr_scanned=%ld nr_reclaimed=%ld nr_dirty=%ld nr_writeback=%ld nr_congested=%ld nr_immediate=%ld nr_activate_anon=%d nr_activate_file=%d nr_ref_keep=%ld nr_unmap_fail=%ld",
+ __entry->nid,
+ __entry->nr_scanned, __entry->nr_reclaimed,
+ __entry->nr_dirty, __entry->nr_writeback,
+ __entry->nr_congested, __entry->nr_immediate,
+ __entry->nr_activate0, __entry->nr_activate1,
+ __entry->nr_ref_keep, __entry->nr_unmap_fail)
+);
+
TRACE_EVENT(mm_vmscan_lru_shrink_inactive,
TP_PROTO(int nid,
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index 6ce1f1ceb432..1ea2c4c33b86 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -79,6 +79,9 @@
#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */
+#define MADV_GUARD_INSTALL 102 /* fatal signal on access to range */
+#define MADV_GUARD_REMOVE 103 /* unguard range */
+
/* compatibility flags */
#define MAP_FILE 0
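
A hedged userland sketch of the new guard operations (illustrative; assumes headers exporting the new MADV_* values, otherwise define 102/103 locally). Accessing a guarded page raises SIGSEGV, much like a PROT_NONE mapping, until MADV_GUARD_REMOVE is applied:

/* Illustrative only -- not part of this patch. */
#include <sys/mman.h>
#include <stddef.h>

int guard_example(void)
{
	size_t len = 4096;
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return -1;
	if (madvise(p, len, MADV_GUARD_INSTALL))	/* guard the range */
		return -1;
	/* Touching *p here would now deliver SIGSEGV. */
	return madvise(p, len, MADV_GUARD_REMOVE);	/* make it usable again */
}
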
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index a76ddc5fc982..fa04b14a7d72 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -15,7 +15,6 @@
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
-#include <linux/sched/coredump.h>
#include <linux/export.h>
#include <linux/rmap.h> /* anon_vma_prepare */
#include <linux/mmu_notifier.h>
diff --git a/kernel/fork.c b/kernel/fork.c
index e58d27c05788..f253e81d0c28 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -16,7 +16,6 @@
#include <linux/slab.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/mm.h>
-#include <linux/sched/coredump.h>
#include <linux/sched/user.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/stat.h>
@@ -1546,8 +1545,9 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
return ERR_PTR(err);
mm = get_task_mm(task);
- if (mm && mm != current->mm &&
- !ptrace_may_access(task, mode)) {
+ if (!mm) {
+ mm = ERR_PTR(-ESRCH);
+ } else if (mm != current->mm && !ptrace_may_access(task, mode)) {
mmput(mm);
mm = ERR_PTR(-EACCES);
}
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index 326bfe6549d7..6de57246760e 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -399,7 +399,7 @@ again:
key->both.offset |= FUT_OFF_INODE; /* inode-based key */
key->shared.i_seq = get_inode_sequence_number(inode);
- key->shared.pgoff = folio->index + folio_page_idx(folio, page);
+ key->shared.pgoff = page_pgoff(folio, page);
rcu_read_unlock();
}
diff --git a/kernel/module/debug_kmemleak.c b/kernel/module/debug_kmemleak.c
index b4cc03842d70..df873dad049d 100644
--- a/kernel/module/debug_kmemleak.c
+++ b/kernel/module/debug_kmemleak.c
@@ -14,7 +14,8 @@ void kmemleak_load_module(const struct module *mod,
{
/* only scan writable, non-executable sections */
for_each_mod_mem_type(type) {
- if (type != MOD_DATA && type != MOD_INIT_DATA)
+ if (type != MOD_DATA && type != MOD_INIT_DATA &&
+ !mod->mem[type].is_rox)
kmemleak_no_scan(mod->mem[type].base);
}
}
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 4490924fe24e..d2e1b8976c7b 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -1189,6 +1189,18 @@ void __weak module_arch_freeing_init(struct module *mod)
{
}
+void *__module_writable_address(struct module *mod, void *loc)
+{
+ for_class_mod_mem_type(type, text) {
+ struct module_memory *mem = &mod->mem[type];
+
+ if (loc >= mem->base && loc < mem->base + mem->size)
+ return loc + (mem->rw_copy - mem->base);
+ }
+
+ return loc;
+}
+
static int module_memory_alloc(struct module *mod, enum mod_mem_type type)
{
unsigned int size = PAGE_ALIGN(mod->mem[type].size);
@@ -1206,6 +1218,23 @@ static int module_memory_alloc(struct module *mod, enum mod_mem_type type)
if (!ptr)
return -ENOMEM;
+ mod->mem[type].base = ptr;
+
+ if (execmem_is_rox(execmem_type)) {
+ ptr = vzalloc(size);
+
+ if (!ptr) {
+ execmem_free(mod->mem[type].base);
+ return -ENOMEM;
+ }
+
+ mod->mem[type].rw_copy = ptr;
+ mod->mem[type].is_rox = true;
+ } else {
+ mod->mem[type].rw_copy = mod->mem[type].base;
+ memset(mod->mem[type].base, 0, size);
+ }
+
/*
* The pointer to these blocks of memory are stored on the module
* structure and we keep that around so long as the module is
@@ -1219,24 +1248,20 @@ static int module_memory_alloc(struct module *mod, enum mod_mem_type type)
*/
kmemleak_not_leak(ptr);
- memset(ptr, 0, size);
- mod->mem[type].base = ptr;
-
return 0;
}
-static void module_memory_free(struct module *mod, enum mod_mem_type type,
- bool unload_codetags)
+static void module_memory_free(struct module *mod, enum mod_mem_type type)
{
- void *ptr = mod->mem[type].base;
+ struct module_memory *mem = &mod->mem[type];
- if (!unload_codetags && mod_mem_type_is_core_data(type))
- return;
+ if (mem->is_rox)
+ vfree(mem->rw_copy);
- execmem_free(ptr);
+ execmem_free(mem->base);
}
-static void free_mod_mem(struct module *mod, bool unload_codetags)
+static void free_mod_mem(struct module *mod)
{
for_each_mod_mem_type(type) {
struct module_memory *mod_mem = &mod->mem[type];
@@ -1247,25 +1272,20 @@ static void free_mod_mem(struct module *mod, bool unload_codetags)
/* Free lock-classes; relies on the preceding sync_rcu(). */
lockdep_free_key_range(mod_mem->base, mod_mem->size);
if (mod_mem->size)
- module_memory_free(mod, type, unload_codetags);
+ module_memory_free(mod, type);
}
/* MOD_DATA hosts mod, so free it at last */
lockdep_free_key_range(mod->mem[MOD_DATA].base, mod->mem[MOD_DATA].size);
- module_memory_free(mod, MOD_DATA, unload_codetags);
+ module_memory_free(mod, MOD_DATA);
}
/* Free a module, remove from lists, etc. */
static void free_module(struct module *mod)
{
- bool unload_codetags;
-
trace_module_free(mod);
- unload_codetags = codetag_unload_module(mod);
- if (!unload_codetags)
- pr_warn("%s: memory allocation(s) from the module still alive, cannot unload cleanly\n",
- mod->name);
+ codetag_unload_module(mod);
mod_sysfs_teardown(mod);
@@ -1308,7 +1328,7 @@ static void free_module(struct module *mod)
kfree(mod->args);
percpu_modfree(mod);
- free_mod_mem(mod, unload_codetags);
+ free_mod_mem(mod);
}
void *__symbol_get(const char *symbol)
@@ -1573,6 +1593,20 @@ static void __layout_sections(struct module *mod, struct load_info *info, bool i
if (WARN_ON_ONCE(type == MOD_INVALID))
continue;
+ /*
+ * Do not allocate codetag memory as we load it into
+ * preallocated contiguous memory.
+ */
+ if (codetag_needs_module_section(mod, sname, s->sh_size)) {
+ /*
+ * s->sh_entsize won't be used but populate the
+ * type field to avoid confusion.
+ */
+ s->sh_entsize = ((unsigned long)(type) & SH_ENTSIZE_TYPE_MASK)
+ << SH_ENTSIZE_TYPE_SHIFT;
+ continue;
+ }
+
s->sh_entsize = module_get_offset_and_type(mod, type, s, i);
pr_debug("\t%s\n", sname);
}
@@ -2247,17 +2281,19 @@ static int move_module(struct module *mod, struct load_info *info)
int i;
enum mod_mem_type t = 0;
int ret = -ENOMEM;
+ bool codetag_section_found = false;
for_each_mod_mem_type(type) {
if (!mod->mem[type].size) {
mod->mem[type].base = NULL;
+ mod->mem[type].rw_copy = NULL;
continue;
}
ret = module_memory_alloc(mod, type);
if (ret) {
t = type;
- goto out_enomem;
+ goto out_err;
}
}
@@ -2266,12 +2302,37 @@ static int move_module(struct module *mod, struct load_info *info)
for (i = 0; i < info->hdr->e_shnum; i++) {
void *dest;
Elf_Shdr *shdr = &info->sechdrs[i];
- enum mod_mem_type type = shdr->sh_entsize >> SH_ENTSIZE_TYPE_SHIFT;
+ const char *sname;
+ unsigned long addr;
if (!(shdr->sh_flags & SHF_ALLOC))
continue;
- dest = mod->mem[type].base + (shdr->sh_entsize & SH_ENTSIZE_OFFSET_MASK);
+ sname = info->secstrings + shdr->sh_name;
+ /*
+ * Load codetag sections separately as they might still be used
+ * after module unload.
+ */
+ if (codetag_needs_module_section(mod, sname, shdr->sh_size)) {
+ dest = codetag_alloc_module_section(mod, sname, shdr->sh_size,
+ arch_mod_section_prepend(mod, i), shdr->sh_addralign);
+ if (WARN_ON(!dest)) {
+ ret = -EINVAL;
+ goto out_err;
+ }
+ if (IS_ERR(dest)) {
+ ret = PTR_ERR(dest);
+ goto out_err;
+ }
+ addr = (unsigned long)dest;
+ codetag_section_found = true;
+ } else {
+ enum mod_mem_type type = shdr->sh_entsize >> SH_ENTSIZE_TYPE_SHIFT;
+ unsigned long offset = shdr->sh_entsize & SH_ENTSIZE_OFFSET_MASK;
+
+ addr = (unsigned long)mod->mem[type].base + offset;
+ dest = mod->mem[type].rw_copy + offset;
+ }
if (shdr->sh_type != SHT_NOBITS) {
/*
@@ -2283,7 +2344,7 @@ static int move_module(struct module *mod, struct load_info *info)
if (i == info->index.mod &&
(WARN_ON_ONCE(shdr->sh_size != sizeof(struct module)))) {
ret = -ENOEXEC;
- goto out_enomem;
+ goto out_err;
}
memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
}
@@ -2293,15 +2354,18 @@ static int move_module(struct module *mod, struct load_info *info)
* users of info can keep taking advantage and using the newly
* minted official memory area.
*/
- shdr->sh_addr = (unsigned long)dest;
+ shdr->sh_addr = addr;
pr_debug("\t0x%lx 0x%.8lx %s\n", (long)shdr->sh_addr,
(long)shdr->sh_size, info->secstrings + shdr->sh_name);
}
return 0;
-out_enomem:
+out_err:
for (t--; t >= 0; t--)
- module_memory_free(mod, t, true);
+ module_memory_free(mod, t);
+ if (codetag_section_found)
+ codetag_free_module_sections(mod);
+
return ret;
}
@@ -2422,6 +2486,8 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
/* Module has been copied to its final place now: return it. */
mod = (void *)info->sechdrs[info->index.mod].sh_addr;
kmemleak_load_module(mod, info);
+ codetag_module_replaced(info->mod, mod);
+
return mod;
}
@@ -2431,7 +2497,7 @@ static void module_deallocate(struct module *mod, struct load_info *info)
percpu_modfree(mod);
module_arch_freeing_init(mod);
- free_mod_mem(mod, true);
+ free_mod_mem(mod);
}
int __weak module_finalize(const Elf_Ehdr *hdr,
@@ -2441,8 +2507,17 @@ int __weak module_finalize(const Elf_Ehdr *hdr,
return 0;
}
+int __weak module_post_finalize(const Elf_Ehdr *hdr,
+ const Elf_Shdr *sechdrs,
+ struct module *me)
+{
+ return 0;
+}
+
static int post_relocation(struct module *mod, const struct load_info *info)
{
+ int ret;
+
/* Sort exception table now relocations are done. */
sort_extable(mod->extable, mod->extable + mod->num_exentries);
@@ -2454,7 +2529,24 @@ static int post_relocation(struct module *mod, const struct load_info *info)
add_kallsyms(mod, info);
/* Arch-specific module finalizing. */
- return module_finalize(info->hdr, info->sechdrs, mod);
+ ret = module_finalize(info->hdr, info->sechdrs, mod);
+ if (ret)
+ return ret;
+
+ for_each_mod_mem_type(type) {
+ struct module_memory *mem = &mod->mem[type];
+
+ if (mem->is_rox) {
+ if (!execmem_update_copy(mem->base, mem->rw_copy,
+ mem->size))
+ return -ENOMEM;
+
+ vfree(mem->rw_copy);
+ mem->rw_copy = NULL;
+ }
+ }
+
+ return module_post_finalize(info->hdr, info->sechdrs, mod);
}
/* Call module constructors. */
diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c
index c45caa4690e5..239e5013359d 100644
--- a/kernel/module/strict_rwx.c
+++ b/kernel/module/strict_rwx.c
@@ -34,6 +34,9 @@ int module_enable_text_rox(const struct module *mod)
for_class_mod_mem_type(type, text) {
int ret;
+ if (mod->mem[type].is_rox)
+ continue;
+
if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
ret = module_set_memory(mod, type, set_memory_rox);
else
diff --git a/kernel/resource.c b/kernel/resource.c
index 4101016e8b20..d2c8143ae4ff 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1869,7 +1869,7 @@ static resource_size_t gfr_start(struct resource *base, resource_size_t size,
if (flags & GFR_DESCENDING) {
resource_size_t end;
- end = min_t(resource_size_t, base->end, PHYSMEM_END);
+ end = min_t(resource_size_t, base->end, DIRECT_MAP_PHYSMEM_END);
return end - size + 1;
}
@@ -1886,7 +1886,7 @@ static bool gfr_continue(struct resource *base, resource_size_t addr,
* @size did not wrap 0.
*/
return addr > addr - size &&
- addr <= min_t(resource_size_t, base->end, PHYSMEM_END);
+ addr <= min_t(resource_size_t, base->end, DIRECT_MAP_PHYSMEM_END);
}
static resource_size_t gfr_next(resource_size_t addr, resource_size_t size,
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index a2fe6d5cfbd2..5d9eca035d47 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -993,6 +993,7 @@ config CODE_TAGGING
config MEM_ALLOC_PROFILING
bool "Enable memory allocation profiling"
default n
+ depends on MMU
depends on PROC_FS
depends on !DEBUG_FORCE_WEAK_PER_CPU
select CODE_TAGGING
diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan
index 98016e137b7f..f82889a830fa 100644
--- a/lib/Kconfig.kasan
+++ b/lib/Kconfig.kasan
@@ -195,13 +195,6 @@ config KASAN_KUNIT_TEST
For more information on KUnit and unit tests in general, please refer
to the KUnit documentation in Documentation/dev-tools/kunit/.
-config KASAN_MODULE_TEST
- tristate "KUnit-incompatible tests of KASAN bug detection capabilities"
- depends on m && KASAN && !KASAN_HW_TAGS
- help
- A part of the KASAN test suite that is not integrated with KUnit.
- Incompatible with Hardware Tag-Based KASAN.
-
config KASAN_EXTRA_INFO
bool "Record and report more information"
depends on KASAN
diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
index 81e5f9a70f22..2414a7ee7ec7 100644
--- a/lib/alloc_tag.c
+++ b/lib/alloc_tag.c
@@ -1,12 +1,26 @@
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/alloc_tag.h>
+#include <linux/execmem.h>
#include <linux/fs.h>
#include <linux/gfp.h>
+#include <linux/kallsyms.h>
#include <linux/module.h>
#include <linux/page_ext.h>
#include <linux/proc_fs.h>
#include <linux/seq_buf.h>
#include <linux/seq_file.h>
+#include <linux/vmalloc.h>
+
+#define ALLOCINFO_FILE_NAME "allocinfo"
+#define MODULE_ALLOC_TAG_VMAP_SIZE (100000UL * sizeof(struct alloc_tag))
+#define SECTION_START(NAME) (CODETAG_SECTION_START_PREFIX NAME)
+#define SECTION_STOP(NAME) (CODETAG_SECTION_STOP_PREFIX NAME)
+
+#ifdef CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT
+static bool mem_profiling_support = true;
+#else
+static bool mem_profiling_support;
+#endif
static struct codetag_type *alloc_tag_cttype;
@@ -15,6 +29,11 @@ EXPORT_SYMBOL(_shared_alloc_tag);
DEFINE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
mem_alloc_profiling_key);
+DEFINE_STATIC_KEY_FALSE(mem_profiling_compressed);
+
+struct alloc_tag_kernel_section kernel_tags = { NULL, 0 };
+unsigned long alloc_tag_ref_mask;
+int alloc_tag_ref_offs;
struct allocinfo_private {
struct codetag_iterator iter;
@@ -144,44 +163,450 @@ size_t alloc_tag_top_users(struct codetag_bytes *tags, size_t count, bool can_sl
return nr;
}
+void pgalloc_tag_split(struct folio *folio, int old_order, int new_order)
+{
+ int i;
+ struct alloc_tag *tag;
+ unsigned int nr_pages = 1 << new_order;
+
+ if (!mem_alloc_profiling_enabled())
+ return;
+
+ tag = pgalloc_tag_get(&folio->page);
+ if (!tag)
+ return;
+
+ for (i = nr_pages; i < (1 << old_order); i += nr_pages) {
+ union pgtag_ref_handle handle;
+ union codetag_ref ref;
+
+ if (get_page_tag_ref(folio_page(folio, i), &ref, &handle)) {
+ /* Set new reference to point to the original tag */
+ alloc_tag_ref_set(&ref, tag);
+ update_page_tag_ref(handle, &ref);
+ put_page_tag_ref(handle);
+ }
+ }
+}
+
+void pgalloc_tag_copy(struct folio *new, struct folio *old)
+{
+ union pgtag_ref_handle handle;
+ union codetag_ref ref;
+ struct alloc_tag *tag;
+
+ tag = pgalloc_tag_get(&old->page);
+ if (!tag)
+ return;
+
+ if (!get_page_tag_ref(&new->page, &ref, &handle))
+ return;
+
+ /* Clear the old ref to the original allocation tag. */
+ clear_page_tag_ref(&old->page);
+ /* Decrement the counters of the tag on get_new_folio. */
+ alloc_tag_sub(&ref, folio_size(new));
+ __alloc_tag_ref_set(&ref, tag);
+ update_page_tag_ref(handle, &ref);
+ put_page_tag_ref(handle);
+}
+
+static void shutdown_mem_profiling(bool remove_file)
+{
+ if (mem_alloc_profiling_enabled())
+ static_branch_disable(&mem_alloc_profiling_key);
+
+ if (!mem_profiling_support)
+ return;
+
+ if (remove_file)
+ remove_proc_entry(ALLOCINFO_FILE_NAME, NULL);
+ mem_profiling_support = false;
+}
+
static void __init procfs_init(void)
{
- proc_create_seq("allocinfo", 0400, NULL, &allocinfo_seq_op);
+ if (!mem_profiling_support)
+ return;
+
+ if (!proc_create_seq(ALLOCINFO_FILE_NAME, 0400, NULL, &allocinfo_seq_op)) {
+ pr_err("Failed to create %s file\n", ALLOCINFO_FILE_NAME);
+ shutdown_mem_profiling(false);
+ }
+}
+
+void __init alloc_tag_sec_init(void)
+{
+ struct alloc_tag *last_codetag;
+
+ if (!mem_profiling_support)
+ return;
+
+ if (!static_key_enabled(&mem_profiling_compressed))
+ return;
+
+ kernel_tags.first_tag = (struct alloc_tag *)kallsyms_lookup_name(
+ SECTION_START(ALLOC_TAG_SECTION_NAME));
+ last_codetag = (struct alloc_tag *)kallsyms_lookup_name(
+ SECTION_STOP(ALLOC_TAG_SECTION_NAME));
+ kernel_tags.count = last_codetag - kernel_tags.first_tag;
+
+ /* Check if kernel tags fit into page flags */
+ if (kernel_tags.count > (1UL << NR_UNUSED_PAGEFLAG_BITS)) {
+ shutdown_mem_profiling(false); /* allocinfo file does not exist yet */
+ pr_err("%lu allocation tags cannot be referenced using %d available page flag bits. Memory allocation profiling is disabled!\n",
+ kernel_tags.count, NR_UNUSED_PAGEFLAG_BITS);
+ return;
+ }
+
+ alloc_tag_ref_offs = (LRU_REFS_PGOFF - NR_UNUSED_PAGEFLAG_BITS);
+ alloc_tag_ref_mask = ((1UL << NR_UNUSED_PAGEFLAG_BITS) - 1);
+ pr_debug("Memory allocation profiling compression is using %d page flag bits!\n",
+ NR_UNUSED_PAGEFLAG_BITS);
+}
+
+#ifdef CONFIG_MODULES
+
+static struct maple_tree mod_area_mt = MTREE_INIT(mod_area_mt, MT_FLAGS_ALLOC_RANGE);
+static struct vm_struct *vm_module_tags;
+/* A dummy object used to indicate an unloaded module */
+static struct module unloaded_mod;
+/* A dummy object used to indicate a module prepended area */
+static struct module prepend_mod;
+
+struct alloc_tag_module_section module_tags;
+
+static inline unsigned long alloc_tag_align(unsigned long val)
+{
+ if (!static_key_enabled(&mem_profiling_compressed)) {
+ /* No alignment requirements when we are not indexing the tags */
+ return val;
+ }
+
+ if (val % sizeof(struct alloc_tag) == 0)
+ return val;
+ return ((val / sizeof(struct alloc_tag)) + 1) * sizeof(struct alloc_tag);
}
-static bool alloc_tag_module_unload(struct codetag_type *cttype,
- struct codetag_module *cmod)
+static bool ensure_alignment(unsigned long align, unsigned int *prepend)
{
- struct codetag_iterator iter = codetag_get_ct_iter(cttype);
- struct alloc_tag_counters counter;
- bool module_unused = true;
+ if (!static_key_enabled(&mem_profiling_compressed)) {
+ /* No alignment requirements when we are not indexing the tags */
+ return true;
+ }
+
+ /*
+ * If alloc_tag size is not a multiple of required alignment, tag
+ * indexing does not work.
+ */
+ if (!IS_ALIGNED(sizeof(struct alloc_tag), align))
+ return false;
+
+ /* Ensure prepend consumes multiple of alloc_tag-sized blocks */
+ if (*prepend)
+ *prepend = alloc_tag_align(*prepend);
+
+ return true;
+}
+
+static inline bool tags_addressable(void)
+{
+ unsigned long tag_idx_count;
+
+ if (!static_key_enabled(&mem_profiling_compressed))
+ return true; /* with page_ext tags are always addressable */
+
+ tag_idx_count = CODETAG_ID_FIRST + kernel_tags.count +
+ module_tags.size / sizeof(struct alloc_tag);
+
+ return tag_idx_count < (1UL << NR_UNUSED_PAGEFLAG_BITS);
+}
+
+static bool needs_section_mem(struct module *mod, unsigned long size)
+{
+ if (!mem_profiling_support)
+ return false;
+
+ return size >= sizeof(struct alloc_tag);
+}
+
+static struct alloc_tag *find_used_tag(struct alloc_tag *from, struct alloc_tag *to)
+{
+ while (from <= to) {
+ struct alloc_tag_counters counter;
+
+ counter = alloc_tag_read(from);
+ if (counter.bytes)
+ return from;
+ from++;
+ }
+
+ return NULL;
+}
+
+/* Called with mod_area_mt locked */
+static void clean_unused_module_areas_locked(void)
+{
+ MA_STATE(mas, &mod_area_mt, 0, module_tags.size);
+ struct module *val;
+
+ mas_for_each(&mas, val, module_tags.size) {
+ if (val != &unloaded_mod)
+ continue;
+
+ /* Release area if all tags are unused */
+ if (!find_used_tag((struct alloc_tag *)(module_tags.start_addr + mas.index),
+ (struct alloc_tag *)(module_tags.start_addr + mas.last)))
+ mas_erase(&mas);
+ }
+}
+
+/* Called with mod_area_mt locked */
+static bool find_aligned_area(struct ma_state *mas, unsigned long section_size,
+ unsigned long size, unsigned int prepend, unsigned long align)
+{
+ bool cleanup_done = false;
+
+repeat:
+ /* Try finding exact size and hope the start is aligned */
+ if (!mas_empty_area(mas, 0, section_size - 1, prepend + size)) {
+ if (IS_ALIGNED(mas->index + prepend, align))
+ return true;
+
+ /* Try finding larger area to align later */
+ mas_reset(mas);
+ if (!mas_empty_area(mas, 0, section_size - 1,
+ size + prepend + align - 1))
+ return true;
+ }
+
+ /* No free area, try cleaning up stale data and repeat the search once */
+ if (!cleanup_done) {
+ clean_unused_module_areas_locked();
+ cleanup_done = true;
+ mas_reset(mas);
+ goto repeat;
+ }
+
+ return false;
+}
+
+static int vm_module_tags_populate(void)
+{
+ unsigned long phys_size = vm_module_tags->nr_pages << PAGE_SHIFT;
+
+ if (phys_size < module_tags.size) {
+ struct page **next_page = vm_module_tags->pages + vm_module_tags->nr_pages;
+ unsigned long addr = module_tags.start_addr + phys_size;
+ unsigned long more_pages;
+ unsigned long nr;
+
+ more_pages = ALIGN(module_tags.size - phys_size, PAGE_SIZE) >> PAGE_SHIFT;
+ nr = alloc_pages_bulk_array_node(GFP_KERNEL | __GFP_NOWARN,
+ NUMA_NO_NODE, more_pages, next_page);
+ if (nr < more_pages ||
+ vmap_pages_range(addr, addr + (nr << PAGE_SHIFT), PAGE_KERNEL,
+ next_page, PAGE_SHIFT) < 0) {
+ /* Clean up and error out */
+ for (int i = 0; i < nr; i++)
+ __free_page(next_page[i]);
+ return -ENOMEM;
+ }
+ vm_module_tags->nr_pages += nr;
+ }
+
+ return 0;
+}
+
+static void *reserve_module_tags(struct module *mod, unsigned long size,
+ unsigned int prepend, unsigned long align)
+{
+ unsigned long section_size = module_tags.end_addr - module_tags.start_addr;
+ MA_STATE(mas, &mod_area_mt, 0, section_size - 1);
+ unsigned long offset;
+ void *ret = NULL;
+
+ /* If no tags return error */
+ if (size < sizeof(struct alloc_tag))
+ return ERR_PTR(-EINVAL);
+
+ /*
+ * align is always power of 2, so we can use IS_ALIGNED and ALIGN.
+ * align 0 or 1 means no alignment, to simplify set to 1.
+ */
+ if (!align)
+ align = 1;
+
+ if (!ensure_alignment(align, &prepend)) {
+ shutdown_mem_profiling(true);
+ pr_err("%s: alignment %lu is incompatible with allocation tag indexing. Memory allocation profiling is disabled!\n",
+ mod->name, align);
+ return ERR_PTR(-EINVAL);
+ }
+
+ mas_lock(&mas);
+ if (!find_aligned_area(&mas, section_size, size, prepend, align)) {
+ ret = ERR_PTR(-ENOMEM);
+ goto unlock;
+ }
+
+ /* Mark found area as reserved */
+ offset = mas.index;
+ offset += prepend;
+ offset = ALIGN(offset, align);
+ if (offset != mas.index) {
+ unsigned long pad_start = mas.index;
+
+ mas.last = offset - 1;
+ mas_store(&mas, &prepend_mod);
+ if (mas_is_err(&mas)) {
+ ret = ERR_PTR(xa_err(mas.node));
+ goto unlock;
+ }
+ mas.index = offset;
+ mas.last = offset + size - 1;
+ mas_store(&mas, mod);
+ if (mas_is_err(&mas)) {
+ mas.index = pad_start;
+ mas_erase(&mas);
+ ret = ERR_PTR(xa_err(mas.node));
+ }
+ } else {
+ mas.last = offset + size - 1;
+ mas_store(&mas, mod);
+ if (mas_is_err(&mas))
+ ret = ERR_PTR(xa_err(mas.node));
+ }
+unlock:
+ mas_unlock(&mas);
+
+ if (IS_ERR(ret))
+ return ret;
+
+ if (module_tags.size < offset + size) {
+ int grow_res;
+
+ module_tags.size = offset + size;
+ if (mem_alloc_profiling_enabled() && !tags_addressable()) {
+ shutdown_mem_profiling(true);
+ pr_warn("With module %s there are too many tags to fit in %d page flag bits. Memory allocation profiling is disabled!\n",
+ mod->name, NR_UNUSED_PAGEFLAG_BITS);
+ }
+
+ grow_res = vm_module_tags_populate();
+ if (grow_res) {
+ shutdown_mem_profiling(true);
+ pr_err("Failed to allocate memory for allocation tags in the module %s. Memory allocation profiling is disabled!\n",
+ mod->name);
+ return ERR_PTR(grow_res);
+ }
+ }
+
+ return (struct alloc_tag *)(module_tags.start_addr + offset);
+}
+
+static void release_module_tags(struct module *mod, bool used)
+{
+ MA_STATE(mas, &mod_area_mt, module_tags.size, module_tags.size);
struct alloc_tag *tag;
- struct codetag *ct;
+ struct module *val;
+
+ mas_lock(&mas);
+ mas_for_each_rev(&mas, val, 0)
+ if (val == mod)
+ break;
+
+ if (!val) /* module not found */
+ goto out;
+
+ if (!used)
+ goto release_area;
+
+ /* Find out if the area is used */
+ tag = find_used_tag((struct alloc_tag *)(module_tags.start_addr + mas.index),
+ (struct alloc_tag *)(module_tags.start_addr + mas.last));
+ if (tag) {
+ struct alloc_tag_counters counter = alloc_tag_read(tag);
+
+ pr_info("%s:%u module %s func:%s has %llu allocated at module unload\n",
+ tag->ct.filename, tag->ct.lineno, tag->ct.modname,
+ tag->ct.function, counter.bytes);
+ } else {
+ used = false;
+ }
+release_area:
+ mas_store(&mas, used ? &unloaded_mod : NULL);
+ val = mas_prev_range(&mas, 0);
+ if (val == &prepend_mod)
+ mas_store(&mas, NULL);
+out:
+ mas_unlock(&mas);
+}
- for (ct = codetag_next_ct(&iter); ct; ct = codetag_next_ct(&iter)) {
- if (iter.cmod != cmod)
+static void replace_module(struct module *mod, struct module *new_mod)
+{
+ MA_STATE(mas, &mod_area_mt, 0, module_tags.size);
+ struct module *val;
+
+ mas_lock(&mas);
+ mas_for_each(&mas, val, module_tags.size) {
+ if (val != mod)
continue;
- tag = ct_to_alloc_tag(ct);
- counter = alloc_tag_read(tag);
+ mas_store_gfp(&mas, new_mod, GFP_KERNEL);
+ break;
+ }
+ mas_unlock(&mas);
+}
- if (WARN(counter.bytes,
- "%s:%u module %s func:%s has %llu allocated at module unload",
- ct->filename, ct->lineno, ct->modname, ct->function, counter.bytes))
- module_unused = false;
+static int __init alloc_mod_tags_mem(void)
+{
+ /* Map space to copy allocation tags */
+ vm_module_tags = execmem_vmap(MODULE_ALLOC_TAG_VMAP_SIZE);
+ if (!vm_module_tags) {
+ pr_err("Failed to map %lu bytes for module allocation tags\n",
+ MODULE_ALLOC_TAG_VMAP_SIZE);
+ module_tags.start_addr = 0;
+ return -ENOMEM;
}
- return module_unused;
+ vm_module_tags->pages = kmalloc_array(get_vm_area_size(vm_module_tags) >> PAGE_SHIFT,
+ sizeof(struct page *), GFP_KERNEL | __GFP_ZERO);
+ if (!vm_module_tags->pages) {
+ free_vm_area(vm_module_tags);
+ return -ENOMEM;
+ }
+
+ module_tags.start_addr = (unsigned long)vm_module_tags->addr;
+ module_tags.end_addr = module_tags.start_addr + MODULE_ALLOC_TAG_VMAP_SIZE;
+ /* Ensure the base is alloc_tag aligned when required for indexing */
+ module_tags.start_addr = alloc_tag_align(module_tags.start_addr);
+
+ return 0;
+}
+
+static void __init free_mod_tags_mem(void)
+{
+ int i;
+
+ module_tags.start_addr = 0;
+ for (i = 0; i < vm_module_tags->nr_pages; i++)
+ __free_page(vm_module_tags->pages[i]);
+ kfree(vm_module_tags->pages);
+ free_vm_area(vm_module_tags);
}
-#ifdef CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT
-static bool mem_profiling_support __meminitdata = true;
-#else
-static bool mem_profiling_support __meminitdata;
-#endif
+#else /* CONFIG_MODULES */
+
+static inline int alloc_mod_tags_mem(void) { return 0; }
+static inline void free_mod_tags_mem(void) {}
+
+#endif /* CONFIG_MODULES */
+/* See: Documentation/mm/allocation-profiling.rst */
static int __init setup_early_mem_profiling(char *str)
{
+ bool compressed = false;
bool enable;
if (!str || !str[0])
@@ -190,22 +615,37 @@ static int __init setup_early_mem_profiling(char *str)
if (!strncmp(str, "never", 5)) {
enable = false;
mem_profiling_support = false;
+ pr_info("Memory allocation profiling is disabled!\n");
} else {
- int res;
+ char *token = strsep(&str, ",");
- res = kstrtobool(str, &enable);
- if (res)
- return res;
+ if (kstrtobool(token, &enable))
+ return -EINVAL;
+ if (str) {
+
+ if (strcmp(str, "compressed"))
+ return -EINVAL;
+
+ compressed = true;
+ }
mem_profiling_support = true;
+ pr_info("Memory allocation profiling is enabled %s compression and is turned %s!\n",
+ compressed ? "with" : "without", enable ? "on" : "off");
}
- if (enable != static_key_enabled(&mem_alloc_profiling_key)) {
+ if (enable != mem_alloc_profiling_enabled()) {
if (enable)
static_branch_enable(&mem_alloc_profiling_key);
else
static_branch_disable(&mem_alloc_profiling_key);
}
+ if (compressed != static_key_enabled(&mem_profiling_compressed)) {
+ if (compressed)
+ static_branch_enable(&mem_profiling_compressed);
+ else
+ static_branch_disable(&mem_profiling_compressed);
+ }
return 0;
}
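
Based on the parsing above, the early parameter takes forms such as the following on the kernel command line (illustrative values):

	sysctl.vm.mem_profiling=never
	sysctl.vm.mem_profiling=0
	sysctl.vm.mem_profiling=1
	sysctl.vm.mem_profiling=1,compressed
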
@@ -213,6 +653,9 @@ early_param("sysctl.vm.mem_profiling", setup_early_mem_profiling);
static __init bool need_page_alloc_tagging(void)
{
+ if (static_key_enabled(&mem_profiling_compressed))
+ return false;
+
return mem_profiling_support;
}
@@ -255,14 +698,26 @@ static inline void sysctl_init(void) {}
static int __init alloc_tag_init(void)
{
const struct codetag_type_desc desc = {
- .section = "alloc_tags",
- .tag_size = sizeof(struct alloc_tag),
- .module_unload = alloc_tag_module_unload,
+ .section = ALLOC_TAG_SECTION_NAME,
+ .tag_size = sizeof(struct alloc_tag),
+#ifdef CONFIG_MODULES
+ .needs_section_mem = needs_section_mem,
+ .alloc_section_mem = reserve_module_tags,
+ .free_section_mem = release_module_tags,
+ .module_replaced = replace_module,
+#endif
};
+ int res;
+
+ res = alloc_mod_tags_mem();
+ if (res)
+ return res;
alloc_tag_cttype = codetag_register_type(&desc);
- if (IS_ERR(alloc_tag_cttype))
+ if (IS_ERR(alloc_tag_cttype)) {
+ free_mod_tags_mem();
return PTR_ERR(alloc_tag_cttype);
+ }
sysctl_init();
procfs_init();
diff --git a/lib/codetag.c b/lib/codetag.c
index d1fbbb7c2ec3..42aadd6c1454 100644
--- a/lib/codetag.c
+++ b/lib/codetag.c
@@ -149,8 +149,8 @@ static struct codetag_range get_section_range(struct module *mod,
const char *section)
{
return (struct codetag_range) {
- get_symbol(mod, "__start_", section),
- get_symbol(mod, "__stop_", section),
+ get_symbol(mod, CODETAG_SECTION_START_PREFIX, section),
+ get_symbol(mod, CODETAG_SECTION_STOP_PREFIX, section),
};
}
@@ -207,6 +207,94 @@ static int codetag_module_init(struct codetag_type *cttype, struct module *mod)
}
#ifdef CONFIG_MODULES
+#define CODETAG_SECTION_PREFIX ".codetag."
+
+/* Some codetag types need a separate module section */
+bool codetag_needs_module_section(struct module *mod, const char *name,
+ unsigned long size)
+{
+ const char *type_name;
+ struct codetag_type *cttype;
+ bool ret = false;
+
+ if (strncmp(name, CODETAG_SECTION_PREFIX, strlen(CODETAG_SECTION_PREFIX)))
+ return false;
+
+ type_name = name + strlen(CODETAG_SECTION_PREFIX);
+ mutex_lock(&codetag_lock);
+ list_for_each_entry(cttype, &codetag_types, link) {
+ if (strcmp(type_name, cttype->desc.section) == 0) {
+ if (!cttype->desc.needs_section_mem)
+ break;
+
+ down_write(&cttype->mod_lock);
+ ret = cttype->desc.needs_section_mem(mod, size);
+ up_write(&cttype->mod_lock);
+ break;
+ }
+ }
+ mutex_unlock(&codetag_lock);
+
+ return ret;
+}
+
+void *codetag_alloc_module_section(struct module *mod, const char *name,
+ unsigned long size, unsigned int prepend,
+ unsigned long align)
+{
+ const char *type_name = name + strlen(CODETAG_SECTION_PREFIX);
+ struct codetag_type *cttype;
+ void *ret = ERR_PTR(-EINVAL);
+
+ mutex_lock(&codetag_lock);
+ list_for_each_entry(cttype, &codetag_types, link) {
+ if (strcmp(type_name, cttype->desc.section) == 0) {
+ if (WARN_ON(!cttype->desc.alloc_section_mem))
+ break;
+
+ down_write(&cttype->mod_lock);
+ ret = cttype->desc.alloc_section_mem(mod, size, prepend, align);
+ up_write(&cttype->mod_lock);
+ break;
+ }
+ }
+ mutex_unlock(&codetag_lock);
+
+ return ret;
+}
+
+void codetag_free_module_sections(struct module *mod)
+{
+ struct codetag_type *cttype;
+
+ mutex_lock(&codetag_lock);
+ list_for_each_entry(cttype, &codetag_types, link) {
+ if (!cttype->desc.free_section_mem)
+ continue;
+
+ down_write(&cttype->mod_lock);
+ cttype->desc.free_section_mem(mod, false);
+ up_write(&cttype->mod_lock);
+ }
+ mutex_unlock(&codetag_lock);
+}
+
+void codetag_module_replaced(struct module *mod, struct module *new_mod)
+{
+ struct codetag_type *cttype;
+
+ mutex_lock(&codetag_lock);
+ list_for_each_entry(cttype, &codetag_types, link) {
+ if (!cttype->desc.module_replaced)
+ continue;
+
+ down_write(&cttype->mod_lock);
+ cttype->desc.module_replaced(mod, new_mod);
+ up_write(&cttype->mod_lock);
+ }
+ mutex_unlock(&codetag_lock);
+}
+
void codetag_load_module(struct module *mod)
{
struct codetag_type *cttype;
@@ -220,13 +308,12 @@ void codetag_load_module(struct module *mod)
mutex_unlock(&codetag_lock);
}
-bool codetag_unload_module(struct module *mod)
+void codetag_unload_module(struct module *mod)
{
struct codetag_type *cttype;
- bool unload_ok = true;
if (!mod)
- return true;
+ return;
/* await any module's kfree_rcu() operations to complete */
kvfree_rcu_barrier();
@@ -246,18 +333,17 @@ bool codetag_unload_module(struct module *mod)
}
if (found) {
if (cttype->desc.module_unload)
- if (!cttype->desc.module_unload(cttype, cmod))
- unload_ok = false;
+ cttype->desc.module_unload(cttype, cmod);
cttype->count -= range_size(cttype, &cmod->range);
idr_remove(&cttype->mod_idr, mod_id);
kfree(cmod);
}
up_write(&cttype->mod_lock);
+ if (found && cttype->desc.free_section_mem)
+ cttype->desc.free_section_mem(mod, true);
}
mutex_unlock(&codetag_lock);
-
- return unload_ok;
}
#endif /* CONFIG_MODULES */
diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index 3619301dda2e..d0ae808f3a14 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -64,6 +64,21 @@
#define CREATE_TRACE_POINTS
#include <trace/events/maple_tree.h>
+/*
+ * Kernel pointer hashing renders much of the maple tree dump useless as tagged
+ * pointers get hashed to arbitrary values.
+ *
+ * If CONFIG_DEBUG_VM_MAPLE_TREE is set we are in a debug mode where it is
+ * permissible to bypass this. Otherwise remain cautious and retain the hashing.
+ *
+ * Userland doesn't know about %px so also use %p there.
+ */
+#if defined(__KERNEL__) && defined(CONFIG_DEBUG_VM_MAPLE_TREE)
+#define PTR_FMT "%px"
+#else
+#define PTR_FMT "%p"
+#endif
+
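Every dump and debug format string below is built from this macro, so raw node and entry addresses are only printed when the tree debug option is on. A minimal usage sketch (illustrative, not part of the patch):

static void dump_node_ptr(const struct maple_node *node)
{
	/* raw %px only with CONFIG_DEBUG_VM_MAPLE_TREE=y, hashed %p otherwise */
	pr_info("node " PTR_FMT "\n", node);
}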
#define MA_ROOT_PARENT 1
/*
@@ -120,7 +135,6 @@ static const unsigned char mt_min_slots[] = {
#define MAPLE_BIG_NODE_GAPS (MAPLE_ARANGE64_SLOTS * 2 + 1)
struct maple_big_node {
- struct maple_pnode *parent;
unsigned long pivot[MAPLE_BIG_NODE_SLOTS - 1];
union {
struct maple_enode *slot[MAPLE_BIG_NODE_SLOTS];
@@ -1193,19 +1207,17 @@ static inline void mas_push_node(struct ma_state *mas, struct maple_node *used)
reuse->request_count = 0;
reuse->node_count = 0;
- if (count && (head->node_count < MAPLE_ALLOC_SLOTS)) {
- head->slot[head->node_count++] = reuse;
- head->total++;
- goto done;
- }
-
- reuse->total = 1;
- if ((head) && !((unsigned long)head & 0x1)) {
+ if (count) {
+ if (head->node_count < MAPLE_ALLOC_SLOTS) {
+ head->slot[head->node_count++] = reuse;
+ head->total++;
+ goto done;
+ }
reuse->slot[0] = head;
reuse->node_count = 1;
- reuse->total += head->total;
}
+ reuse->total = count + 1;
mas->alloc = reuse;
done:
if (requested > 1)
@@ -1251,11 +1263,11 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp)
mas->alloc = node;
node->total = ++allocated;
+ node->request_count = 0;
requested--;
}
node = mas->alloc;
- node->request_count = 0;
while (requested) {
max_req = MAPLE_ALLOC_SLOTS - node->node_count;
slots = (void **)&node->slot[node->node_count];
@@ -1271,7 +1283,10 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp)
node->node_count += count;
allocated += count;
- node = node->slot[0];
+ /* find a non-full node */
+ do {
+ node = node->slot[0];
+ } while (unlikely(node->node_count == MAPLE_ALLOC_SLOTS));
requested -= count;
}
mas->alloc->total = allocated;
@@ -1280,10 +1295,9 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp)
nomem_bulk:
/* Clean up potential freed allocations on bulk failure */
memset(slots, 0, max_req * sizeof(unsigned long));
+ mas->alloc->total = allocated;
nomem_one:
mas_set_alloc_req(mas, requested);
- if (mas->alloc && !(((unsigned long)mas->alloc & 0x1)))
- mas->alloc->total = allocated;
mas_set_err(mas, -ENOMEM);
}
@@ -1943,14 +1957,13 @@ static inline void mas_mab_cp(struct ma_state *mas, unsigned char mas_start,
for (; i < piv_end; i++, j++) {
b_node->pivot[j] = pivots[i];
if (unlikely(!b_node->pivot[j]))
- break;
+ goto complete;
if (unlikely(mas->max == b_node->pivot[j]))
goto complete;
}
- if (likely(i <= mas_end))
- b_node->pivot[j] = mas_safe_pivot(mas, pivots, i, mt);
+ b_node->pivot[j] = mas_safe_pivot(mas, pivots, i, mt);
complete:
b_node->b_end = ++j;
@@ -2139,9 +2152,7 @@ static inline bool mas_prev_sibling(struct ma_state *mas)
{
unsigned int p_slot = mte_parent_slot(mas->node);
- if (mte_is_root(mas->node))
- return false;
-
+ /* For root node, p_slot is set to 0 by mte_parent_slot(). */
if (!p_slot)
return false;
@@ -3159,10 +3170,7 @@ static inline void mast_fill_bnode(struct maple_subtree_state *mast,
bool cp = true;
unsigned char split;
- memset(mast->bn->gap, 0, sizeof(unsigned long) * ARRAY_SIZE(mast->bn->gap));
- memset(mast->bn->slot, 0, sizeof(unsigned long) * ARRAY_SIZE(mast->bn->slot));
- memset(mast->bn->pivot, 0, sizeof(unsigned long) * ARRAY_SIZE(mast->bn->pivot));
- mast->bn->b_end = 0;
+ memset(mast->bn, 0, sizeof(struct maple_big_node));
if (mte_is_root(mas->node)) {
cp = false;
@@ -3400,7 +3408,7 @@ static noinline_for_kasan void mas_commit_b_node(struct ma_wr_state *wr_mas,
* @mas: The maple state
* @entry: The entry to store into the tree
*/
-static inline int mas_root_expand(struct ma_state *mas, void *entry)
+static inline void mas_root_expand(struct ma_state *mas, void *entry)
{
void *contents = mas_root_locked(mas);
enum maple_type type = maple_leaf_64;
@@ -3436,12 +3444,23 @@ static inline int mas_root_expand(struct ma_state *mas, void *entry)
ma_set_meta(node, maple_leaf_64, 0, slot);
/* swap the new root into the tree */
rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node));
- return slot;
+ return;
}
+/*
+ * mas_store_root() - Store a value into the root.
+ * @mas: The maple state
+ * @entry: The entry to store.
+ *
+ * There is no root node now and we are storing a value into the root - this
+ * function either assigns the pointer or expands into a node.
+ */
static inline void mas_store_root(struct ma_state *mas, void *entry)
{
- if (likely((mas->last != 0) || (mas->index != 0)))
+ if (!entry) {
+ if (!mas->index)
+ rcu_assign_pointer(mas->tree->ma_root, NULL);
+ } else if (likely((mas->last != 0) || (mas->index != 0)))
mas_root_expand(mas, entry);
else if (((unsigned long) (entry) & 3) == 2)
mas_root_expand(mas, entry);
@@ -3662,7 +3681,9 @@ static inline void mas_new_root(struct ma_state *mas, void *entry)
void __rcu **slots;
unsigned long *pivots;
- if (!entry && !mas->index && mas->last == ULONG_MAX) {
+ WARN_ON_ONCE(mas->index || mas->last != ULONG_MAX);
+
+ if (!entry) {
mas->depth = 0;
mas_set_height(mas);
rcu_assign_pointer(mas->tree->ma_root, entry);
@@ -3889,7 +3910,8 @@ static inline void mas_wr_slot_store(struct ma_wr_state *wr_mas)
wr_mas->pivots[offset] = mas->index - 1;
mas->offset++; /* Keep mas accurate. */
}
- } else if (!mt_in_rcu(mas->tree)) {
+ } else {
+ WARN_ON_ONCE(mt_in_rcu(mas->tree));
/*
* Expand the range, only partially overwriting the previous and
* next ranges
@@ -3899,8 +3921,6 @@ static inline void mas_wr_slot_store(struct ma_wr_state *wr_mas)
wr_mas->pivots[offset] = mas->index - 1;
wr_mas->pivots[offset + 1] = mas->last;
mas->offset++; /* Keep mas accurate. */
- } else {
- return;
}
trace_ma_write(__func__, mas, 0, wr_mas->entry);
@@ -4181,75 +4201,53 @@ static inline int mas_prealloc_calc(struct ma_state *mas, void *entry)
}
/*
- * mas_wr_store_type() - Set the store type for a given
+ * mas_wr_store_type() - Determine the store type for a given
* store operation.
* @wr_mas: The maple write state
+ *
+ * Return: the type of store needed for the operation
*/
-static inline void mas_wr_store_type(struct ma_wr_state *wr_mas)
+static inline enum store_type mas_wr_store_type(struct ma_wr_state *wr_mas)
{
struct ma_state *mas = wr_mas->mas;
unsigned char new_end;
- if (unlikely(mas_is_none(mas) || mas_is_ptr(mas))) {
- mas->store_type = wr_store_root;
- return;
- }
+ if (unlikely(mas_is_none(mas) || mas_is_ptr(mas)))
+ return wr_store_root;
- if (unlikely(!mas_wr_walk(wr_mas))) {
- mas->store_type = wr_spanning_store;
- return;
- }
+ if (unlikely(!mas_wr_walk(wr_mas)))
+ return wr_spanning_store;
/* At this point, we are at the leaf node that needs to be altered. */
mas_wr_end_piv(wr_mas);
if (!wr_mas->entry)
mas_wr_extend_null(wr_mas);
- new_end = mas_wr_new_end(wr_mas);
- if ((wr_mas->r_min == mas->index) && (wr_mas->r_max == mas->last)) {
- mas->store_type = wr_exact_fit;
- return;
- }
+ if ((wr_mas->r_min == mas->index) && (wr_mas->r_max == mas->last))
+ return wr_exact_fit;
- if (unlikely(!mas->index && mas->last == ULONG_MAX)) {
- mas->store_type = wr_new_root;
- return;
- }
+ if (unlikely(!mas->index && mas->last == ULONG_MAX))
+ return wr_new_root;
+ new_end = mas_wr_new_end(wr_mas);
/* Potential spanning rebalance collapsing a node */
if (new_end < mt_min_slots[wr_mas->type]) {
- if (!mte_is_root(mas->node) && !(mas->mas_flags & MA_STATE_BULK)) {
- mas->store_type = wr_rebalance;
- return;
- }
- mas->store_type = wr_node_store;
- return;
+ if (!mte_is_root(mas->node) && !(mas->mas_flags & MA_STATE_BULK))
+ return wr_rebalance;
+ return wr_node_store;
}
- if (new_end >= mt_slots[wr_mas->type]) {
- mas->store_type = wr_split_store;
- return;
- }
+ if (new_end >= mt_slots[wr_mas->type])
+ return wr_split_store;
- if (!mt_in_rcu(mas->tree) && (mas->offset == mas->end)) {
- mas->store_type = wr_append;
- return;
- }
+ if (!mt_in_rcu(mas->tree) && (mas->offset == mas->end))
+ return wr_append;
if ((new_end == mas->end) && (!mt_in_rcu(mas->tree) ||
- (wr_mas->offset_end - mas->offset == 1))) {
- mas->store_type = wr_slot_store;
- return;
- }
-
- if (mte_is_root(mas->node) || (new_end >= mt_min_slots[wr_mas->type]) ||
- (mas->mas_flags & MA_STATE_BULK)) {
- mas->store_type = wr_node_store;
- return;
- }
+ (wr_mas->offset_end - mas->offset == 1)))
+ return wr_slot_store;
- mas->store_type = wr_invalid;
- MAS_WARN_ON(mas, 1);
+ return wr_node_store;
}
/**
@@ -4264,7 +4262,7 @@ static inline void mas_wr_preallocate(struct ma_wr_state *wr_mas, void *entry)
int request;
mas_wr_prealloc_setup(wr_mas);
- mas_wr_store_type(wr_mas);
+ mas->store_type = mas_wr_store_type(wr_mas);
request = mas_prealloc_calc(mas, entry);
if (!request)
return;
@@ -5419,7 +5417,8 @@ void *mas_store(struct ma_state *mas, void *entry)
trace_ma_write(__func__, mas, 0, entry);
#ifdef CONFIG_DEBUG_MAPLE_TREE
if (MAS_WARN_ON(mas, mas->index > mas->last))
- pr_err("Error %lX > %lX %p\n", mas->index, mas->last, entry);
+ pr_err("Error %lX > %lX " PTR_FMT "\n", mas->index, mas->last,
+ entry);
if (mas->index > mas->last) {
mas_set_err(mas, -EINVAL);
@@ -5435,7 +5434,7 @@ void *mas_store(struct ma_state *mas, void *entry)
* overwrite multiple entries within a self-balancing B-Tree.
*/
mas_wr_prealloc_setup(&wr_mas);
- mas_wr_store_type(&wr_mas);
+ mas->store_type = mas_wr_store_type(&wr_mas);
if (mas->mas_flags & MA_STATE_PREALLOC) {
mas_wr_store_entry(&wr_mas);
MAS_WR_BUG_ON(&wr_mas, mas_is_err(mas));
@@ -5538,7 +5537,7 @@ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp)
int request;
mas_wr_prealloc_setup(&wr_mas);
- mas_wr_store_type(&wr_mas);
+ mas->store_type = mas_wr_store_type(&wr_mas);
request = mas_prealloc_calc(mas, entry);
if (!request)
return ret;
@@ -7124,14 +7123,14 @@ static void mt_dump_entry(void *entry, unsigned long min, unsigned long max,
mt_dump_range(min, max, depth, format);
if (xa_is_value(entry))
- pr_cont("value %ld (0x%lx) [%p]\n", xa_to_value(entry),
- xa_to_value(entry), entry);
+ pr_cont("value %ld (0x%lx) [" PTR_FMT "]\n", xa_to_value(entry),
+ xa_to_value(entry), entry);
else if (xa_is_zero(entry))
pr_cont("zero (%ld)\n", xa_to_internal(entry));
else if (mt_is_reserved(entry))
- pr_cont("UNKNOWN ENTRY (%p)\n", entry);
+ pr_cont("UNKNOWN ENTRY (" PTR_FMT ")\n", entry);
else
- pr_cont("%p\n", entry);
+ pr_cont(PTR_FMT "\n", entry);
}
static void mt_dump_range64(const struct maple_tree *mt, void *entry,
@@ -7147,13 +7146,13 @@ static void mt_dump_range64(const struct maple_tree *mt, void *entry,
for (i = 0; i < MAPLE_RANGE64_SLOTS - 1; i++) {
switch(format) {
case mt_dump_hex:
- pr_cont("%p %lX ", node->slot[i], node->pivot[i]);
+ pr_cont(PTR_FMT " %lX ", node->slot[i], node->pivot[i]);
break;
case mt_dump_dec:
- pr_cont("%p %lu ", node->slot[i], node->pivot[i]);
+ pr_cont(PTR_FMT " %lu ", node->slot[i], node->pivot[i]);
}
}
- pr_cont("%p\n", node->slot[i]);
+ pr_cont(PTR_FMT "\n", node->slot[i]);
for (i = 0; i < MAPLE_RANGE64_SLOTS; i++) {
unsigned long last = max;
@@ -7175,11 +7174,11 @@ static void mt_dump_range64(const struct maple_tree *mt, void *entry,
if (last > max) {
switch(format) {
case mt_dump_hex:
- pr_err("node %p last (%lx) > max (%lx) at pivot %d!\n",
+ pr_err("node " PTR_FMT " last (%lx) > max (%lx) at pivot %d!\n",
node, last, max, i);
break;
case mt_dump_dec:
- pr_err("node %p last (%lu) > max (%lu) at pivot %d!\n",
+ pr_err("node " PTR_FMT " last (%lu) > max (%lu) at pivot %d!\n",
node, last, max, i);
}
}
@@ -7209,13 +7208,13 @@ static void mt_dump_arange64(const struct maple_tree *mt, void *entry,
for (i = 0; i < MAPLE_ARANGE64_SLOTS - 1; i++) {
switch (format) {
case mt_dump_hex:
- pr_cont("%p %lX ", node->slot[i], node->pivot[i]);
+ pr_cont(PTR_FMT " %lX ", node->slot[i], node->pivot[i]);
break;
case mt_dump_dec:
- pr_cont("%p %lu ", node->slot[i], node->pivot[i]);
+ pr_cont(PTR_FMT " %lu ", node->slot[i], node->pivot[i]);
}
}
- pr_cont("%p\n", node->slot[i]);
+ pr_cont(PTR_FMT "\n", node->slot[i]);
for (i = 0; i < MAPLE_ARANGE64_SLOTS; i++) {
unsigned long last = max;
@@ -7234,11 +7233,11 @@ static void mt_dump_arange64(const struct maple_tree *mt, void *entry,
if (last > max) {
switch(format) {
case mt_dump_hex:
- pr_err("node %p last (%lx) > max (%lx) at pivot %d!\n",
+ pr_err("node " PTR_FMT " last (%lx) > max (%lx) at pivot %d!\n",
node, last, max, i);
break;
case mt_dump_dec:
- pr_err("node %p last (%lu) > max (%lu) at pivot %d!\n",
+ pr_err("node " PTR_FMT " last (%lu) > max (%lu) at pivot %d!\n",
node, last, max, i);
}
}
@@ -7256,8 +7255,8 @@ static void mt_dump_node(const struct maple_tree *mt, void *entry,
mt_dump_range(min, max, depth, format);
- pr_cont("node %p depth %d type %d parent %p", node, depth, type,
- node ? node->parent : NULL);
+ pr_cont("node " PTR_FMT " depth %d type %d parent " PTR_FMT, node,
+ depth, type, node ? node->parent : NULL);
switch (type) {
case maple_dense:
pr_cont("\n");
@@ -7285,12 +7284,14 @@ void mt_dump(const struct maple_tree *mt, enum mt_dump_format format)
{
void *entry = rcu_dereference_check(mt->ma_root, mt_locked(mt));
- pr_info("maple_tree(%p) flags %X, height %u root %p\n",
+ pr_info("maple_tree(" PTR_FMT ") flags %X, height %u root " PTR_FMT "\n",
mt, mt->ma_flags, mt_height(mt), entry);
- if (!xa_is_node(entry))
- mt_dump_entry(entry, 0, 0, 0, format);
- else if (entry)
+ if (xa_is_node(entry))
mt_dump_node(mt, entry, 0, mt_node_max(entry), 0, format);
+ else if (entry)
+ mt_dump_entry(entry, 0, 0, 0, format);
+ else
+ pr_info("(empty)\n");
}
EXPORT_SYMBOL_GPL(mt_dump);
@@ -7337,7 +7338,7 @@ static void mas_validate_gaps(struct ma_state *mas)
MT_BUG_ON(mas->tree, !entry);
if (gap > p_end - p_start + 1) {
- pr_err("%p[%u] %lu >= %lu - %lu + 1 (%lu)\n",
+ pr_err(PTR_FMT "[%u] %lu >= %lu - %lu + 1 (%lu)\n",
mas_mn(mas), i, gap, p_end, p_start,
p_end - p_start + 1);
MT_BUG_ON(mas->tree, gap > p_end - p_start + 1);
@@ -7357,19 +7358,19 @@ counted:
MT_BUG_ON(mas->tree, !gaps);
offset = ma_meta_gap(node);
if (offset > i) {
- pr_err("gap offset %p[%u] is invalid\n", node, offset);
+ pr_err("gap offset " PTR_FMT "[%u] is invalid\n", node, offset);
MT_BUG_ON(mas->tree, 1);
}
if (gaps[offset] != max_gap) {
- pr_err("gap %p[%u] is not the largest gap %lu\n",
+ pr_err("gap " PTR_FMT "[%u] is not the largest gap %lu\n",
node, offset, max_gap);
MT_BUG_ON(mas->tree, 1);
}
for (i++ ; i < mt_slot_count(mte); i++) {
if (gaps[i] != 0) {
- pr_err("gap %p[%u] beyond node limit != 0\n",
+ pr_err("gap " PTR_FMT "[%u] beyond node limit != 0\n",
node, i);
MT_BUG_ON(mas->tree, 1);
}
@@ -7383,7 +7384,7 @@ counted:
p_mn = mte_parent(mte);
MT_BUG_ON(mas->tree, max_gap > mas->max);
if (ma_gaps(p_mn, mas_parent_type(mas, mte))[p_slot] != max_gap) {
- pr_err("gap %p[%u] != %lu\n", p_mn, p_slot, max_gap);
+ pr_err("gap " PTR_FMT "[%u] != %lu\n", p_mn, p_slot, max_gap);
mt_dump(mas->tree, mt_dump_hex);
MT_BUG_ON(mas->tree, 1);
}
@@ -7413,11 +7414,11 @@ static void mas_validate_parent_slot(struct ma_state *mas)
node = mas_slot(mas, slots, i);
if (i == p_slot) {
if (node != mas->node)
- pr_err("parent %p[%u] does not have %p\n",
+ pr_err("parent " PTR_FMT "[%u] does not have " PTR_FMT "\n",
parent, i, mas_mn(mas));
MT_BUG_ON(mas->tree, node != mas->node);
} else if (node == mas->node) {
- pr_err("Invalid child %p at parent %p[%u] p_slot %u\n",
+ pr_err("Invalid child " PTR_FMT " at parent " PTR_FMT "[%u] p_slot %u\n",
mas_mn(mas), parent, i, p_slot);
MT_BUG_ON(mas->tree, node == mas->node);
}
@@ -7439,20 +7440,20 @@ static void mas_validate_child_slot(struct ma_state *mas)
child = mas_slot(mas, slots, i);
if (!child) {
- pr_err("Non-leaf node lacks child at %p[%u]\n",
+ pr_err("Non-leaf node lacks child at " PTR_FMT "[%u]\n",
mas_mn(mas), i);
MT_BUG_ON(mas->tree, 1);
}
if (mte_parent_slot(child) != i) {
- pr_err("Slot error at %p[%u]: child %p has pslot %u\n",
+ pr_err("Slot error at " PTR_FMT "[%u]: child " PTR_FMT " has pslot %u\n",
mas_mn(mas), i, mte_to_node(child),
mte_parent_slot(child));
MT_BUG_ON(mas->tree, 1);
}
if (mte_parent(child) != mte_to_node(mas->node)) {
- pr_err("child %p has parent %p not %p\n",
+ pr_err("child " PTR_FMT " has parent " PTR_FMT " not " PTR_FMT "\n",
mte_to_node(child), mte_parent(child),
mte_to_node(mas->node));
MT_BUG_ON(mas->tree, 1);
@@ -7482,24 +7483,24 @@ static void mas_validate_limits(struct ma_state *mas)
piv = mas_safe_pivot(mas, pivots, i, type);
if (!piv && (i != 0)) {
- pr_err("Missing node limit pivot at %p[%u]",
+ pr_err("Missing node limit pivot at " PTR_FMT "[%u]",
mas_mn(mas), i);
MAS_WARN_ON(mas, 1);
}
if (prev_piv > piv) {
- pr_err("%p[%u] piv %lu < prev_piv %lu\n",
+ pr_err(PTR_FMT "[%u] piv %lu < prev_piv %lu\n",
mas_mn(mas), i, piv, prev_piv);
MAS_WARN_ON(mas, piv < prev_piv);
}
if (piv < mas->min) {
- pr_err("%p[%u] %lu < %lu\n", mas_mn(mas), i,
+ pr_err(PTR_FMT "[%u] %lu < %lu\n", mas_mn(mas), i,
piv, mas->min);
MAS_WARN_ON(mas, piv < mas->min);
}
if (piv > mas->max) {
- pr_err("%p[%u] %lu > %lu\n", mas_mn(mas), i,
+ pr_err(PTR_FMT "[%u] %lu > %lu\n", mas_mn(mas), i,
piv, mas->max);
MAS_WARN_ON(mas, piv > mas->max);
}
@@ -7509,7 +7510,7 @@ static void mas_validate_limits(struct ma_state *mas)
}
if (mas_data_end(mas) != i) {
- pr_err("node%p: data_end %u != the last slot offset %u\n",
+ pr_err("node" PTR_FMT ": data_end %u != the last slot offset %u\n",
mas_mn(mas), mas_data_end(mas), i);
MT_BUG_ON(mas->tree, 1);
}
@@ -7518,8 +7519,8 @@ static void mas_validate_limits(struct ma_state *mas)
void *entry = mas_slot(mas, slots, i);
if (entry && (i != mt_slots[type] - 1)) {
- pr_err("%p[%u] should not have entry %p\n", mas_mn(mas),
- i, entry);
+ pr_err(PTR_FMT "[%u] should not have entry " PTR_FMT "\n",
+ mas_mn(mas), i, entry);
MT_BUG_ON(mas->tree, entry != NULL);
}
@@ -7529,7 +7530,7 @@ static void mas_validate_limits(struct ma_state *mas)
if (!piv)
continue;
- pr_err("%p[%u] should not have piv %lu\n",
+ pr_err(PTR_FMT "[%u] should not have piv %lu\n",
mas_mn(mas), i, piv);
MAS_WARN_ON(mas, i < mt_pivots[type] - 1);
}
@@ -7554,7 +7555,7 @@ static void mt_validate_nulls(struct maple_tree *mt)
do {
entry = mas_slot(&mas, slots, offset);
if (!last && !entry) {
- pr_err("Sequential nulls end at %p[%u]\n",
+ pr_err("Sequential nulls end at " PTR_FMT "[%u]\n",
mas_mn(&mas), offset);
}
MT_BUG_ON(mt, !last && !entry);
@@ -7596,7 +7597,8 @@ void mt_validate(struct maple_tree *mt)
end = mas_data_end(&mas);
if (MAS_WARN_ON(&mas, (end < mt_min_slot_count(mas.node)) &&
(mas.max != ULONG_MAX))) {
- pr_err("Invalid size %u of %p\n", end, mas_mn(&mas));
+ pr_err("Invalid size %u of " PTR_FMT "\n",
+ end, mas_mn(&mas));
}
mas_validate_parent_slot(&mas);
@@ -7612,7 +7614,8 @@ EXPORT_SYMBOL_GPL(mt_validate);
void mas_dump(const struct ma_state *mas)
{
- pr_err("MAS: tree=%p enode=%p ", mas->tree, mas->node);
+ pr_err("MAS: tree=" PTR_FMT " enode=" PTR_FMT " ",
+ mas->tree, mas->node);
switch (mas->status) {
case ma_active:
pr_err("(ma_active)");
@@ -7676,7 +7679,7 @@ void mas_dump(const struct ma_state *mas)
pr_err("[%u/%u] index=%lx last=%lx\n", mas->offset, mas->end,
mas->index, mas->last);
- pr_err(" min=%lx max=%lx alloc=%p, depth=%u, flags=%x\n",
+ pr_err(" min=%lx max=%lx alloc=" PTR_FMT ", depth=%u, flags=%x\n",
mas->min, mas->max, mas->alloc, mas->depth, mas->mas_flags);
if (mas->index > mas->last)
pr_err("Check index & last\n");
@@ -7685,7 +7688,7 @@ EXPORT_SYMBOL_GPL(mas_dump);
void mas_wr_dump(const struct ma_wr_state *wr_mas)
{
- pr_err("WR_MAS: node=%p r_min=%lx r_max=%lx\n",
+ pr_err("WR_MAS: node=" PTR_FMT " r_min=%lx r_max=%lx\n",
wr_mas->node, wr_mas->r_min, wr_mas->r_max);
pr_err(" type=%u off_end=%u, node_end=%u, end_piv=%lx\n",
wr_mas->type, wr_mas->offset_end, wr_mas->mas->end,
diff --git a/lib/percpu_test.c b/lib/percpu_test.c
index 4a3d70bbc1a0..ce7124b16dab 100644
--- a/lib/percpu_test.c
+++ b/lib/percpu_test.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/limits.h>
#include <linux/module.h>
/* validate @native and @pcp counter values match @expected */
@@ -24,8 +25,9 @@ static int __init percpu_test_init(void)
* +ul_one/-ul_one below would replace with inc/dec instructions.
*/
volatile unsigned int ui_one = 1;
- long l = 0;
+ unsigned long long ull = 0;
unsigned long ul = 0;
+ long l = 0;
pr_info("percpu test start\n");
@@ -112,6 +114,13 @@ static int __init percpu_test_init(void)
CHECK(ul, ulong_counter, -1);
CHECK(ul, ulong_counter, ULONG_MAX);
+ ul = ull = 0;
+ __this_cpu_write(ulong_counter, 0);
+
+ ul = ull += UINT_MAX;
+ __this_cpu_add(ulong_counter, ull);
+ CHECK(ul, ulong_counter, UINT_MAX);
+
ul = 3;
__this_cpu_write(ulong_counter, 3);
diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c
index 989a12a67872..6dc234913dd5 100644
--- a/lib/strncpy_from_user.c
+++ b/lib/strncpy_from_user.c
@@ -120,6 +120,9 @@ long strncpy_from_user(char *dst, const char __user *src, long count)
if (unlikely(count <= 0))
return 0;
+ kasan_check_write(dst, count);
+ check_object_size(dst, count, false);
+
if (can_do_masked_user_access()) {
long retval;
@@ -142,8 +145,6 @@ long strncpy_from_user(char *dst, const char __user *src, long count)
if (max > count)
max = count;
- kasan_check_write(dst, count);
- check_object_size(dst, count, false);
if (user_read_access_begin(src, max)) {
retval = do_strncpy_from_user(dst, src, count, max);
user_read_access_end();
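With this move, the KASAN write check and the hardened-usercopy object-size check now cover the masked-user-access fast path as well, rather than only the fallback path below. A caller-side sketch of the API whose destination buffer these checks validate (an editorial example, not taken from the patch):

#include <linux/uaccess.h>

/* Copy a NUL-terminated string from userspace into a fixed-size buffer. */
static int copy_name_from_user(char *dst, size_t dst_size,
			       const char __user *src)
{
	long len = strncpy_from_user(dst, src, dst_size);

	if (len < 0)
		return len;			/* -EFAULT */
	if ((size_t)len >= dst_size)
		return -ENAMETOOLONG;		/* no terminating NUL copied */
	return 0;
}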
diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c
index 31561e0e1a0d..704cb1093ae8 100644
--- a/lib/test_maple_tree.c
+++ b/lib/test_maple_tree.c
@@ -1387,6 +1387,92 @@ static noinline void __init check_prev_entry(struct maple_tree *mt)
mas_unlock(&mas);
}
+static noinline void __init check_store_null(struct maple_tree *mt)
+{
+ MA_STATE(mas, mt, 0, ULONG_MAX);
+
+ /*
+ * Storing NULL at range [0, ULONG_MAX] in an empty tree should result
+ * in an empty tree
+ */
+ mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE);
+ mas_lock(&mas);
+ mas_store_gfp(&mas, NULL, GFP_KERNEL);
+ MT_BUG_ON(mt, !mtree_empty(mt));
+ mas_unlock(&mas);
+ mtree_destroy(mt);
+
+ /*
+ * Storing NULL at any range in an empty tree should result in an empty
+ * tree
+ */
+ mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE);
+ mas_lock(&mas);
+ mas_set_range(&mas, 3, 10);
+ mas_store_gfp(&mas, NULL, GFP_KERNEL);
+ MT_BUG_ON(mt, !mtree_empty(mt));
+ mas_unlock(&mas);
+ mtree_destroy(mt);
+
+ /*
+ * Storing NULL at range [0, ULONG_MAX] in a single entry tree should
+ * result in an empty tree
+ */
+ mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE);
+ mas_lock(&mas);
+ mas_set(&mas, 0);
+ mas_store_gfp(&mas, &mas, GFP_KERNEL);
+ mas_set_range(&mas, 0, ULONG_MAX);
+ mas_store_gfp(&mas, NULL, GFP_KERNEL);
+ MT_BUG_ON(mt, !mtree_empty(mt));
+ mas_unlock(&mas);
+ mtree_destroy(mt);
+
+ /*
+ * Storing NULL at range [0, n] in a single entry tree should
+ * result in an empty tree
+ */
+ mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE);
+ mas_lock(&mas);
+ mas_set(&mas, 0);
+ mas_store_gfp(&mas, &mas, GFP_KERNEL);
+ mas_set_range(&mas, 0, 5);
+ mas_store_gfp(&mas, NULL, GFP_KERNEL);
+ MT_BUG_ON(mt, !mtree_empty(mt));
+ mas_unlock(&mas);
+ mtree_destroy(mt);
+
+ /*
+ * Storing NULL at range [m, n] where m > 0 in a single entry tree
+ * should leave it a single entry tree
+ */
+ mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE);
+ mas_lock(&mas);
+ mas_set(&mas, 0);
+ mas_store_gfp(&mas, &mas, GFP_KERNEL);
+ mas_set_range(&mas, 2, 5);
+ mas_store_gfp(&mas, NULL, GFP_KERNEL);
+ MT_BUG_ON(mt, mtree_empty(mt));
+// MT_BUG_ON(mt, xa_is_node(mas_root(&mas)));
+ mas_unlock(&mas);
+ mtree_destroy(mt);
+
+ /*
+ * Storing NULL at range [0, ULONG_MAX] in a tree with nodes should
+ * result in an empty tree
+ */
+ mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE);
+ mas_lock(&mas);
+ mas_set_range(&mas, 1, 3);
+ mas_store_gfp(&mas, &mas, GFP_KERNEL);
+// MT_BUG_ON(mt, !xa_is_node(mas_root(&mas)));
+ mas_set_range(&mas, 0, ULONG_MAX);
+ mas_store_gfp(&mas, NULL, GFP_KERNEL);
+ MT_BUG_ON(mt, !mtree_empty(mt));
+ mas_unlock(&mas);
+ mtree_destroy(mt);
+}
+
static noinline void __init check_root_expand(struct maple_tree *mt)
{
MA_STATE(mas, mt, 0, 0);
@@ -3711,6 +3797,10 @@ static int __init maple_tree_seed(void)
#endif
mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE);
+ check_store_null(&tree);
+ mtree_destroy(&tree);
+
+ mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE);
check_root_expand(&tree);
mtree_destroy(&tree);
diff --git a/mm/bootmem_info.c b/mm/bootmem_info.c
index fa7cb0c87c03..95f288169a38 100644
--- a/mm/bootmem_info.c
+++ b/mm/bootmem_info.c
@@ -14,23 +14,24 @@
#include <linux/memory_hotplug.h>
#include <linux/kmemleak.h>
-void get_page_bootmem(unsigned long info, struct page *page, unsigned long type)
+void get_page_bootmem(unsigned long info, struct page *page,
+ enum bootmem_type type)
{
- page->index = type;
+ BUG_ON(type > 0xf);
+ BUG_ON(info > (ULONG_MAX >> 4));
SetPagePrivate(page);
- set_page_private(page, info);
+ set_page_private(page, info << 4 | type);
page_ref_inc(page);
}
void put_page_bootmem(struct page *page)
{
- unsigned long type = page->index;
+ enum bootmem_type type = bootmem_type(page);
BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
if (page_ref_dec_return(page) == 1) {
- page->index = 0;
ClearPagePrivate(page);
set_page_private(page, 0);
INIT_LIST_HEAD(&page->lru);
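The decode side lives in the bootmem_info header and is outside this hunk. Given the info << 4 | type packing above, it presumably looks like the sketch below (an assumption, shown only to make the encoding explicit; bootmem_type() is referenced by the code above, while bootmem_info() is a hypothetical name for the inverse accessor):

static inline enum bootmem_type bootmem_type(const struct page *page)
{
	return page_private(page) & 0xf;	/* low 4 bits hold the type */
}

static inline unsigned long bootmem_info(const struct page *page)
{
	return page_private(page) >> 4;		/* remaining bits hold the info */
}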
diff --git a/mm/cma.c b/mm/cma.c
index 2d9fae939283..de5bc0c81fc2 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -32,7 +32,7 @@
#include "cma.h"
struct cma cma_areas[MAX_CMA_AREAS];
-unsigned cma_area_count;
+unsigned int cma_area_count;
static DEFINE_MUTEX(cma_mutex);
phys_addr_t cma_get_base(const struct cma *cma)
@@ -135,7 +135,6 @@ out_error:
totalcma_pages -= cma->count;
cma->count = 0;
pr_err("CMA area %s could not be activated\n", cma->name);
- return;
}
static int __init cma_init_reserved_areas(void)
@@ -182,6 +181,15 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
if (!size || !memblock_is_region_reserved(base, size))
return -EINVAL;
+ /*
+ * CMA uses CMA_MIN_ALIGNMENT_BYTES as its alignment requirement, which
+ * needs pageblock_order to be initialized. Let's enforce it.
+ */
+ if (!pageblock_order) {
+ pr_err("pageblock_order not yet initialized. Called during early boot?\n");
+ return -EINVAL;
+ }
+
/* ensure minimal alignment required by mm core */
if (!IS_ALIGNED(base | size, CMA_MIN_ALIGNMENT_BYTES))
return -EINVAL;
diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig
index 35b72f88983a..d0357f3e9372 100644
--- a/mm/damon/Kconfig
+++ b/mm/damon/Kconfig
@@ -60,7 +60,7 @@ config DAMON_SYSFS
the interface for arbitrary data access monitoring.
config DAMON_SYSFS_KUNIT_TEST
- bool "Test for damon debugfs interface" if !KUNIT_ALL_TESTS
+ bool "Test for damon sysfs interface" if !KUNIT_ALL_TESTS
depends on DAMON_SYSFS && KUNIT=y
default KUNIT_ALL_TESTS
help
diff --git a/mm/damon/tests/dbgfs-kunit.h b/mm/damon/tests/dbgfs-kunit.h
index d2ecfcc8db86..087e53f641a8 100644
--- a/mm/damon/tests/dbgfs-kunit.h
+++ b/mm/damon/tests/dbgfs-kunit.h
@@ -168,6 +168,6 @@ static struct kunit_suite damon_test_suite = {
};
kunit_test_suite(damon_test_suite);
-#endif /* _DAMON_TEST_H */
+#endif /* _DAMON_DBGFS_TEST_H */
#endif /* CONFIG_DAMON_KUNIT_TEST */
diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h
index a339d117150f..b9fe3bc8472b 100644
--- a/mm/damon/tests/vaddr-kunit.h
+++ b/mm/damon/tests/vaddr-kunit.h
@@ -66,7 +66,7 @@ failed:
static void damon_test_three_regions_in_vmas(struct kunit *test)
{
static struct mm_struct mm;
- struct damon_addr_range regions[3] = {0,};
+ struct damon_addr_range regions[3] = {0};
/* 10-20-25, 200-210-220, 300-305, 307-330 */
struct vm_area_struct vmas[] = {
(struct vm_area_struct) {.vm_start = 10, .vm_end = 20},
@@ -300,6 +300,8 @@ static void damon_test_split_evenly(struct kunit *test)
damon_test_split_evenly_fail(test, 0, 100, 0);
damon_test_split_evenly_succ(test, 0, 100, 10);
damon_test_split_evenly_succ(test, 5, 59, 5);
+ damon_test_split_evenly_succ(test, 4, 6, 1);
+ damon_test_split_evenly_succ(test, 0, 3, 2);
damon_test_split_evenly_fail(test, 5, 6, 2);
}
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index 08cfd22b5249..b9eaa20b73b9 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -67,10 +67,14 @@ static int damon_va_evenly_split_region(struct damon_target *t,
unsigned long sz_orig, sz_piece, orig_end;
struct damon_region *n = NULL, *next;
unsigned long start;
+ unsigned int i;
if (!r || !nr_pieces)
return -EINVAL;
+ if (nr_pieces == 1)
+ return 0;
+
orig_end = r->ar.end;
sz_orig = damon_sz_region(r);
sz_piece = ALIGN_DOWN(sz_orig / nr_pieces, DAMON_MIN_REGION);
@@ -80,8 +84,7 @@ static int damon_va_evenly_split_region(struct damon_target *t,
r->ar.end = r->ar.start + sz_piece;
next = damon_next_region(r);
- for (start = r->ar.end; start + sz_piece <= orig_end;
- start += sz_piece) {
+ for (start = r->ar.end, i = 1; i < nr_pieces; start += sz_piece, i++) {
n = damon_new_region(start, start + sz_piece);
if (!n)
return -ENOMEM;
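A worked example of why the loop is now bounded by a piece counter rather than by start + sz_piece <= orig_end, assuming the arithmetic yields sz_piece = 1 (e.g. a minimum region size of 1, as in the kunit build; an assumption of this sketch):

/*
 * Splitting a region [0, 3) into nr_pieces = 2 gives sz_orig = 3 and
 * sz_piece = 1.  The old condition "start + sz_piece <= orig_end" would
 * run for start = 1 and start = 2, creating two extra regions and thus
 * three pieces in total.  The counter-based loop above stops after
 * i = 1, creating exactly one extra region and leaving the remainder to
 * the last piece (orig_end is saved above for that purpose).  The new
 * nr_pieces == 1 early return likewise covers the
 * damon_test_split_evenly_succ(test, 4, 6, 1) case added above.
 */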
@@ -353,11 +356,9 @@ static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm,
set_huge_pte_at(mm, addr, pte, entry, psize);
}
-#ifdef CONFIG_MMU_NOTIFIER
if (mmu_notifier_clear_young(mm, addr,
addr + huge_page_size(hstate_vma(vma))))
referenced = true;
-#endif /* CONFIG_MMU_NOTIFIER */
if (referenced)
folio_set_young(folio);
diff --git a/mm/execmem.c b/mm/execmem.c
index 0c4b36bc6d10..317b6a8d35be 100644
--- a/mm/execmem.c
+++ b/mm/execmem.c
@@ -6,28 +6,41 @@
* Copyright (C) 2024 Mike Rapoport IBM.
*/
+#define pr_fmt(fmt) "execmem: " fmt
+
#include <linux/mm.h>
+#include <linux/mutex.h>
#include <linux/vmalloc.h>
#include <linux/execmem.h>
+#include <linux/maple_tree.h>
+#include <linux/set_memory.h>
#include <linux/moduleloader.h>
+#include <linux/text-patching.h>
+
+#include <asm/tlbflush.h>
+
+#include "internal.h"
static struct execmem_info *execmem_info __ro_after_init;
static struct execmem_info default_execmem_info __ro_after_init;
-static void *__execmem_alloc(struct execmem_range *range, size_t size)
+#ifdef CONFIG_MMU
+static void *execmem_vmalloc(struct execmem_range *range, size_t size,
+ pgprot_t pgprot, unsigned long vm_flags)
{
bool kasan = range->flags & EXECMEM_KASAN_SHADOW;
- unsigned long vm_flags = VM_FLUSH_RESET_PERMS;
gfp_t gfp_flags = GFP_KERNEL | __GFP_NOWARN;
+ unsigned int align = range->alignment;
unsigned long start = range->start;
unsigned long end = range->end;
- unsigned int align = range->alignment;
- pgprot_t pgprot = range->pgprot;
void *p;
if (kasan)
vm_flags |= VM_DEFER_KMEMLEAK;
+ if (vm_flags & VM_ALLOW_HUGE_VMAP)
+ align = PMD_SIZE;
+
p = __vmalloc_node_range(size, align, start, end, gfp_flags,
pgprot, vm_flags, NUMA_NO_NODE,
__builtin_return_address(0));
@@ -40,7 +53,7 @@ static void *__execmem_alloc(struct execmem_range *range, size_t size)
}
if (!p) {
- pr_warn_ratelimited("execmem: unable to allocate memory\n");
+ pr_warn_ratelimited("unable to allocate memory\n");
return NULL;
}
@@ -49,14 +62,314 @@ static void *__execmem_alloc(struct execmem_range *range, size_t size)
return NULL;
}
- return kasan_reset_tag(p);
+ return p;
+}
+
+struct vm_struct *execmem_vmap(size_t size)
+{
+ struct execmem_range *range = &execmem_info->ranges[EXECMEM_MODULE_DATA];
+ struct vm_struct *area;
+
+ area = __get_vm_area_node(size, range->alignment, PAGE_SHIFT, VM_ALLOC,
+ range->start, range->end, NUMA_NO_NODE,
+ GFP_KERNEL, __builtin_return_address(0));
+ if (!area && range->fallback_start)
+ area = __get_vm_area_node(size, range->alignment, PAGE_SHIFT, VM_ALLOC,
+ range->fallback_start, range->fallback_end,
+ NUMA_NO_NODE, GFP_KERNEL, __builtin_return_address(0));
+
+ return area;
+}
+#else
+static void *execmem_vmalloc(struct execmem_range *range, size_t size,
+ pgprot_t pgprot, unsigned long vm_flags)
+{
+ return vmalloc(size);
+}
+#endif /* CONFIG_MMU */
+
+#ifdef CONFIG_ARCH_HAS_EXECMEM_ROX
+struct execmem_cache {
+ struct mutex mutex;
+ struct maple_tree busy_areas;
+ struct maple_tree free_areas;
+};
+
+static struct execmem_cache execmem_cache = {
+ .mutex = __MUTEX_INITIALIZER(execmem_cache.mutex),
+ .busy_areas = MTREE_INIT_EXT(busy_areas, MT_FLAGS_LOCK_EXTERN,
+ execmem_cache.mutex),
+ .free_areas = MTREE_INIT_EXT(free_areas, MT_FLAGS_LOCK_EXTERN,
+ execmem_cache.mutex),
+};
+
+static inline unsigned long mas_range_len(struct ma_state *mas)
+{
+ return mas->last - mas->index + 1;
+}
+
+static int execmem_set_direct_map_valid(struct vm_struct *vm, bool valid)
+{
+ unsigned int nr = (1 << get_vm_area_page_order(vm));
+ unsigned int updated = 0;
+ int err = 0;
+
+ for (int i = 0; i < vm->nr_pages; i += nr) {
+ err = set_direct_map_valid_noflush(vm->pages[i], nr, valid);
+ if (err)
+ goto err_restore;
+ updated += nr;
+ }
+
+ return 0;
+
+err_restore:
+ for (int i = 0; i < updated; i += nr)
+ set_direct_map_valid_noflush(vm->pages[i], nr, !valid);
+
+ return err;
+}
+
+static void execmem_cache_clean(struct work_struct *work)
+{
+ struct maple_tree *free_areas = &execmem_cache.free_areas;
+ struct mutex *mutex = &execmem_cache.mutex;
+ MA_STATE(mas, free_areas, 0, ULONG_MAX);
+ void *area;
+
+ mutex_lock(mutex);
+ mas_for_each(&mas, area, ULONG_MAX) {
+ size_t size = mas_range_len(&mas);
+
+ if (IS_ALIGNED(size, PMD_SIZE) &&
+ IS_ALIGNED(mas.index, PMD_SIZE)) {
+ struct vm_struct *vm = find_vm_area(area);
+
+ execmem_set_direct_map_valid(vm, true);
+ mas_store_gfp(&mas, NULL, GFP_KERNEL);
+ vfree(area);
+ }
+ }
+ mutex_unlock(mutex);
+}
+
+static DECLARE_WORK(execmem_cache_clean_work, execmem_cache_clean);
+
+static int execmem_cache_add(void *ptr, size_t size)
+{
+ struct maple_tree *free_areas = &execmem_cache.free_areas;
+ struct mutex *mutex = &execmem_cache.mutex;
+ unsigned long addr = (unsigned long)ptr;
+ MA_STATE(mas, free_areas, addr - 1, addr + 1);
+ unsigned long lower, upper;
+ void *area = NULL;
+ int err;
+
+ lower = addr;
+ upper = addr + size - 1;
+
+ mutex_lock(mutex);
+ area = mas_walk(&mas);
+ if (area && mas.last == addr - 1)
+ lower = mas.index;
+
+ area = mas_next(&mas, ULONG_MAX);
+ if (area && mas.index == addr + size)
+ upper = mas.last;
+
+ mas_set_range(&mas, lower, upper);
+ err = mas_store_gfp(&mas, (void *)lower, GFP_KERNEL);
+ mutex_unlock(mutex);
+ if (err)
+ return err;
+
+ return 0;
+}
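A worked example of the neighbour merging above (illustrative addresses only):

/*
 * Suppose free_areas already holds [0x1000, 0x1fff] and [0x4000, 0x4fff]
 * and execmem_cache_add() is called with ptr = 0x2000, size = 0x2000,
 * i.e. the range [0x2000, 0x3fff].  The walk finds the left neighbour
 * ending at addr - 1, so lower becomes 0x1000; the next area starts at
 * addr + size, so upper becomes 0x4fff.  The final store then replaces
 * all three pieces with a single free range [0x1000, 0x4fff].
 */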
+
+static bool within_range(struct execmem_range *range, struct ma_state *mas,
+ size_t size)
+{
+ unsigned long addr = mas->index;
+
+ if (addr >= range->start && addr + size < range->end)
+ return true;
+
+ if (range->fallback_start &&
+ addr >= range->fallback_start && addr + size < range->fallback_end)
+ return true;
+
+ return false;
+}
+
+static void *__execmem_cache_alloc(struct execmem_range *range, size_t size)
+{
+ struct maple_tree *free_areas = &execmem_cache.free_areas;
+ struct maple_tree *busy_areas = &execmem_cache.busy_areas;
+ MA_STATE(mas_free, free_areas, 0, ULONG_MAX);
+ MA_STATE(mas_busy, busy_areas, 0, ULONG_MAX);
+ struct mutex *mutex = &execmem_cache.mutex;
+ unsigned long addr, last, area_size = 0;
+ void *area, *ptr = NULL;
+ int err;
+
+ mutex_lock(mutex);
+ mas_for_each(&mas_free, area, ULONG_MAX) {
+ area_size = mas_range_len(&mas_free);
+
+ if (area_size >= size && within_range(range, &mas_free, size))
+ break;
+ }
+
+ if (area_size < size)
+ goto out_unlock;
+
+ addr = mas_free.index;
+ last = mas_free.last;
+
+ /* insert the allocated range [addr, addr + size) into busy_areas */
+ mas_set_range(&mas_busy, addr, addr + size - 1);
+ err = mas_store_gfp(&mas_busy, (void *)addr, GFP_KERNEL);
+ if (err)
+ goto out_unlock;
+
+ mas_store_gfp(&mas_free, NULL, GFP_KERNEL);
+ if (area_size > size) {
+ void *ptr = (void *)(addr + size);
+
+ /*
+ * re-insert the remaining free range [addr + size, last]
+ * into free_areas
+ */
+ mas_set_range(&mas_free, addr + size, last);
+ err = mas_store_gfp(&mas_free, ptr, GFP_KERNEL);
+ if (err) {
+ mas_store_gfp(&mas_busy, NULL, GFP_KERNEL);
+ goto out_unlock;
+ }
+ }
+ ptr = (void *)addr;
+
+out_unlock:
+ mutex_unlock(mutex);
+ return ptr;
+}
+
+static int execmem_cache_populate(struct execmem_range *range, size_t size)
+{
+ unsigned long vm_flags = VM_ALLOW_HUGE_VMAP;
+ unsigned long start, end;
+ struct vm_struct *vm;
+ size_t alloc_size;
+ int err = -ENOMEM;
+ void *p;
+
+ alloc_size = round_up(size, PMD_SIZE);
+ p = execmem_vmalloc(range, alloc_size, PAGE_KERNEL, vm_flags);
+ if (!p)
+ return err;
+
+ vm = find_vm_area(p);
+ if (!vm)
+ goto err_free_mem;
+
+ /* fill memory with instructions that will trap */
+ execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true);
+
+ start = (unsigned long)p;
+ end = start + alloc_size;
+
+ vunmap_range(start, end);
+
+ err = execmem_set_direct_map_valid(vm, false);
+ if (err)
+ goto err_free_mem;
+
+ err = vmap_pages_range_noflush(start, end, range->pgprot, vm->pages,
+ PMD_SHIFT);
+ if (err)
+ goto err_free_mem;
+
+ err = execmem_cache_add(p, alloc_size);
+ if (err)
+ goto err_free_mem;
+
+ return 0;
+
+err_free_mem:
+ vfree(p);
+ return err;
+}
+
+static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
+{
+ void *p;
+ int err;
+
+ p = __execmem_cache_alloc(range, size);
+ if (p)
+ return p;
+
+ err = execmem_cache_populate(range, size);
+ if (err)
+ return NULL;
+
+ return __execmem_cache_alloc(range, size);
+}
+
+static bool execmem_cache_free(void *ptr)
+{
+ struct maple_tree *busy_areas = &execmem_cache.busy_areas;
+ struct mutex *mutex = &execmem_cache.mutex;
+ unsigned long addr = (unsigned long)ptr;
+ MA_STATE(mas, busy_areas, addr, addr);
+ size_t size;
+ void *area;
+
+ mutex_lock(mutex);
+ area = mas_walk(&mas);
+ if (!area) {
+ mutex_unlock(mutex);
+ return false;
+ }
+ size = mas_range_len(&mas);
+
+ mas_store_gfp(&mas, NULL, GFP_KERNEL);
+ mutex_unlock(mutex);
+
+ execmem_fill_trapping_insns(ptr, size, /* writable = */ false);
+
+ execmem_cache_add(ptr, size);
+
+ schedule_work(&execmem_cache_clean_work);
+
+ return true;
+}
+#else /* CONFIG_ARCH_HAS_EXECMEM_ROX */
+static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
+{
+ return NULL;
+}
+
+static bool execmem_cache_free(void *ptr)
+{
+ return false;
}
+#endif /* CONFIG_ARCH_HAS_EXECMEM_ROX */
void *execmem_alloc(enum execmem_type type, size_t size)
{
struct execmem_range *range = &execmem_info->ranges[type];
+ bool use_cache = range->flags & EXECMEM_ROX_CACHE;
+ unsigned long vm_flags = VM_FLUSH_RESET_PERMS;
+ pgprot_t pgprot = range->pgprot;
+ void *p;
- return __execmem_alloc(range, size);
+ if (use_cache)
+ p = execmem_cache_alloc(range, size);
+ else
+ p = execmem_vmalloc(range, size, pgprot, vm_flags);
+
+ return kasan_reset_tag(p);
}
void execmem_free(void *ptr)
@@ -66,7 +379,19 @@ void execmem_free(void *ptr)
* supported by vmalloc.
*/
WARN_ON(in_interrupt());
- vfree(ptr);
+
+ if (!execmem_cache_free(ptr))
+ vfree(ptr);
+}
+
+void *execmem_update_copy(void *dst, const void *src, size_t size)
+{
+ return text_poke_copy(dst, src, size);
+}
+
+bool execmem_is_rox(enum execmem_type type)
+{
+ return !!(execmem_info->ranges[type].flags & EXECMEM_ROX_CACHE);
}
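A caller-side sketch of how these pieces are expected to combine (an assumed usage pattern, not code from this patch): when a type is backed by the ROX cache, the returned memory is already read-only and executable, so its contents must be updated through execmem_update_copy() rather than written directly.

#include <linux/execmem.h>
#include <linux/string.h>

static void *load_insns(const void *insns, size_t size)
{
	void *p = execmem_alloc(EXECMEM_MODULE_TEXT, size);

	if (!p)
		return NULL;

	if (execmem_is_rox(EXECMEM_MODULE_TEXT)) {
		/* ROX cache: memory is not writable, go through text patching */
		if (!execmem_update_copy(p, insns, size)) {
			execmem_free(p);
			return NULL;
		}
	} else {
		memcpy(p, insns, size);
	}

	return p;
}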
static bool execmem_validate(struct execmem_info *info)
@@ -78,6 +403,17 @@ static bool execmem_validate(struct execmem_info *info)
return false;
}
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_EXECMEM_ROX)) {
+ for (int i = EXECMEM_DEFAULT; i < EXECMEM_TYPE_MAX; i++) {
+ r = &info->ranges[i];
+
+ if (r->flags & EXECMEM_ROX_CACHE) {
+ pr_warn_once("ROX cache is not supported\n");
+ r->flags &= ~EXECMEM_ROX_CACHE;
+ }
+ }
+ }
+
return true;
}
diff --git a/mm/filemap.c b/mm/filemap.c
index 196779e8e396..7c76a123ba18 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -119,7 +119,6 @@
* ->i_pages lock (folio_remove_rmap_pte->set_page_dirty)
* bdi.wb->list_lock (folio_remove_rmap_pte->set_page_dirty)
* ->inode->i_lock (folio_remove_rmap_pte->set_page_dirty)
- * ->memcg->move_lock (folio_remove_rmap_pte->folio_memcg_lock)
* bdi.wb->list_lock (zap_pte_range->set_page_dirty)
* ->inode->i_lock (zap_pte_range->set_page_dirty)
* ->private_lock (zap_pte_range->block_dirty_folio)
@@ -3260,8 +3259,8 @@ static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)
if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
return 0;
- ptep = pte_offset_map_nolock(vma->vm_mm, vmf->pmd, vmf->address,
- &vmf->ptl);
+ ptep = pte_offset_map_ro_nolock(vma->vm_mm, vmf->pmd, vmf->address,
+ &vmf->ptl);
if (unlikely(!ptep))
return VM_FAULT_NOPAGE;
diff --git a/mm/gup.c b/mm/gup.c
index 2b84eead9baa..746070a1d8bf 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -922,14 +922,14 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
}
if (flags & FOLL_TOUCH) {
if ((flags & FOLL_WRITE) &&
- !pte_dirty(pte) && !PageDirty(page))
- set_page_dirty(page);
+ !pte_dirty(pte) && !folio_test_dirty(folio))
+ folio_mark_dirty(folio);
/*
* pte_mkyoung() would be more correct here, but atomic care
* is needed to avoid losing the dirty bit: it is easier to use
- * mark_page_accessed().
+ * folio_mark_accessed().
*/
- mark_page_accessed(page);
+ folio_mark_accessed(folio);
}
out:
pte_unmap_unlock(ptep, ptl);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 5734d5d5060f..ee335d96fc39 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -8,7 +8,6 @@
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
-#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
@@ -84,6 +83,21 @@ unsigned long huge_anon_orders_madvise __read_mostly;
unsigned long huge_anon_orders_inherit __read_mostly;
static bool anon_orders_configured __initdata;
+static inline bool file_thp_enabled(struct vm_area_struct *vma)
+{
+ struct inode *inode;
+
+ if (!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS))
+ return false;
+
+ if (!vma->vm_file)
+ return false;
+
+ inode = file_inode(vma->vm_file);
+
+ return !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode);
+}
+
unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
unsigned long vm_flags,
unsigned long tva_flags,
@@ -601,6 +615,8 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
+DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT);
+DEFINE_MTHP_STAT_ATTR(swpin, MTHP_STAT_SWPIN);
DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT);
DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK);
#ifdef CONFIG_SHMEM
@@ -619,6 +635,8 @@ static struct attribute *anon_stats_attrs[] = {
&anon_fault_fallback_attr.attr,
&anon_fault_fallback_charge_attr.attr,
#ifndef CONFIG_SHMEM
+ &zswpout_attr.attr,
+ &swpin_attr.attr,
&swpout_attr.attr,
&swpout_fallback_attr.attr,
#endif
@@ -649,6 +667,8 @@ static struct attribute_group file_stats_attr_grp = {
static struct attribute *any_stats_attrs[] = {
#ifdef CONFIG_SHMEM
+ &zswpout_attr.attr,
+ &swpin_attr.attr,
&swpout_attr.attr,
&swpout_fallback_attr.attr,
#endif
@@ -938,26 +958,6 @@ out:
}
__setup("transparent_hugepage=", setup_transparent_hugepage);
-static inline int get_order_from_str(const char *size_str)
-{
- unsigned long size;
- char *endptr;
- int order;
-
- size = memparse(size_str, &endptr);
-
- if (!is_power_of_2(size))
- goto err;
- order = get_order(size);
- if (BIT(order) & ~THP_ORDERS_ALL_ANON)
- goto err;
-
- return order;
-err:
- pr_err("invalid size %s in thp_anon boot parameter\n", size_str);
- return -EINVAL;
-}
-
static char str_dup[PAGE_SIZE] __initdata;
static int __init setup_thp_anon(char *str)
{
@@ -969,7 +969,7 @@ static int __init setup_thp_anon(char *str)
if (!str || strlen(str) + 1 > PAGE_SIZE)
goto err;
- strcpy(str_dup, str);
+ strscpy(str_dup, str);
always = huge_anon_orders_always;
madvise = huge_anon_orders_madvise;
@@ -987,10 +987,22 @@ static int __init setup_thp_anon(char *str)
start_size = strsep(&subtoken, "-");
end_size = subtoken;
- start = get_order_from_str(start_size);
- end = get_order_from_str(end_size);
+ start = get_order_from_str(start_size, THP_ORDERS_ALL_ANON);
+ end = get_order_from_str(end_size, THP_ORDERS_ALL_ANON);
} else {
- start = end = get_order_from_str(subtoken);
+ start_size = end_size = subtoken;
+ start = end = get_order_from_str(subtoken,
+ THP_ORDERS_ALL_ANON);
+ }
+
+ if (start == -EINVAL) {
+ pr_err("invalid size %s in thp_anon boot parameter\n", start_size);
+ goto err;
+ }
+
+ if (end == -EINVAL) {
+ pr_err("invalid size %s in thp_anon boot parameter\n", end_size);
+ goto err;
}
if (start < 0 || end < 0 || start > end)
@@ -1137,47 +1149,87 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
-static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
- struct page *page, gfp_t gfp)
+static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma,
+ unsigned long addr)
{
- struct vm_area_struct *vma = vmf->vma;
- struct folio *folio = page_folio(page);
- pgtable_t pgtable;
- unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
- vm_fault_t ret = 0;
+ gfp_t gfp = vma_thp_gfp_mask(vma);
+ const int order = HPAGE_PMD_ORDER;
+ struct folio *folio;
- VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
+ folio = vma_alloc_folio(gfp, order, vma, addr & HPAGE_PMD_MASK);
+ if (unlikely(!folio)) {
+ count_vm_event(THP_FAULT_FALLBACK);
+ count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
+ return NULL;
+ }
+
+ VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
folio_put(folio);
count_vm_event(THP_FAULT_FALLBACK);
count_vm_event(THP_FAULT_FALLBACK_CHARGE);
- count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
- count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
- return VM_FAULT_FALLBACK;
+ count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
+ count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
+ return NULL;
}
folio_throttle_swaprate(folio, gfp);
- pgtable = pte_alloc_one(vma->vm_mm);
- if (unlikely(!pgtable)) {
- ret = VM_FAULT_OOM;
- goto release;
- }
-
- folio_zero_user(folio, vmf->address);
+ /*
+ * When a folio is not zeroed during allocation (__GFP_ZERO not used),
+ * folio_zero_user() is used to make sure that the page corresponding
+ * to the faulting address will be hot in the cache after zeroing.
+ */
+ if (!alloc_zeroed())
+ folio_zero_user(folio, addr);
/*
* The memory barrier inside __folio_mark_uptodate makes sure that
* folio_zero_user writes become visible before the set_pmd_at()
* write.
*/
__folio_mark_uptodate(folio);
+ return folio;
+}
+
+static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd,
+ struct vm_area_struct *vma, unsigned long haddr)
+{
+ pmd_t entry;
+
+ entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
+ folio_add_lru_vma(folio, vma);
+ set_pmd_at(vma->vm_mm, haddr, pmd, entry);
+ update_mmu_cache_pmd(vma, haddr, pmd);
+ add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ count_vm_event(THP_FAULT_ALLOC);
+ count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
+ count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
+}
+
+static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
+{
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+ struct vm_area_struct *vma = vmf->vma;
+ struct folio *folio;
+ pgtable_t pgtable;
+ vm_fault_t ret = 0;
+
+ folio = vma_alloc_anon_folio_pmd(vma, vmf->address);
+ if (unlikely(!folio))
+ return VM_FAULT_FALLBACK;
+
+ pgtable = pte_alloc_one(vma->vm_mm);
+ if (unlikely(!pgtable)) {
+ ret = VM_FAULT_OOM;
+ goto release;
+ }
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
if (unlikely(!pmd_none(*vmf->pmd))) {
goto unlock_release;
} else {
- pmd_t entry;
-
ret = check_stable_address_space(vma->vm_mm);
if (ret)
goto unlock_release;
@@ -1191,21 +1243,11 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
VM_BUG_ON(ret & VM_FAULT_FALLBACK);
return ret;
}
-
- entry = mk_huge_pmd(page, vma->vm_page_prot);
- entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
- folio_add_lru_vma(folio, vma);
pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
- set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
- update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
- add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ map_anon_folio_pmd(folio, vmf->pmd, vma, haddr);
mm_inc_nr_ptes(vma->vm_mm);
deferred_split_folio(folio, false);
spin_unlock(vmf->ptl);
- count_vm_event(THP_FAULT_ALLOC);
- count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
- count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
}
return 0;
@@ -1272,8 +1314,6 @@ static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
- gfp_t gfp;
- struct folio *folio;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
vm_fault_t ret;
@@ -1324,14 +1364,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
}
return ret;
}
- gfp = vma_thp_gfp_mask(vma);
- folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
- if (unlikely(!folio)) {
- count_vm_event(THP_FAULT_FALLBACK);
- count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
- return VM_FAULT_FALLBACK;
- }
- return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
+
+ return __do_huge_pmd_anonymous_page(vmf);
}
static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
@@ -1763,6 +1797,38 @@ unlock:
spin_unlock(vmf->ptl);
}
+static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf)
+{
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+ struct vm_area_struct *vma = vmf->vma;
+ struct mmu_notifier_range range;
+ struct folio *folio;
+ vm_fault_t ret = 0;
+
+ folio = vma_alloc_anon_folio_pmd(vma, vmf->address);
+ if (unlikely(!folio))
+ return VM_FAULT_FALLBACK;
+
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr,
+ haddr + HPAGE_PMD_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd)))
+ goto release;
+ ret = check_stable_address_space(vma->vm_mm);
+ if (ret)
+ goto release;
+ (void)pmdp_huge_clear_flush(vma, haddr, vmf->pmd);
+ map_anon_folio_pmd(folio, vmf->pmd, vma, haddr);
+ goto unlock;
+release:
+ folio_put(folio);
+unlock:
+ spin_unlock(vmf->ptl);
+ mmu_notifier_invalidate_range_end(&range);
+ return ret;
+}
+
vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
{
const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
@@ -1775,8 +1841,15 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
VM_BUG_ON_VMA(!vma->anon_vma, vma);
- if (is_huge_zero_pmd(orig_pmd))
+ if (is_huge_zero_pmd(orig_pmd)) {
+ vm_fault_t ret = do_huge_zero_wp_pmd(vmf);
+
+ if (!(ret & VM_FAULT_FALLBACK))
+ return ret;
+
+ /* Fall back to splitting the PMD if a THP cannot be allocated */
goto fallback;
+ }
spin_lock(vmf->ptl);
@@ -3124,8 +3197,8 @@ static void __split_huge_page_tail(struct folio *folio, int tail,
/* ->mapping in first and second tail page is replaced by other uses */
VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
page_tail);
- page_tail->mapping = head->mapping;
- page_tail->index = head->index + tail;
+ new_folio->mapping = folio->mapping;
+ new_folio->index = folio->index + tail;
/*
* page->private should not be set in tail pages. Fix up and warn once
@@ -3201,11 +3274,11 @@ static void __split_huge_page(struct page *page, struct list_head *list,
ClearPageHasHWPoisoned(head);
for (i = nr - new_nr; i >= new_nr; i -= new_nr) {
+ struct folio *tail;
__split_huge_page_tail(folio, i, lruvec, list, new_order);
+ tail = page_folio(head + i);
/* Some pages can be beyond EOF: drop them from page cache */
- if (head[i].index >= end) {
- struct folio *tail = page_folio(head + i);
-
+ if (tail->index >= end) {
if (shmem_mapping(folio->mapping))
nr_dropped++;
else if (folio_test_clear_dirty(tail))
@@ -3213,12 +3286,12 @@ static void __split_huge_page(struct page *page, struct list_head *list,
inode_to_wb(folio->mapping->host));
__filemap_remove_folio(tail, NULL);
folio_put(tail);
- } else if (!PageAnon(page)) {
- __xa_store(&folio->mapping->i_pages, head[i].index,
- head + i, 0);
+ } else if (!folio_test_anon(folio)) {
+ __xa_store(&folio->mapping->i_pages, tail->index,
+ tail, 0);
} else if (swap_cache) {
__xa_store(&swap_cache->i_pages, offset + i,
- head + i, 0);
+ tail, 0);
}
}
@@ -4096,7 +4169,7 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
tok = strsep(&buf, ",");
if (tok) {
- strcpy(file_path, tok);
+ strscpy(file_path, tok);
} else {
ret = -EINVAL;
goto out;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 190fa05635f4..ea2ed8e301ef 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1925,6 +1925,7 @@ void free_huge_folio(struct folio *folio)
pages_per_huge_page(h), folio);
hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
pages_per_huge_page(h), folio);
+ lruvec_stat_mod_folio(folio, NR_HUGETLB, -pages_per_huge_page(h));
mem_cgroup_uncharge(folio);
if (restore_reserve)
h->resv_huge_pages++;
@@ -3093,6 +3094,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
if (!memcg_charge_ret)
mem_cgroup_commit_charge(folio, memcg);
+ lruvec_stat_mod_folio(folio, NR_HUGETLB, pages_per_huge_page(h));
mem_cgroup_put(memcg);
return folio;
@@ -3301,6 +3303,7 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
{
unsigned long i;
char buf[32];
+ LIST_HEAD(folio_list);
for (i = 0; i < h->max_huge_pages_node[nid]; ++i) {
if (hstate_is_gigantic(h)) {
@@ -3310,14 +3313,18 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
struct folio *folio;
gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
- folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid,
- &node_states[N_MEMORY]);
+ folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid,
+ &node_states[N_MEMORY], NULL);
if (!folio)
break;
- free_huge_folio(folio); /* free it into the hugepage allocator */
+ list_add(&folio->lru, &folio_list);
}
cond_resched();
}
+
+ if (!list_empty(&folio_list))
+ prep_and_add_allocated_folios(h, &folio_list);
+
if (i == h->max_huge_pages_node[nid])
return;
@@ -6348,6 +6355,10 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
ret = VM_FAULT_HWPOISON_LARGE |
VM_FAULT_SET_HINDEX(hstate_index(h));
goto out_mutex;
+ } else if (WARN_ON_ONCE(marker & PTE_MARKER_GUARD)) {
+ /* This isn't supported in hugetlb. */
+ ret = VM_FAULT_SIGSEGV;
+ goto out_mutex;
}
}
diff --git a/mm/internal.h b/mm/internal.h
index 64c2eb0b160e..5a7302baeed7 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -12,6 +12,7 @@
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/pagemap.h>
+#include <linux/pagewalk.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
@@ -841,7 +842,7 @@ static inline bool free_area_empty(struct free_area *area, int migratetype)
}
/* mm/util.c */
-struct anon_vma *folio_anon_vma(struct folio *folio);
+struct anon_vma *folio_anon_vma(const struct folio *folio);
#ifdef CONFIG_MMU
void unmap_mapping_folio(struct folio *folio);
@@ -959,7 +960,7 @@ extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
* If any page in this range is mapped by this VMA, return the first address
* where any of these pages appear. Otherwise, return -EFAULT.
*/
-static inline unsigned long vma_address(struct vm_area_struct *vma,
+static inline unsigned long vma_address(const struct vm_area_struct *vma,
pgoff_t pgoff, unsigned long nr_pages)
{
unsigned long address;
@@ -1117,10 +1118,11 @@ void ClearPageHWPoisonTakenOff(struct page *page);
bool take_page_off_buddy(struct page *page);
bool put_page_back_buddy(struct page *page);
struct task_struct *task_early_kill(struct task_struct *tsk, int force_early);
-void add_to_kill_ksm(struct task_struct *tsk, struct page *p,
+void add_to_kill_ksm(struct task_struct *tsk, const struct page *p,
struct vm_area_struct *vma, struct list_head *to_kill,
unsigned long ksm_addr);
-unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
+unsigned long page_mapped_in_vma(const struct page *page,
+ struct vm_area_struct *vma);
#else
static inline void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
@@ -1234,6 +1236,7 @@ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
void __init vmalloc_init(void);
int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end,
pgprot_t prot, struct page **pages, unsigned int page_shift);
+unsigned int get_vm_area_page_order(struct vm_struct *vm);
#else
static inline void vmalloc_init(void)
{
@@ -1262,6 +1265,12 @@ int numa_migrate_check(struct folio *folio, struct vm_fault *vmf,
void free_zone_device_folio(struct folio *folio);
int migrate_device_coherent_folio(struct folio *folio);
+struct vm_struct *__get_vm_area_node(unsigned long size,
+ unsigned long align, unsigned long shift,
+ unsigned long flags, unsigned long start,
+ unsigned long end, int node, gfp_t gfp_mask,
+ const void *caller);
+
/*
* mm/gup.c
*/
@@ -1276,6 +1285,34 @@ void touch_pud(struct vm_area_struct *vma, unsigned long addr,
void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd, bool write);
+static inline bool alloc_zeroed(void)
+{
+ return static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON,
+ &init_on_alloc);
+}
+
+/*
+ * Parses a string with mem suffixes into its order. Useful to parse kernel
+ * parameters.
+ */
+static inline int get_order_from_str(const char *size_str,
+ unsigned long valid_orders)
+{
+ unsigned long size;
+ char *endptr;
+ int order;
+
+ size = memparse(size_str, &endptr);
+
+ if (!is_power_of_2(size))
+ return -EINVAL;
+ order = get_order(size);
+ if (BIT(order) & ~valid_orders)
+ return -EINVAL;
+
+ return order;
+}
+
enum {
/* mark page accessed */
FOLL_TOUCH = 1 << 16,
@@ -1356,7 +1393,7 @@ static inline bool gup_must_unshare(struct vm_area_struct *vma,
smp_rmb();
/*
- * Note that PageKsm() pages cannot be exclusive, and consequently,
+ * Note that KSM pages cannot be exclusive, and consequently,
* cannot get pinned.
*/
return !PageAnonExclusive(page);
@@ -1488,4 +1525,9 @@ static inline void accept_page(struct page *page)
}
#endif /* CONFIG_UNACCEPTED_MEMORY */
+/* pagewalk.c */
+int walk_page_range_mm(struct mm_struct *mm, unsigned long start,
+ unsigned long end, const struct mm_walk_ops *ops,
+ void *private);
+
#endif /* __MM_INTERNAL_H */
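get_order_from_str() above converts a size string from a kernel parameter into a folio order via memparse(), rejecting sizes that are not a power of two or whose order is outside the caller-supplied mask. A hedged sketch of a caller; the parameter name and the order mask are assumptions:

#include <linux/bits.h>
#include <linux/init.h>
#include <linux/printk.h>
#include "internal.h"	/* get_order_from_str() is mm-internal */

static int __init setup_example_size(char *str)
{
	/* With 4K pages, "64K" yields order 4 and "2M" yields order 9. */
	int order = get_order_from_str(str, BIT(4) | BIT(9));

	if (order < 0) {
		pr_warn("example_size: unsupported size '%s'\n", str);
		return -EINVAL;
	}

	pr_info("example_size: using order %d\n", order);
	return 0;
}
early_param("example_size", setup_example_size);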
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index b88543e5c0cc..1a958e7c8a46 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -46,7 +46,6 @@ endif
CFLAGS_kasan_test_c.o := $(CFLAGS_KASAN_TEST)
RUSTFLAGS_kasan_test_rust.o := $(RUSTFLAGS_KASAN)
-CFLAGS_kasan_test_module.o := $(CFLAGS_KASAN_TEST)
obj-y := common.o report.o
obj-$(CONFIG_KASAN_GENERIC) += init.o generic.o report_generic.o shadow.o quarantine.o
@@ -59,4 +58,3 @@ ifdef CONFIG_RUST
endif
obj-$(CONFIG_KASAN_KUNIT_TEST) += kasan_test.o
-obj-$(CONFIG_KASAN_MODULE_TEST) += kasan_test_module.o
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c
index 9958ebc15d38..ccd66c7a4081 100644
--- a/mm/kasan/hw_tags.c
+++ b/mm/kasan/hw_tags.c
@@ -8,6 +8,7 @@
#define pr_fmt(fmt) "kasan: " fmt
+#include <kunit/visibility.h>
#include <linux/init.h>
#include <linux/kasan.h>
#include <linux/kernel.h>
@@ -394,12 +395,12 @@ void kasan_enable_hw_tags(void)
#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
-EXPORT_SYMBOL_GPL(kasan_enable_hw_tags);
+EXPORT_SYMBOL_IF_KUNIT(kasan_enable_hw_tags);
-void kasan_force_async_fault(void)
+VISIBLE_IF_KUNIT void kasan_force_async_fault(void)
{
hw_force_async_tag_fault();
}
-EXPORT_SYMBOL_GPL(kasan_force_async_fault);
+EXPORT_SYMBOL_IF_KUNIT(kasan_force_async_fault);
#endif
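The hw_tags.c hunk above adopts the <kunit/visibility.h> helpers: a symbol stays static in normal builds but becomes visible and is exported into a KUnit-only symbol namespace when the tests are enabled, and the test module imports that namespace. A condensed sketch of the pattern using a hypothetical helper name:

#include <kunit/visibility.h>

/* In a shared header, a hypothetical helper: */
#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
void example_force_fault(void);
#endif

/* In the implementation file: 'static' unless KUnit is built in. */
VISIBLE_IF_KUNIT void example_force_fault(void)
{
	hw_force_async_tag_fault();
}
EXPORT_SYMBOL_IF_KUNIT(example_force_fault);

/* In the KUnit test module, import the KUnit-only namespace: */
MODULE_IMPORT_NS(EXPORTED_FOR_KUNIT_TESTING);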
diff --git a/mm/kasan/init.c b/mm/kasan/init.c
index ac607c306292..ced6b29fcf76 100644
--- a/mm/kasan/init.c
+++ b/mm/kasan/init.c
@@ -106,10 +106,6 @@ static void __ref zero_pte_populate(pmd_t *pmd, unsigned long addr,
}
}
-void __weak __meminit kernel_pte_init(void *addr)
-{
-}
-
static int __ref zero_pmd_populate(pud_t *pud, unsigned long addr,
unsigned long end)
{
@@ -145,10 +141,6 @@ static int __ref zero_pmd_populate(pud_t *pud, unsigned long addr,
return 0;
}
-void __weak __meminit pmd_init(void *addr)
-{
-}
-
static int __ref zero_pud_populate(p4d_t *p4d, unsigned long addr,
unsigned long end)
{
@@ -187,10 +179,6 @@ static int __ref zero_pud_populate(p4d_t *p4d, unsigned long addr,
return 0;
}
-void __weak __meminit pud_init(void *addr)
-{
-}
-
static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr,
unsigned long end)
{
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index f438a6cdc964..b7e4b81421b3 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -568,7 +568,7 @@ static inline void kasan_kunit_test_suite_end(void) { }
#endif /* CONFIG_KASAN_KUNIT_TEST */
-#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) || IS_ENABLED(CONFIG_KASAN_MODULE_TEST)
+#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
bool kasan_save_enable_multi_shot(void);
void kasan_restore_multi_shot(bool enabled);
diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c
index d8fb281e439d..e0ec5a6d15be 100644
--- a/mm/kasan/kasan_test_c.c
+++ b/mm/kasan/kasan_test_c.c
@@ -33,6 +33,8 @@
#define OOB_TAG_OFF (IS_ENABLED(CONFIG_KASAN_GENERIC) ? 0 : KASAN_GRANULE_SIZE)
+MODULE_IMPORT_NS(EXPORTED_FOR_KUNIT_TESTING);
+
static bool multishot;
/* Fields set based on lines observed in the console. */
@@ -213,6 +215,36 @@ static void kmalloc_node_oob_right(struct kunit *test)
kfree(ptr);
}
+static void kmalloc_track_caller_oob_right(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 128 - KASAN_GRANULE_SIZE;
+
+ /*
+ * Check that KASAN detects an out-of-bounds access for an object allocated via
+ * kmalloc_track_caller().
+ */
+ ptr = kmalloc_track_caller(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ OPTIMIZER_HIDE_VAR(ptr);
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr[size] = 'y');
+
+ kfree(ptr);
+
+ /*
+ * Check that KASAN detects an out-of-bounds access for an object allocated via
+ * kmalloc_node_track_caller().
+ */
+ ptr = kmalloc_node_track_caller(size, GFP_KERNEL, 0);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ OPTIMIZER_HIDE_VAR(ptr);
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr[size] = 'y');
+
+ kfree(ptr);
+}
+
/*
* Check that KASAN detects an out-of-bounds access for a big object allocated
* via kmalloc(). But not as big as to trigger the page_alloc fallback.
@@ -1928,10 +1960,92 @@ static void rust_uaf(struct kunit *test)
KUNIT_EXPECT_KASAN_FAIL(test, kasan_test_rust_uaf());
}
+static void copy_to_kernel_nofault_oob(struct kunit *test)
+{
+ char *ptr;
+ char buf[128];
+ size_t size = sizeof(buf);
+
+ /*
+ * This test currently fails with the HW_TAGS mode. The reason is
+ * unknown and needs to be investigated.
+ */
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_HW_TAGS);
+
+ ptr = kmalloc(size - KASAN_GRANULE_SIZE, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ OPTIMIZER_HIDE_VAR(ptr);
+
+ /*
+ * We test copy_to_kernel_nofault() to detect corrupted memory that is
+ * being written into the kernel. In contrast,
+ * copy_from_kernel_nofault() is primarily used in kernel helper
+ * functions where the source address might be random or uninitialized.
+ * Applying KASAN instrumentation to copy_from_kernel_nofault() could
+ * lead to false positives. By focusing KASAN checks only on
+ * copy_to_kernel_nofault(), we ensure that only valid memory is
+ * written to the kernel, minimizing the risk of kernel corruption
+ * while avoiding false positives in the reverse case.
+ */
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ copy_to_kernel_nofault(&buf[0], ptr, size));
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ copy_to_kernel_nofault(ptr, &buf[0], size));
+
+ kfree(ptr);
+}
+
+static void copy_user_test_oob(struct kunit *test)
+{
+ char *kmem;
+ char __user *usermem;
+ unsigned long useraddr;
+ size_t size = 128 - KASAN_GRANULE_SIZE;
+ int __maybe_unused unused;
+
+ kmem = kunit_kmalloc(test, size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, kmem);
+
+ useraddr = kunit_vm_mmap(test, NULL, 0, PAGE_SIZE,
+ PROT_READ | PROT_WRITE | PROT_EXEC,
+ MAP_ANONYMOUS | MAP_PRIVATE, 0);
+ KUNIT_ASSERT_NE_MSG(test, useraddr, 0,
+ "Could not create userspace mm");
+ KUNIT_ASSERT_LT_MSG(test, useraddr, (unsigned long)TASK_SIZE,
+ "Failed to allocate user memory");
+
+ OPTIMIZER_HIDE_VAR(size);
+ usermem = (char __user *)useraddr;
+
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ unused = copy_from_user(kmem, usermem, size + 1));
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ unused = copy_to_user(usermem, kmem, size + 1));
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ unused = __copy_from_user(kmem, usermem, size + 1));
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ unused = __copy_to_user(usermem, kmem, size + 1));
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ unused = __copy_from_user_inatomic(kmem, usermem, size + 1));
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ unused = __copy_to_user_inatomic(usermem, kmem, size + 1));
+
+ /*
+ * Prepare a long string in usermem to avoid the strncpy_from_user test
+ * bailing out on '\0' before it reaches out-of-bounds.
+ */
+ memset(kmem, 'a', size);
+ KUNIT_EXPECT_EQ(test, copy_to_user(usermem, kmem, size), 0);
+
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ unused = strncpy_from_user(kmem, usermem, size + 1));
+}
+
static struct kunit_case kasan_kunit_test_cases[] = {
KUNIT_CASE(kmalloc_oob_right),
KUNIT_CASE(kmalloc_oob_left),
KUNIT_CASE(kmalloc_node_oob_right),
+ KUNIT_CASE(kmalloc_track_caller_oob_right),
KUNIT_CASE(kmalloc_big_oob_right),
KUNIT_CASE(kmalloc_large_oob_right),
KUNIT_CASE(kmalloc_large_uaf),
@@ -1992,7 +2106,7 @@ static struct kunit_case kasan_kunit_test_cases[] = {
KUNIT_CASE(kasan_strings),
KUNIT_CASE(kasan_bitops_generic),
KUNIT_CASE(kasan_bitops_tags),
- KUNIT_CASE(kasan_atomics),
+ KUNIT_CASE_SLOW(kasan_atomics),
KUNIT_CASE(vmalloc_helpers_tags),
KUNIT_CASE(vmalloc_oob),
KUNIT_CASE(vmap_tags),
@@ -2000,7 +2114,9 @@ static struct kunit_case kasan_kunit_test_cases[] = {
KUNIT_CASE(match_all_not_assigned),
KUNIT_CASE(match_all_ptr_tag),
KUNIT_CASE(match_all_mem_tag),
+ KUNIT_CASE(copy_to_kernel_nofault_oob),
KUNIT_CASE(rust_uaf),
+ KUNIT_CASE(copy_user_test_oob),
{}
};
diff --git a/mm/kasan/kasan_test_module.c b/mm/kasan/kasan_test_module.c
deleted file mode 100644
index 27ec22767e42..000000000000
--- a/mm/kasan/kasan_test_module.c
+++ /dev/null
@@ -1,81 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- *
- * Copyright (c) 2014 Samsung Electronics Co., Ltd.
- * Author: Andrey Ryabinin <a.ryabinin@samsung.com>
- */
-
-#define pr_fmt(fmt) "kasan: test: " fmt
-
-#include <linux/mman.h>
-#include <linux/module.h>
-#include <linux/printk.h>
-#include <linux/slab.h>
-#include <linux/uaccess.h>
-
-#include "kasan.h"
-
-static noinline void __init copy_user_test(void)
-{
- char *kmem;
- char __user *usermem;
- size_t size = 128 - KASAN_GRANULE_SIZE;
- int __maybe_unused unused;
-
- kmem = kmalloc(size, GFP_KERNEL);
- if (!kmem)
- return;
-
- usermem = (char __user *)vm_mmap(NULL, 0, PAGE_SIZE,
- PROT_READ | PROT_WRITE | PROT_EXEC,
- MAP_ANONYMOUS | MAP_PRIVATE, 0);
- if (IS_ERR(usermem)) {
- pr_err("Failed to allocate user memory\n");
- kfree(kmem);
- return;
- }
-
- OPTIMIZER_HIDE_VAR(size);
-
- pr_info("out-of-bounds in copy_from_user()\n");
- unused = copy_from_user(kmem, usermem, size + 1);
-
- pr_info("out-of-bounds in copy_to_user()\n");
- unused = copy_to_user(usermem, kmem, size + 1);
-
- pr_info("out-of-bounds in __copy_from_user()\n");
- unused = __copy_from_user(kmem, usermem, size + 1);
-
- pr_info("out-of-bounds in __copy_to_user()\n");
- unused = __copy_to_user(usermem, kmem, size + 1);
-
- pr_info("out-of-bounds in __copy_from_user_inatomic()\n");
- unused = __copy_from_user_inatomic(kmem, usermem, size + 1);
-
- pr_info("out-of-bounds in __copy_to_user_inatomic()\n");
- unused = __copy_to_user_inatomic(usermem, kmem, size + 1);
-
- pr_info("out-of-bounds in strncpy_from_user()\n");
- unused = strncpy_from_user(kmem, usermem, size + 1);
-
- vm_munmap((unsigned long)usermem, PAGE_SIZE);
- kfree(kmem);
-}
-
-static int __init kasan_test_module_init(void)
-{
- /*
- * Temporarily enable multi-shot mode. Otherwise, KASAN would only
- * report the first detected bug and panic the kernel if panic_on_warn
- * is enabled.
- */
- bool multishot = kasan_save_enable_multi_shot();
-
- copy_user_test();
-
- kasan_restore_multi_shot(multishot);
- return -EAGAIN;
-}
-
-module_init(kasan_test_module_init);
-MODULE_LICENSE("GPL");
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index b48c768acc84..50fb19ad4388 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -10,6 +10,7 @@
*/
#include <kunit/test.h>
+#include <kunit/visibility.h>
#include <linux/bitops.h>
#include <linux/ftrace.h>
#include <linux/init.h>
@@ -132,20 +133,20 @@ static bool report_enabled(void)
return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags);
}
-#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) || IS_ENABLED(CONFIG_KASAN_MODULE_TEST)
+#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
-bool kasan_save_enable_multi_shot(void)
+VISIBLE_IF_KUNIT bool kasan_save_enable_multi_shot(void)
{
return test_and_set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
}
-EXPORT_SYMBOL_GPL(kasan_save_enable_multi_shot);
+EXPORT_SYMBOL_IF_KUNIT(kasan_save_enable_multi_shot);
-void kasan_restore_multi_shot(bool enabled)
+VISIBLE_IF_KUNIT void kasan_restore_multi_shot(bool enabled)
{
if (!enabled)
clear_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
}
-EXPORT_SYMBOL_GPL(kasan_restore_multi_shot);
+EXPORT_SYMBOL_IF_KUNIT(kasan_restore_multi_shot);
#endif
@@ -157,17 +158,17 @@ EXPORT_SYMBOL_GPL(kasan_restore_multi_shot);
*/
static bool kasan_kunit_executing;
-void kasan_kunit_test_suite_start(void)
+VISIBLE_IF_KUNIT void kasan_kunit_test_suite_start(void)
{
WRITE_ONCE(kasan_kunit_executing, true);
}
-EXPORT_SYMBOL_GPL(kasan_kunit_test_suite_start);
+EXPORT_SYMBOL_IF_KUNIT(kasan_kunit_test_suite_start);
-void kasan_kunit_test_suite_end(void)
+VISIBLE_IF_KUNIT void kasan_kunit_test_suite_end(void)
{
WRITE_ONCE(kasan_kunit_executing, false);
}
-EXPORT_SYMBOL_GPL(kasan_kunit_test_suite_end);
+EXPORT_SYMBOL_IF_KUNIT(kasan_kunit_test_suite_end);
static bool kasan_kunit_test_suite_executing(void)
{
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index d6210ca48dda..88d1c9dcb507 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -489,7 +489,8 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
*/
void kasan_release_vmalloc(unsigned long start, unsigned long end,
unsigned long free_region_start,
- unsigned long free_region_end)
+ unsigned long free_region_end,
+ unsigned long flags)
{
void *shadow_start, *shadow_end;
unsigned long region_start, region_end;
@@ -522,12 +523,17 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end,
__memset(shadow_start, KASAN_SHADOW_INIT, shadow_end - shadow_start);
return;
}
- apply_to_existing_page_range(&init_mm,
+
+
+ if (flags & KASAN_VMALLOC_PAGE_RANGE)
+ apply_to_existing_page_range(&init_mm,
(unsigned long)shadow_start,
size, kasan_depopulate_vmalloc_pte,
NULL);
- flush_tlb_kernel_range((unsigned long)shadow_start,
- (unsigned long)shadow_end);
+
+ if (flags & KASAN_VMALLOC_TLB_FLUSH)
+ flush_tlb_kernel_range((unsigned long)shadow_start,
+ (unsigned long)shadow_end);
}
}
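kasan_release_vmalloc() above gains a flags argument so a caller releasing many vmalloc regions can depopulate the shadow per region (KASAN_VMALLOC_PAGE_RANGE) and defer the TLB flush to one final call (KASAN_VMALLOC_TLB_FLUSH). A hedged sketch of how a batching caller might split the two steps; the function below is illustrative, not the actual vmalloc purge path:

#include <linux/kasan.h>

/*
 * Sketch only: release a batch of regions inside one larger free window
 * [free_start, free_end) and flush the TLB once at the end.
 */
static void example_release_batch(unsigned long starts[], unsigned long ends[],
				  int nr, unsigned long free_start,
				  unsigned long free_end)
{
	int i;

	/* Step 1: drop shadow page tables per region, defer the TLB flush. */
	for (i = 0; i < nr; i++)
		kasan_release_vmalloc(starts[i], ends[i], free_start, free_end,
				      KASAN_VMALLOC_PAGE_RANGE);

	/* Step 2: a single flush covering the whole freed window. */
	kasan_release_vmalloc(free_start, free_end, free_start, free_end,
			      KASAN_VMALLOC_TLB_FLUSH);
}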
diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c
index 00fd17285285..f65fb182466d 100644
--- a/mm/kfence/kfence_test.c
+++ b/mm/kfence/kfence_test.c
@@ -383,6 +383,22 @@ static void test_use_after_free_read(struct kunit *test)
KUNIT_EXPECT_TRUE(test, report_matches(&expect));
}
+static void test_use_after_free_read_nofault(struct kunit *test)
+{
+ const size_t size = 32;
+ char *addr;
+ char dst;
+ int ret;
+
+ setup_test_cache(test, size, 0, NULL);
+ addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
+ test_free(addr);
+ /* Use after free with *_nofault() */
+ ret = copy_from_kernel_nofault(&dst, addr, 1);
+ KUNIT_EXPECT_EQ(test, ret, -EFAULT);
+ KUNIT_EXPECT_FALSE(test, report_available());
+}
+
static void test_double_free(struct kunit *test)
{
const size_t size = 32;
@@ -780,6 +796,7 @@ static struct kunit_case kfence_test_cases[] = {
KFENCE_KUNIT_CASE(test_out_of_bounds_read),
KFENCE_KUNIT_CASE(test_out_of_bounds_write),
KFENCE_KUNIT_CASE(test_use_after_free_read),
+ KFENCE_KUNIT_CASE(test_use_after_free_read_nofault),
KFENCE_KUNIT_CASE(test_double_free),
KFENCE_KUNIT_CASE(test_invalid_addr_free),
KFENCE_KUNIT_CASE(test_corruption),
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index b538c3d48386..6f8d46d107b4 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -4,7 +4,6 @@
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
-#include <linux/sched/coredump.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
@@ -416,9 +415,11 @@ static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm)
static bool hugepage_pmd_enabled(void)
{
/*
- * We cover both the anon and the file-backed case here; file-backed
+ * We cover the anon, shmem and file-backed cases here; file-backed
* hugepages, when configured in, are determined by the global control.
* Anon pmd-sized hugepages are determined by the pmd-size control.
+ * Shmem pmd-sized hugepages are also determined by their pmd-size control,
+ * except when the global shmem_huge is set to SHMEM_HUGE_DENY.
*/
if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
hugepage_global_enabled())
@@ -430,6 +431,8 @@ static bool hugepage_pmd_enabled(void)
if (test_bit(PMD_ORDER, &huge_anon_orders_inherit) &&
hugepage_global_enabled())
return true;
+ if (IS_ENABLED(CONFIG_SHMEM) && shmem_hpage_pmd_enabled())
+ return true;
return false;
}
@@ -1011,7 +1014,11 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
};
if (!pte++) {
- pte = pte_offset_map_nolock(mm, pmd, address, &ptl);
+ /*
+ * Here the ptl is only used to check pte_same() in
+ * do_swap_page(), so the read-only version is enough.
+ */
+ pte = pte_offset_map_ro_nolock(mm, pmd, address, &ptl);
if (!pte) {
mmap_read_unlock(mm);
result = SCAN_PMD_NULL;
@@ -1601,7 +1608,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
pml = pmd_lock(mm, pmd);
- start_pte = pte_offset_map_nolock(mm, pmd, haddr, &ptl);
+ start_pte = pte_offset_map_rw_nolock(mm, pmd, haddr, &pgt_pmd, &ptl);
if (!start_pte) /* mmap_lock + page lock should prevent this */
goto abort;
if (!pml)
@@ -1609,6 +1616,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
else if (ptl != pml)
spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
+ if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd))))
+ goto abort;
+
/* step 2: clear page table and adjust rmap */
for (i = 0, addr = haddr, pte = start_pte;
i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
@@ -1641,7 +1651,6 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
nr_ptes++;
}
- pte_unmap(start_pte);
if (!pml)
spin_unlock(ptl);
@@ -1654,14 +1663,19 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
/* step 4: remove empty page table */
if (!pml) {
pml = pmd_lock(mm, pmd);
- if (ptl != pml)
+ if (ptl != pml) {
spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
+ if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) {
+ flush_tlb_mm(mm);
+ goto unlock;
+ }
+ }
}
pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd);
pmdp_get_lockless_sync();
+ pte_unmap_unlock(start_pte, ptl);
if (ptl != pml)
- spin_unlock(ptl);
- spin_unlock(pml);
+ spin_unlock(pml);
mmu_notifier_invalidate_range_end(&range);
@@ -1681,6 +1695,7 @@ abort:
folio_ref_sub(folio, nr_ptes);
add_mm_counter(mm, mm_counter_file(folio), -nr_ptes);
}
+unlock:
if (start_pte)
pte_unmap_unlock(start_pte, ptl);
if (pml && pml != ptl)
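The khugepaged hunks above switch to the pte_offset_map_ro_nolock()/pte_offset_map_rw_nolock() variants; the rw variant also returns the pmd value sampled while mapping the PTE table, and the caller must confirm with pmd_same() that the PMD has not changed before trusting the mapping under the PTE lock. A generic sketch of that recheck pattern (the wrapper is illustrative, not the collapse code itself):

#include <linux/mm.h>

/* Sketch: map a PTE table without the PMD lock, then validate before use. */
static bool example_map_and_check(struct mm_struct *mm, pmd_t *pmd,
				  unsigned long addr)
{
	spinlock_t *ptl;
	pmd_t pmdval;
	pte_t *pte;

	pte = pte_offset_map_rw_nolock(mm, pmd, addr, &pmdval, &ptl);
	if (!pte)
		return false;

	spin_lock(ptl);
	/* The table may have been collapsed or freed since pmdval was sampled. */
	if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
		pte_unmap_unlock(pte, ptl);
		return false;
	}

	/* ... safe to modify PTEs under ptl here ... */
	pte_unmap_unlock(pte, ptl);
	return true;
}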
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 0400f5e8ac60..2a945c07ae99 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -935,6 +935,28 @@ static void make_black_object(unsigned long ptr, unsigned int objflags)
}
/*
+ * Reset the checksum of an object. The immediate effect is that it will not
+ * be reported as a leak during the next scan until its checksum is updated.
+ */
+static void reset_checksum(unsigned long ptr)
+{
+ unsigned long flags;
+ struct kmemleak_object *object;
+
+ object = find_and_get_object(ptr, 0);
+ if (!object) {
+ kmemleak_warn("Not resetting the checksum of an unknown object at 0x%08lx\n",
+ ptr);
+ return;
+ }
+
+ raw_spin_lock_irqsave(&object->lock, flags);
+ object->checksum = 0;
+ raw_spin_unlock_irqrestore(&object->lock, flags);
+ put_object(object);
+}
+
+/*
* Add a scanning area to the object. If at least one such area is added,
* kmemleak will only scan these ranges rather than the whole memory block.
*/
@@ -1011,7 +1033,7 @@ static void object_set_excess_ref(unsigned long ptr, unsigned long excess_ref)
}
/*
- * Set the OBJECT_NO_SCAN flag for the object corresponding to the give
+ * Set the OBJECT_NO_SCAN flag for the object corresponding to the given
* pointer. Such object will not be scanned by kmemleak but references to it
* are searched.
*/
@@ -1203,6 +1225,23 @@ void __ref kmemleak_not_leak(const void *ptr)
EXPORT_SYMBOL(kmemleak_not_leak);
/**
+ * kmemleak_transient_leak - mark an allocated object as transient false positive
+ * @ptr: pointer to beginning of the object
+ *
+ * Calling this function on an object will cause the memory block to not be
+ * reported as a leak temporarily. This may happen, for example, if the object
+ * is part of a singly linked list and the ->next reference to it is changed.
+ */
+void __ref kmemleak_transient_leak(const void *ptr)
+{
+ pr_debug("%s(0x%px)\n", __func__, ptr);
+
+ if (kmemleak_enabled && ptr && !IS_ERR(ptr))
+ reset_checksum((unsigned long)ptr);
+}
+EXPORT_SYMBOL(kmemleak_transient_leak);
+
+/**
* kmemleak_ignore - ignore an allocated object
* @ptr: pointer to beginning of the object
*
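kmemleak_transient_leak() above offers a middle ground between doing nothing and kmemleak_not_leak(): it only clears the object's checksum, so the report is suppressed for the next scan and resumes once the object is reachable again. A hedged usage sketch (the data structure and the unscanned list head are assumptions):

#include <linux/kmemleak.h>
#include <linux/llist.h>

struct example_obj {
	struct llist_node llnode;
	/* ... payload ... */
};

/* Pretend this head lives somewhere kmemleak does not scan (an assumption). */
static struct llist_head example_hidden_head;

static void example_park_object(struct example_obj *obj)
{
	llist_add(&obj->llnode, &example_hidden_head);
	/*
	 * Suppress the report for the next scan only; once the object is
	 * referenced from scanned memory again, normal checksumming resumes.
	 */
	kmemleak_transient_leak(obj);
}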
diff --git a/mm/kmsan/kmsan_test.c b/mm/kmsan/kmsan_test.c
index 13236d579eba..9733a22c46c1 100644
--- a/mm/kmsan/kmsan_test.c
+++ b/mm/kmsan/kmsan_test.c
@@ -640,6 +640,22 @@ static void test_unpoison_memory(struct kunit *test)
KUNIT_EXPECT_TRUE(test, report_matches(&expect));
}
+static void test_copy_from_kernel_nofault(struct kunit *test)
+{
+ long ret;
+ char buf[4], src[4];
+ size_t size = sizeof(buf);
+
+ EXPECTATION_UNINIT_VALUE_FN(expect, "copy_from_kernel_nofault");
+ kunit_info(
+ test,
+ "testing copy_from_kernel_nofault with uninitialized memory\n");
+
+ ret = copy_from_kernel_nofault((char *)&buf[0], (char *)&src[0], size);
+ USE(ret);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
static struct kunit_case kmsan_test_cases[] = {
KUNIT_CASE(test_uninit_kmalloc),
KUNIT_CASE(test_init_kmalloc),
@@ -664,6 +680,7 @@ static struct kunit_case kmsan_test_cases[] = {
KUNIT_CASE(test_long_origin_chain),
KUNIT_CASE(test_stackdepot_roundtrip),
KUNIT_CASE(test_unpoison_memory),
+ KUNIT_CASE(test_copy_from_kernel_nofault),
{},
};
diff --git a/mm/ksm.c b/mm/ksm.c
index a2e2a521df0a..31a9bc365437 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -20,7 +20,6 @@
#include <linux/mman.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
-#include <linux/sched/coredump.h>
#include <linux/sched/cputime.h>
#include <linux/rwsem.h>
#include <linux/pagemap.h>
@@ -657,7 +656,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_v
*
* VM_FAULT_SIGBUS could occur if we race with truncation of the
* backing file, which also invalidates anonymous pages: that's
- * okay, that truncation will have unmapped the PageKsm for us.
+ * okay, that truncation will have unmapped the KSM page for us.
*
* VM_FAULT_OOM: at the time of writing (late July 2009), setting
* aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
@@ -1052,7 +1051,8 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma,
return err;
}
-static inline struct ksm_stable_node *folio_stable_node(struct folio *folio)
+static inline
+struct ksm_stable_node *folio_stable_node(const struct folio *folio)
{
return folio_test_ksm(folio) ? folio_raw_mapping(folio) : NULL;
}
@@ -1257,7 +1257,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct folio *folio,
if (WARN_ON_ONCE(folio_test_large(folio)))
return err;
- pvmw.address = page_address_in_vma(&folio->page, vma);
+ pvmw.address = page_address_in_vma(folio, folio_page(folio, 0), vma);
if (pvmw.address == -EFAULT)
goto out;
@@ -1341,7 +1341,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
{
struct folio *kfolio = page_folio(kpage);
struct mm_struct *mm = vma->vm_mm;
- struct folio *folio;
+ struct folio *folio = page_folio(page);
pmd_t *pmd;
pmd_t pmde;
pte_t *ptep;
@@ -1351,7 +1351,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
int err = -EFAULT;
struct mmu_notifier_range range;
- addr = page_address_in_vma(page, vma);
+ addr = page_address_in_vma(folio, page, vma);
if (addr == -EFAULT)
goto out;
@@ -1417,7 +1417,6 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
ptep_clear_flush(vma, addr, ptep);
set_pte_at(mm, addr, ptep, newpte);
- folio = page_folio(page);
folio_remove_rmap_pte(folio, page, vma);
if (!folio_mapped(folio))
folio_free_swap(folio);
@@ -1435,7 +1434,7 @@ out:
* try_to_merge_one_page - take two pages and merge them into one
* @vma: the vma that holds the pte pointing to page
* @page: the PageAnon page that we want to replace with kpage
- * @kpage: the PageKsm page that we want to map instead of page,
+ * @kpage: the KSM page that we want to map instead of page,
* or NULL the first time when we want to use page as kpage.
*
* This function returns 0 if the pages were merged, -EFAULT otherwise.
@@ -1443,28 +1442,29 @@ out:
static int try_to_merge_one_page(struct vm_area_struct *vma,
struct page *page, struct page *kpage)
{
+ struct folio *folio = page_folio(page);
pte_t orig_pte = __pte(0);
int err = -EFAULT;
if (page == kpage) /* ksm page forked */
return 0;
- if (!PageAnon(page))
+ if (!folio_test_anon(folio))
goto out;
/*
* We need the folio lock to read a stable swapcache flag in
- * write_protect_page(). We use trylock_page() instead of
- * lock_page() because we don't want to wait here - we
- * prefer to continue scanning and merging different pages,
- * then come back to this page when it is unlocked.
+ * write_protect_page(). We trylock because we don't want to wait
+ * here - we prefer to continue scanning and merging different
+ * pages, then come back to this page when it is unlocked.
*/
- if (!trylock_page(page))
+ if (!folio_trylock(folio))
goto out;
- if (PageTransCompound(page)) {
+ if (folio_test_large(folio)) {
if (split_huge_page(page))
goto out_unlock;
+ folio = page_folio(page);
}
/*
@@ -1473,28 +1473,28 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
* ptes are necessarily already write-protected. But in either
* case, we need to lock and check page_count is not raised.
*/
- if (write_protect_page(vma, page_folio(page), &orig_pte) == 0) {
+ if (write_protect_page(vma, folio, &orig_pte) == 0) {
if (!kpage) {
/*
- * While we hold page lock, upgrade page from
- * PageAnon+anon_vma to PageKsm+NULL stable_node:
+ * While we hold folio lock, upgrade folio from
+ * anon to a NULL stable_node with the KSM flag set:
* stable_tree_insert() will update stable_node.
*/
- folio_set_stable_node(page_folio(page), NULL);
- mark_page_accessed(page);
+ folio_set_stable_node(folio, NULL);
+ folio_mark_accessed(folio);
/*
- * Page reclaim just frees a clean page with no dirty
+ * Page reclaim just frees a clean folio with no dirty
* ptes: make sure that the ksm page would be swapped.
*/
- if (!PageDirty(page))
- SetPageDirty(page);
+ if (!folio_test_dirty(folio))
+ folio_mark_dirty(folio);
err = 0;
} else if (pages_identical(page, kpage))
err = replace_page(vma, page, kpage, orig_pte);
}
out_unlock:
- unlock_page(page);
+ folio_unlock(folio);
out:
return err;
}
@@ -1582,7 +1582,7 @@ out:
* Note that this function upgrades page to ksm page: if one of the pages
* is already a ksm page, try_to_merge_with_ksm_page should be used.
*/
-static struct page *try_to_merge_two_pages(struct ksm_rmap_item *rmap_item,
+static struct folio *try_to_merge_two_pages(struct ksm_rmap_item *rmap_item,
struct page *page,
struct ksm_rmap_item *tree_rmap_item,
struct page *tree_page)
@@ -1600,7 +1600,7 @@ static struct page *try_to_merge_two_pages(struct ksm_rmap_item *rmap_item,
if (err)
break_cow(rmap_item);
}
- return err ? NULL : page;
+ return err ? NULL : page_folio(page);
}
static __always_inline
@@ -1787,9 +1787,9 @@ static __always_inline struct folio *chain(struct ksm_stable_node **s_n_d,
* with identical content to the page that we are scanning right now.
*
* This function returns the stable tree node of identical content if found,
- * NULL otherwise.
+ * -EBUSY if the stable node's page is being migrated, NULL otherwise.
*/
-static struct page *stable_tree_search(struct page *page)
+static struct folio *stable_tree_search(struct page *page)
{
int nid;
struct rb_root *root;
@@ -1804,7 +1804,7 @@ static struct page *stable_tree_search(struct page *page)
if (page_node && page_node->head != &migrate_nodes) {
/* ksm page forked */
folio_get(folio);
- return &folio->page;
+ return folio;
}
nid = get_kpfn_nid(folio_pfn(folio));
@@ -1899,7 +1899,7 @@ again:
folio_put(tree_folio);
goto replace;
}
- return &tree_folio->page;
+ return tree_folio;
}
}
@@ -1913,7 +1913,7 @@ again:
out:
if (is_page_sharing_candidate(page_node)) {
folio_get(folio);
- return &folio->page;
+ return folio;
} else
return NULL;
@@ -1963,7 +1963,7 @@ replace:
}
stable_node_dup->head = &migrate_nodes;
list_add(&stable_node_dup->list, stable_node_dup->head);
- return &folio->page;
+ return folio;
chain_append:
/*
@@ -2217,7 +2217,7 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite
struct ksm_rmap_item *tree_rmap_item;
struct page *tree_page = NULL;
struct ksm_stable_node *stable_node;
- struct page *kpage;
+ struct folio *kfolio;
unsigned int checksum;
int err;
bool max_page_sharing_bypass = false;
@@ -2259,31 +2259,31 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite
return;
}
- /* We first start with searching the page inside the stable tree */
- kpage = stable_tree_search(page);
- if (kpage == page && rmap_item->head == stable_node) {
- put_page(kpage);
+ /* Start by searching for the folio in the stable tree */
+ kfolio = stable_tree_search(page);
+ if (&kfolio->page == page && rmap_item->head == stable_node) {
+ folio_put(kfolio);
return;
}
remove_rmap_item_from_tree(rmap_item);
- if (kpage) {
- if (PTR_ERR(kpage) == -EBUSY)
+ if (kfolio) {
+ if (kfolio == ERR_PTR(-EBUSY))
return;
- err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
+ err = try_to_merge_with_ksm_page(rmap_item, page, &kfolio->page);
if (!err) {
/*
* The page was successfully merged:
* add its rmap_item to the stable tree.
*/
- lock_page(kpage);
- stable_tree_append(rmap_item, page_stable_node(kpage),
+ folio_lock(kfolio);
+ stable_tree_append(rmap_item, folio_stable_node(kfolio),
max_page_sharing_bypass);
- unlock_page(kpage);
+ folio_unlock(kfolio);
}
- put_page(kpage);
+ folio_put(kfolio);
return;
}
@@ -2292,7 +2292,7 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite
if (tree_rmap_item) {
bool split;
- kpage = try_to_merge_two_pages(rmap_item, page,
+ kfolio = try_to_merge_two_pages(rmap_item, page,
tree_rmap_item, tree_page);
/*
* If both pages we tried to merge belong to the same compound
@@ -2307,20 +2307,20 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite
split = PageTransCompound(page)
&& compound_head(page) == compound_head(tree_page);
put_page(tree_page);
- if (kpage) {
+ if (kfolio) {
/*
* The pages were successfully merged: insert new
* node in the stable tree and add both rmap_items.
*/
- lock_page(kpage);
- stable_node = stable_tree_insert(page_folio(kpage));
+ folio_lock(kfolio);
+ stable_node = stable_tree_insert(kfolio);
if (stable_node) {
stable_tree_append(tree_rmap_item, stable_node,
false);
stable_tree_append(rmap_item, stable_node,
false);
}
- unlock_page(kpage);
+ folio_unlock(kfolio);
/*
* If we fail to insert the page into the stable tree,
@@ -2401,10 +2401,10 @@ static unsigned int skip_age(rmap_age_t age)
/*
* Determines if a page should be skipped for the current scan.
*
- * @page: page to check
+ * @folio: folio containing the page to check
* @rmap_item: associated rmap_item of page
*/
-static bool should_skip_rmap_item(struct page *page,
+static bool should_skip_rmap_item(struct folio *folio,
struct ksm_rmap_item *rmap_item)
{
rmap_age_t age;
@@ -2417,7 +2417,7 @@ static bool should_skip_rmap_item(struct page *page,
* will essentially ignore them, but we still have to process them
* properly.
*/
- if (PageKsm(page))
+ if (folio_test_ksm(folio))
return false;
age = rmap_item->age;
@@ -2560,7 +2560,7 @@ next_mm:
ksm_scan.rmap_list =
&rmap_item->rmap_list;
- if (should_skip_rmap_item(tmp_page, rmap_item)) {
+ if (should_skip_rmap_item(folio, rmap_item)) {
folio_put(folio);
goto next_page;
}
@@ -2970,7 +2970,7 @@ struct folio *ksm_might_need_to_copy(struct folio *folio,
if (!folio_test_uptodate(folio))
return folio; /* let do_swap_page report the error */
- new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr, false);
+ new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr);
if (new_folio &&
mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL)) {
folio_put(new_folio);
@@ -3067,7 +3067,7 @@ again:
/*
* Collect processes when the error hit an ksm page.
*/
-void collect_procs_ksm(struct folio *folio, struct page *page,
+void collect_procs_ksm(const struct folio *folio, const struct page *page,
struct list_head *to_kill, int force_early)
{
struct ksm_stable_node *stable_node;
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 9b7ff06e9d32..f93ada6a207b 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -59,6 +59,53 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx)
}
return &lru->node[nid].lru;
}
+
+static inline struct list_lru_one *
+lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
+ bool irq, bool skip_empty)
+{
+ struct list_lru_one *l;
+ long nr_items;
+
+ rcu_read_lock();
+again:
+ l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
+ if (likely(l)) {
+ if (irq)
+ spin_lock_irq(&l->lock);
+ else
+ spin_lock(&l->lock);
+ nr_items = READ_ONCE(l->nr_items);
+ if (likely(nr_items != LONG_MIN)) {
+ WARN_ON(nr_items < 0);
+ rcu_read_unlock();
+ return l;
+ }
+ if (irq)
+ spin_unlock_irq(&l->lock);
+ else
+ spin_unlock(&l->lock);
+ }
+ /*
+ * The caller may simply bail out if it raced with reparenting, or it
+ * may iterate through the list_lru and expect empty slots.
+ */
+ if (skip_empty) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ VM_WARN_ON(!css_is_dying(&memcg->css));
+ memcg = parent_mem_cgroup(memcg);
+ goto again;
+}
+
+static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off)
+{
+ if (irq_off)
+ spin_unlock_irq(&l->lock);
+ else
+ spin_unlock(&l->lock);
+}
#else
static void list_lru_register(struct list_lru *lru)
{
@@ -83,30 +130,52 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx)
{
return &lru->node[nid].lru;
}
+
+static inline struct list_lru_one *
+lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
+ bool irq, bool skip_empty)
+{
+ struct list_lru_one *l = &lru->node[nid].lru;
+
+ if (irq)
+ spin_lock_irq(&l->lock);
+ else
+ spin_lock(&l->lock);
+
+ return l;
+}
+
+static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off)
+{
+ if (irq_off)
+ spin_unlock_irq(&l->lock);
+ else
+ spin_unlock(&l->lock);
+}
#endif /* CONFIG_MEMCG */
/* The caller must ensure the memcg lifetime. */
bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid,
- struct mem_cgroup *memcg)
+ struct mem_cgroup *memcg)
{
struct list_lru_node *nlru = &lru->node[nid];
struct list_lru_one *l;
- spin_lock(&nlru->lock);
+ l = lock_list_lru_of_memcg(lru, nid, memcg, false, false);
+ if (!l)
+ return false;
if (list_empty(item)) {
- l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
list_add_tail(item, &l->list);
/* Set shrinker bit if the first element was added */
if (!l->nr_items++)
set_shrinker_bit(memcg, nid, lru_shrinker_id(lru));
- nlru->nr_items++;
- spin_unlock(&nlru->lock);
+ unlock_list_lru(l, false);
+ atomic_long_inc(&nlru->nr_items);
return true;
}
- spin_unlock(&nlru->lock);
+ unlock_list_lru(l, false);
return false;
}
-EXPORT_SYMBOL_GPL(list_lru_add);
bool list_lru_add_obj(struct list_lru *lru, struct list_head *item)
{
@@ -127,24 +196,23 @@ EXPORT_SYMBOL_GPL(list_lru_add_obj);
/* The caller must ensure the memcg lifetime. */
bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid,
- struct mem_cgroup *memcg)
+ struct mem_cgroup *memcg)
{
struct list_lru_node *nlru = &lru->node[nid];
struct list_lru_one *l;
-
- spin_lock(&nlru->lock);
+ l = lock_list_lru_of_memcg(lru, nid, memcg, false, false);
+ if (!l)
+ return false;
if (!list_empty(item)) {
- l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
list_del_init(item);
l->nr_items--;
- nlru->nr_items--;
- spin_unlock(&nlru->lock);
+ unlock_list_lru(l, false);
+ atomic_long_dec(&nlru->nr_items);
return true;
}
- spin_unlock(&nlru->lock);
+ unlock_list_lru(l, false);
return false;
}
-EXPORT_SYMBOL_GPL(list_lru_del);
bool list_lru_del_obj(struct list_lru *lru, struct list_head *item)
{
@@ -201,25 +269,24 @@ unsigned long list_lru_count_node(struct list_lru *lru, int nid)
struct list_lru_node *nlru;
nlru = &lru->node[nid];
- return nlru->nr_items;
+ return atomic_long_read(&nlru->nr_items);
}
EXPORT_SYMBOL_GPL(list_lru_count_node);
static unsigned long
-__list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx,
+__list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
list_lru_walk_cb isolate, void *cb_arg,
- unsigned long *nr_to_walk)
+ unsigned long *nr_to_walk, bool irq_off)
{
struct list_lru_node *nlru = &lru->node[nid];
- struct list_lru_one *l;
+ struct list_lru_one *l = NULL;
struct list_head *item, *n;
unsigned long isolated = 0;
restart:
- l = list_lru_from_memcg_idx(lru, nid, memcg_idx);
+ l = lock_list_lru_of_memcg(lru, nid, memcg, irq_off, true);
if (!l)
- goto out;
-
+ return isolated;
list_for_each_safe(item, n, &l->list) {
enum lru_status ret;
@@ -231,19 +298,19 @@ restart:
break;
--*nr_to_walk;
- ret = isolate(item, l, &nlru->lock, cb_arg);
+ ret = isolate(item, l, cb_arg);
switch (ret) {
+ /*
+ * LRU_RETRY, LRU_REMOVED_RETRY and LRU_STOP will drop the lru
+ * lock. List traversal will have to restart from scratch.
+ */
+ case LRU_RETRY:
+ goto restart;
case LRU_REMOVED_RETRY:
- assert_spin_locked(&nlru->lock);
fallthrough;
case LRU_REMOVED:
isolated++;
- nlru->nr_items--;
- /*
- * If the lru lock has been dropped, our list
- * traversal is now invalid and so we have to
- * restart from scratch.
- */
+ atomic_long_dec(&nlru->nr_items);
if (ret == LRU_REMOVED_RETRY)
goto restart;
break;
@@ -252,20 +319,13 @@ restart:
break;
case LRU_SKIP:
break;
- case LRU_RETRY:
- /*
- * The lru lock has been dropped, our list traversal is
- * now invalid and so we have to restart from scratch.
- */
- assert_spin_locked(&nlru->lock);
- goto restart;
case LRU_STOP:
- assert_spin_locked(&nlru->lock);
goto out;
default:
BUG();
}
}
+ unlock_list_lru(l, irq_off);
out:
return isolated;
}
@@ -275,14 +335,8 @@ list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
list_lru_walk_cb isolate, void *cb_arg,
unsigned long *nr_to_walk)
{
- struct list_lru_node *nlru = &lru->node[nid];
- unsigned long ret;
-
- spin_lock(&nlru->lock);
- ret = __list_lru_walk_one(lru, nid, memcg_kmem_id(memcg), isolate,
- cb_arg, nr_to_walk);
- spin_unlock(&nlru->lock);
- return ret;
+ return __list_lru_walk_one(lru, nid, memcg, isolate,
+ cb_arg, nr_to_walk, false);
}
EXPORT_SYMBOL_GPL(list_lru_walk_one);
@@ -291,14 +345,8 @@ list_lru_walk_one_irq(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
list_lru_walk_cb isolate, void *cb_arg,
unsigned long *nr_to_walk)
{
- struct list_lru_node *nlru = &lru->node[nid];
- unsigned long ret;
-
- spin_lock_irq(&nlru->lock);
- ret = __list_lru_walk_one(lru, nid, memcg_kmem_id(memcg), isolate,
- cb_arg, nr_to_walk);
- spin_unlock_irq(&nlru->lock);
- return ret;
+ return __list_lru_walk_one(lru, nid, memcg, isolate,
+ cb_arg, nr_to_walk, true);
}
unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
@@ -313,16 +361,21 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
#ifdef CONFIG_MEMCG
if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) {
struct list_lru_memcg *mlru;
+ struct mem_cgroup *memcg;
unsigned long index;
xa_for_each(&lru->xa, index, mlru) {
- struct list_lru_node *nlru = &lru->node[nid];
-
- spin_lock(&nlru->lock);
- isolated += __list_lru_walk_one(lru, nid, index,
+ rcu_read_lock();
+ memcg = mem_cgroup_from_id(index);
+ if (!mem_cgroup_tryget(memcg)) {
+ rcu_read_unlock();
+ continue;
+ }
+ rcu_read_unlock();
+ isolated += __list_lru_walk_one(lru, nid, memcg,
isolate, cb_arg,
- nr_to_walk);
- spin_unlock(&nlru->lock);
+ nr_to_walk, false);
+ mem_cgroup_put(memcg);
if (*nr_to_walk <= 0)
break;
@@ -334,14 +387,19 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
}
EXPORT_SYMBOL_GPL(list_lru_walk_node);
-static void init_one_lru(struct list_lru_one *l)
+static void init_one_lru(struct list_lru *lru, struct list_lru_one *l)
{
INIT_LIST_HEAD(&l->list);
+ spin_lock_init(&l->lock);
l->nr_items = 0;
+#ifdef CONFIG_LOCKDEP
+ if (lru->key)
+ lockdep_set_class(&l->lock, lru->key);
+#endif
}
#ifdef CONFIG_MEMCG
-static struct list_lru_memcg *memcg_init_list_lru_one(gfp_t gfp)
+static struct list_lru_memcg *memcg_init_list_lru_one(struct list_lru *lru, gfp_t gfp)
{
int nid;
struct list_lru_memcg *mlru;
@@ -351,25 +409,11 @@ static struct list_lru_memcg *memcg_init_list_lru_one(gfp_t gfp)
return NULL;
for_each_node(nid)
- init_one_lru(&mlru->node[nid]);
+ init_one_lru(lru, &mlru->node[nid]);
return mlru;
}
-static void memcg_list_lru_free(struct list_lru *lru, int src_idx)
-{
- struct list_lru_memcg *mlru = xa_erase_irq(&lru->xa, src_idx);
-
- /*
- * The __list_lru_walk_one() can walk the list of this node.
- * We need kvfree_rcu() here. And the walking of the list
- * is under lru->node[nid]->lock, which can serve as a RCU
- * read-side critical section.
- */
- if (mlru)
- kvfree_rcu(mlru, rcu);
-}
-
static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
{
if (memcg_aware)
@@ -393,77 +437,64 @@ static void memcg_destroy_list_lru(struct list_lru *lru)
xas_unlock_irq(&xas);
}
-static void memcg_reparent_list_lru_node(struct list_lru *lru, int nid,
- int src_idx, struct mem_cgroup *dst_memcg)
+static void memcg_reparent_list_lru_one(struct list_lru *lru, int nid,
+ struct list_lru_one *src,
+ struct mem_cgroup *dst_memcg)
{
- struct list_lru_node *nlru = &lru->node[nid];
int dst_idx = dst_memcg->kmemcg_id;
- struct list_lru_one *src, *dst;
-
- /*
- * Since list_lru_{add,del} may be called under an IRQ-safe lock,
- * we have to use IRQ-safe primitives here to avoid deadlock.
- */
- spin_lock_irq(&nlru->lock);
+ struct list_lru_one *dst;
- src = list_lru_from_memcg_idx(lru, nid, src_idx);
- if (!src)
- goto out;
+ spin_lock_irq(&src->lock);
dst = list_lru_from_memcg_idx(lru, nid, dst_idx);
+ spin_lock_nested(&dst->lock, SINGLE_DEPTH_NESTING);
list_splice_init(&src->list, &dst->list);
-
if (src->nr_items) {
dst->nr_items += src->nr_items;
set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru));
- src->nr_items = 0;
}
-out:
- spin_unlock_irq(&nlru->lock);
-}
-
-static void memcg_reparent_list_lru(struct list_lru *lru,
- int src_idx, struct mem_cgroup *dst_memcg)
-{
- int i;
+ /* Mark the list_lru_one dead */
+ src->nr_items = LONG_MIN;
- for_each_node(i)
- memcg_reparent_list_lru_node(lru, i, src_idx, dst_memcg);
-
- memcg_list_lru_free(lru, src_idx);
+ spin_unlock(&dst->lock);
+ spin_unlock_irq(&src->lock);
}
void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent)
{
- struct cgroup_subsys_state *css;
struct list_lru *lru;
- int src_idx = memcg->kmemcg_id;
+ int i;
- /*
- * Change kmemcg_id of this cgroup and all its descendants to the
- * parent's id, and then move all entries from this cgroup's list_lrus
- * to ones of the parent.
- *
- * After we have finished, all list_lrus corresponding to this cgroup
- * are guaranteed to remain empty. So we can safely free this cgroup's
- * list lrus in memcg_list_lru_free().
- *
- * Changing ->kmemcg_id to the parent can prevent memcg_list_lru_alloc()
- * from allocating list lrus for this cgroup after memcg_list_lru_free()
- * call.
- */
- rcu_read_lock();
- css_for_each_descendant_pre(css, &memcg->css) {
- struct mem_cgroup *child;
+ mutex_lock(&list_lrus_mutex);
+ list_for_each_entry(lru, &memcg_list_lrus, list) {
+ struct list_lru_memcg *mlru;
+ XA_STATE(xas, &lru->xa, memcg->kmemcg_id);
- child = mem_cgroup_from_css(css);
- WRITE_ONCE(child->kmemcg_id, parent->kmemcg_id);
- }
- rcu_read_unlock();
+ /*
+ * Lock the Xarray to ensure no ongoing list_lru_memcg allocation;
+ * any further allocation will see css_is_dying().
+ */
+ xas_lock_irq(&xas);
+ mlru = xas_store(&xas, NULL);
+ xas_unlock_irq(&xas);
+ if (!mlru)
+ continue;
- mutex_lock(&list_lrus_mutex);
- list_for_each_entry(lru, &memcg_list_lrus, list)
- memcg_reparent_list_lru(lru, src_idx, parent);
+ /*
+ * With the Xarray value set to NULL, holding the lru lock below
+ * prevents list_lru_{add,del,isolate} from touching the lru, so it
+ * is safe to reparent.
+ */
+ for_each_node(i)
+ memcg_reparent_list_lru_one(lru, i, &mlru->node[i], parent);
+
+ /*
+ * Here all list_lrus corresponding to the cgroup are guaranteed
+ * to remain empty, so we can safely free this lru; any further
+ * memcg_list_lru_alloc() call will simply bail out.
+ */
+ kvfree_rcu(mlru, rcu);
+ }
mutex_unlock(&list_lrus_mutex);
}
@@ -478,77 +509,48 @@ static inline bool memcg_list_lru_allocated(struct mem_cgroup *memcg,
int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
gfp_t gfp)
{
- int i;
unsigned long flags;
- struct list_lru_memcg_table {
- struct list_lru_memcg *mlru;
- struct mem_cgroup *memcg;
- } *table;
+ struct list_lru_memcg *mlru;
+ struct mem_cgroup *pos, *parent;
XA_STATE(xas, &lru->xa, 0);
if (!list_lru_memcg_aware(lru) || memcg_list_lru_allocated(memcg, lru))
return 0;
gfp &= GFP_RECLAIM_MASK;
- table = kmalloc_array(memcg->css.cgroup->level, sizeof(*table), gfp);
- if (!table)
- return -ENOMEM;
-
/*
* Because the list_lru can be reparented to the parent cgroup's
* list_lru, we should make sure that this cgroup and all its
* ancestors have allocated list_lru_memcg.
*/
- for (i = 0; memcg; memcg = parent_mem_cgroup(memcg), i++) {
- if (memcg_list_lru_allocated(memcg, lru))
- break;
-
- table[i].memcg = memcg;
- table[i].mlru = memcg_init_list_lru_one(gfp);
- if (!table[i].mlru) {
- while (i--)
- kfree(table[i].mlru);
- kfree(table);
- return -ENOMEM;
+ do {
+ /*
+ * Keep finding the farthest parent that wasn't populated,
+ * until we reach memcg itself.
+ */
+ pos = memcg;
+ parent = parent_mem_cgroup(pos);
+ while (!memcg_list_lru_allocated(parent, lru)) {
+ pos = parent;
+ parent = parent_mem_cgroup(pos);
}
- }
-
- xas_lock_irqsave(&xas, flags);
- while (i--) {
- int index = READ_ONCE(table[i].memcg->kmemcg_id);
- struct list_lru_memcg *mlru = table[i].mlru;
- xas_set(&xas, index);
-retry:
- if (unlikely(index < 0 || xas_error(&xas) || xas_load(&xas))) {
- kfree(mlru);
- } else {
- xas_store(&xas, mlru);
- if (xas_error(&xas) == -ENOMEM) {
- xas_unlock_irqrestore(&xas, flags);
- if (xas_nomem(&xas, gfp))
- xas_set_err(&xas, 0);
- xas_lock_irqsave(&xas, flags);
- /*
- * The xas lock has been released, this memcg
- * can be reparented before us. So reload
- * memcg id. More details see the comments
- * in memcg_reparent_list_lrus().
- */
- index = READ_ONCE(table[i].memcg->kmemcg_id);
- if (index < 0)
- xas_set_err(&xas, 0);
- else if (!xas_error(&xas) && index != xas.xa_index)
- xas_set(&xas, index);
- goto retry;
+ mlru = memcg_init_list_lru_one(lru, gfp);
+ if (!mlru)
+ return -ENOMEM;
+ xas_set(&xas, pos->kmemcg_id);
+ do {
+ xas_lock_irqsave(&xas, flags);
+ if (!xas_load(&xas) && !css_is_dying(&pos->css)) {
+ xas_store(&xas, mlru);
+ if (!xas_error(&xas))
+ mlru = NULL;
}
- }
- }
- /* xas_nomem() is used to free memory instead of memory allocation. */
- if (xas.xa_alloc)
- xas_nomem(&xas, gfp);
- xas_unlock_irqrestore(&xas, flags);
- kfree(table);
+ xas_unlock_irqrestore(&xas, flags);
+ } while (xas_nomem(&xas, gfp));
+ if (mlru)
+ kfree(mlru);
+ } while (pos != memcg && !css_is_dying(&pos->css));
return xas_error(&xas);
}
@@ -562,8 +564,7 @@ static void memcg_destroy_list_lru(struct list_lru *lru)
}
#endif /* CONFIG_MEMCG */
-int __list_lru_init(struct list_lru *lru, bool memcg_aware,
- struct lock_class_key *key, struct shrinker *shrinker)
+int __list_lru_init(struct list_lru *lru, bool memcg_aware, struct shrinker *shrinker)
{
int i;
@@ -581,12 +582,8 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware,
if (!lru->node)
return -ENOMEM;
- for_each_node(i) {
- spin_lock_init(&lru->node[i].lock);
- if (key)
- lockdep_set_class(&lru->node[i].lock, key);
- init_one_lru(&lru->node[i].lru);
- }
+ for_each_node(i)
+ init_one_lru(lru, &lru->node[i].lru);
memcg_init_list_lru(lru, memcg_aware);
list_lru_register(lru);
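With the per-memcg list_lru_one locks introduced above, walk callbacks no longer receive a pointer to a node-wide lock: the isolate callback is now invoked as isolate(item, l, cb_arg), and a callback that drops l->lock must return one of the *_RETRY/LRU_STOP codes so the traversal restarts or ends. A minimal sketch of a callback under the new signature (the entry type and expiry test are assumptions):

#include <linux/jiffies.h>
#include <linux/list_lru.h>

struct example_entry {
	struct list_head lru;
	unsigned long expires;
};

/* Sketch: l->lock is held on entry; returning LRU_REMOVED keeps it held. */
static enum lru_status example_isolate(struct list_head *item,
				       struct list_lru_one *l, void *cb_arg)
{
	struct list_head *dispose = cb_arg;
	struct example_entry *entry = container_of(item, struct example_entry, lru);

	if (time_before(jiffies, entry->expires))
		return LRU_SKIP;

	list_lru_isolate_move(l, item, dispose);
	return LRU_REMOVED;
}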
diff --git a/mm/maccess.c b/mm/maccess.c
index 518a25667323..8f0906180a94 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -13,9 +13,14 @@ bool __weak copy_from_kernel_nofault_allowed(const void *unsafe_src,
return true;
}
+/*
+ * The below only uses kmsan_check_memory() to ensure uninitialized kernel
+ * memory isn't leaked.
+ */
#define copy_from_kernel_nofault_loop(dst, src, len, type, err_label) \
while (len >= sizeof(type)) { \
- __get_kernel_nofault(dst, src, type, err_label); \
+ __get_kernel_nofault(dst, src, type, err_label); \
+ kmsan_check_memory(src, sizeof(type)); \
dst += sizeof(type); \
src += sizeof(type); \
len -= sizeof(type); \
@@ -49,7 +54,8 @@ EXPORT_SYMBOL_GPL(copy_from_kernel_nofault);
#define copy_to_kernel_nofault_loop(dst, src, len, type, err_label) \
while (len >= sizeof(type)) { \
- __put_kernel_nofault(dst, src, type, err_label); \
+ __put_kernel_nofault(dst, src, type, err_label); \
+ instrument_write(dst, sizeof(type)); \
dst += sizeof(type); \
src += sizeof(type); \
len -= sizeof(type); \
@@ -76,6 +82,7 @@ Efault:
pagefault_enable();
return -EFAULT;
}
+EXPORT_SYMBOL_GPL(copy_to_kernel_nofault);
long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count)
{
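Both nofault copy helpers above now carry sanitizer instrumentation, and copy_to_kernel_nofault() gains an export. Their contract is to return -EFAULT instead of faulting when the kernel address is bad, which is what lets the KFENCE and KMSAN tests above probe freed or uninitialized memory safely. A small sketch of that calling convention (the wrapper is illustrative):

#include <linux/uaccess.h>

/* Sketch: probe a possibly-unmapped kernel address without risking an oops. */
static bool example_probe_word(const void *addr, unsigned long *out)
{
	unsigned long val;

	if (copy_from_kernel_nofault(&val, addr, sizeof(val)))
		return false;	/* -EFAULT: unmapped or disallowed address */

	*out = val;
	return true;
}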
diff --git a/mm/madvise.c b/mm/madvise.c
index ff139e57cca2..0ceae57da7da 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -37,6 +37,12 @@
#include "internal.h"
#include "swap.h"
+/*
+ * Maximum number of attempts we make to install guard pages before we give up
+ * and return -ERESTARTNOINTR to have userspace try again.
+ */
+#define MAX_MADVISE_GUARD_RETRIES 3
+
struct madvise_walk_private {
struct mmu_gather *tlb;
bool pageout;
@@ -60,6 +66,8 @@ static int madvise_need_mmap_write(int behavior)
case MADV_POPULATE_READ:
case MADV_POPULATE_WRITE:
case MADV_COLLAPSE:
+ case MADV_GUARD_INSTALL:
+ case MADV_GUARD_REMOVE:
return 0;
default:
/* be safe, default to 1. list exceptions explicitly */
@@ -1017,6 +1025,214 @@ static long madvise_remove(struct vm_area_struct *vma,
return error;
}
+static bool is_valid_guard_vma(struct vm_area_struct *vma, bool allow_locked)
+{
+ vm_flags_t disallowed = VM_SPECIAL | VM_HUGETLB;
+
+ /*
+ * A user could mlock() after setting a guard range but that's fine, as
+ * they'd not be able to fault anything in. The issue arises when we try to zap
+ * existing locked VMAs. We don't want to do that.
+ */
+ if (!allow_locked)
+ disallowed |= VM_LOCKED;
+
+ if (!vma_is_anonymous(vma))
+ return false;
+
+ if ((vma->vm_flags & (VM_MAYWRITE | disallowed)) != VM_MAYWRITE)
+ return false;
+
+ return true;
+}
+
+static bool is_guard_pte_marker(pte_t ptent)
+{
+ return is_pte_marker(ptent) &&
+ is_guard_swp_entry(pte_to_swp_entry(ptent));
+}
+
+static int guard_install_pud_entry(pud_t *pud, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pud_t pudval = pudp_get(pud);
+
+ /* If huge return >0 so we abort the operation + zap. */
+ return pud_trans_huge(pudval) || pud_devmap(pudval);
+}
+
+static int guard_install_pmd_entry(pmd_t *pmd, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pmd_t pmdval = pmdp_get(pmd);
+
+ /* If huge return >0 so we abort the operation + zap. */
+ return pmd_trans_huge(pmdval) || pmd_devmap(pmdval);
+}
+
+static int guard_install_pte_entry(pte_t *pte, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pte_t pteval = ptep_get(pte);
+ unsigned long *nr_pages = (unsigned long *)walk->private;
+
+ /* If there is already a guard page marker, we have nothing to do. */
+ if (is_guard_pte_marker(pteval)) {
+ (*nr_pages)++;
+
+ return 0;
+ }
+
+ /* If populated return >0 so we abort the operation + zap. */
+ return 1;
+}
+
+static int guard_install_set_pte(unsigned long addr, unsigned long next,
+ pte_t *ptep, struct mm_walk *walk)
+{
+ unsigned long *nr_pages = (unsigned long *)walk->private;
+
+ /* Simply install a PTE marker; this causes a segfault on access. */
+ *ptep = make_pte_marker(PTE_MARKER_GUARD);
+ (*nr_pages)++;
+
+ return 0;
+}
+
+static const struct mm_walk_ops guard_install_walk_ops = {
+ .pud_entry = guard_install_pud_entry,
+ .pmd_entry = guard_install_pmd_entry,
+ .pte_entry = guard_install_pte_entry,
+ .install_pte = guard_install_set_pte,
+ .walk_lock = PGWALK_RDLOCK,
+};
+
+static long madvise_guard_install(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end)
+{
+ long err;
+ int i;
+
+ *prev = vma;
+ if (!is_valid_guard_vma(vma, /* allow_locked = */false))
+ return -EINVAL;
+
+ /*
+ * If we install guard markers, then the range is no longer
+ * empty from a page table perspective and therefore it's
+ * appropriate to have an anon_vma.
+ *
+ * This ensures that on fork, we copy page tables correctly.
+ */
+ err = anon_vma_prepare(vma);
+ if (err)
+ return err;
+
+ /*
+ * Optimistically try to install the guard marker pages first. If any
+ * non-guard pages are encountered, give up and zap the range before
+ * trying again.
+ *
+ * We try a few times before giving up and releasing back to userland to
+ * loop around, releasing locks in the process to avoid contention. This
+ * would only happen if there were a great many racing page faults.
+ *
+ * In most cases we should simply install the guard markers immediately
+ * with no zap or looping.
+ */
+ for (i = 0; i < MAX_MADVISE_GUARD_RETRIES; i++) {
+ unsigned long nr_pages = 0;
+
+ /* Returns < 0 on error, == 0 if success, > 0 if zap needed. */
+ err = walk_page_range_mm(vma->vm_mm, start, end,
+ &guard_install_walk_ops, &nr_pages);
+ if (err < 0)
+ return err;
+
+ if (err == 0) {
+ unsigned long nr_expected_pages = PHYS_PFN(end - start);
+
+ VM_WARN_ON(nr_pages != nr_expected_pages);
+ return 0;
+ }
+
+ /*
+ * OK, some of the range has non-guard pages mapped, so zap
+ * them. This leaves existing guard pages in place.
+ */
+ zap_page_range_single(vma, start, end - start, NULL);
+ }
+
+ /*
+ * We were unable to install the guard pages due to being raced by page
+ * faults. This should not happen ordinarily. We return to userspace and
+ * immediately retry, relieving lock contention.
+ */
+ return restart_syscall();
+}
+
+static int guard_remove_pud_entry(pud_t *pud, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pud_t pudval = pudp_get(pud);
+
+ /* If huge, cannot have guard pages present, so no-op - skip. */
+ if (pud_trans_huge(pudval) || pud_devmap(pudval))
+ walk->action = ACTION_CONTINUE;
+
+ return 0;
+}
+
+static int guard_remove_pmd_entry(pmd_t *pmd, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pmd_t pmdval = pmdp_get(pmd);
+
+ /* If huge, cannot have guard pages present, so no-op - skip. */
+ if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval))
+ walk->action = ACTION_CONTINUE;
+
+ return 0;
+}
+
+static int guard_remove_pte_entry(pte_t *pte, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pte_t ptent = ptep_get(pte);
+
+ if (is_guard_pte_marker(ptent)) {
+ /* Simply clear the PTE marker. */
+ pte_clear_not_present_full(walk->mm, addr, pte, false);
+ update_mmu_cache(walk->vma, addr, pte);
+ }
+
+ return 0;
+}
+
+static const struct mm_walk_ops guard_remove_walk_ops = {
+ .pud_entry = guard_remove_pud_entry,
+ .pmd_entry = guard_remove_pmd_entry,
+ .pte_entry = guard_remove_pte_entry,
+ .walk_lock = PGWALK_RDLOCK,
+};
+
+static long madvise_guard_remove(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end)
+{
+ *prev = vma;
+ /*
+ * We're ok with removing guards in mlock()'d ranges, as this is a
+ * non-destructive action.
+ */
+ if (!is_valid_guard_vma(vma, /* allow_locked = */true))
+ return -EINVAL;
+
+ return walk_page_range(vma->vm_mm, start, end,
+ &guard_remove_walk_ops, NULL);
+}
+
/*
* Apply an madvise behavior to a region of a vma. madvise_update_vma
* will handle splitting a vm area into separate areas, each area with its own
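For context, the new advice values added above are exercised from userspace via plain madvise(2). Below is a minimal sketch, not part of this patch; the MADV_GUARD_INSTALL/MADV_GUARD_REMOVE fallback values are an assumption here and are expected to come from the uapi header additions elsewhere in this series.

/* Illustrative only - not part of this patch. */
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_GUARD_INSTALL
#define MADV_GUARD_INSTALL 102  /* assumed value from the uapi additions */
#endif
#ifndef MADV_GUARD_REMOVE
#define MADV_GUARD_REMOVE  103  /* assumed value from the uapi additions */
#endif

int main(void)
{
    long page = sysconf(_SC_PAGESIZE);
    /* Three anonymous pages; the middle one will become a guard region. */
    char *buf = mmap(NULL, 3 * page, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (buf == MAP_FAILED)
        return 1;

    /* Install a guard marker: no page is populated, any access faults. */
    if (madvise(buf + page, page, MADV_GUARD_INSTALL))
        perror("MADV_GUARD_INSTALL");

    buf[0] = 'x';          /* fine */
    buf[2 * page] = 'y';   /* fine */
    /* buf[page] = 'z';    would deliver SIGSEGV here */

    /* Remove the marker; the page can then be faulted in normally. */
    if (madvise(buf + page, page, MADV_GUARD_REMOVE))
        perror("MADV_GUARD_REMOVE");
    buf[page] = 'z';

    munmap(buf, 3 * page);
    return 0;
}

Unlike PROT_NONE guard mappings, this consumes no extra VMA, which is the main motivation for routing it through madvise().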
@@ -1098,6 +1314,10 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
break;
case MADV_COLLAPSE:
return madvise_collapse(vma, prev, start, end);
+ case MADV_GUARD_INSTALL:
+ return madvise_guard_install(vma, prev, start, end);
+ case MADV_GUARD_REMOVE:
+ return madvise_guard_remove(vma, prev, start, end);
}
anon_name = anon_vma_name(vma);
@@ -1197,6 +1417,8 @@ madvise_behavior_valid(int behavior)
case MADV_DODUMP:
case MADV_WIPEONFORK:
case MADV_KEEPONFORK:
+ case MADV_GUARD_INSTALL:
+ case MADV_GUARD_REMOVE:
#ifdef CONFIG_MEMORY_FAILURE
case MADV_SOFT_OFFLINE:
case MADV_HWPOISON:
@@ -1208,7 +1430,8 @@ madvise_behavior_valid(int behavior)
}
}
-static bool process_madvise_behavior_valid(int behavior)
+/* Can we invoke process_madvise() on a remote mm for the specified behavior? */
+static bool process_madvise_remote_valid(int behavior)
{
switch (behavior) {
case MADV_COLD:
@@ -1477,6 +1700,45 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
return do_madvise(current->mm, start, len_in, behavior);
}
+/* Perform an madvise operation over a vector of addresses and lengths. */
+static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
+ int behavior)
+{
+ ssize_t ret = 0;
+ size_t total_len;
+
+ total_len = iov_iter_count(iter);
+
+ while (iov_iter_count(iter)) {
+ ret = do_madvise(mm, (unsigned long)iter_iov_addr(iter),
+ iter_iov_len(iter), behavior);
+ /*
+ * An madvise operation is attempting to restart the syscall,
+ * but we cannot proceed as it would not be correct to repeat
+ * the operation in aggregate, and would be surprising to the
+ * user.
+ *
+ * As we have already dropped locks, it is safe to just loop and
+ * try again. We check for fatal signals in case we need to exit
+ * early anyway.
+ */
+ if (ret == -ERESTARTNOINTR) {
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+ continue;
+ }
+ if (ret < 0)
+ break;
+ iov_iter_advance(iter, iter_iov_len(iter));
+ }
+
+ ret = (total_len - iov_iter_count(iter)) ? : ret;
+
+ return ret;
+}
+
SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
size_t, vlen, int, behavior, unsigned int, flags)
{
@@ -1486,7 +1748,6 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
struct iov_iter iter;
struct task_struct *task;
struct mm_struct *mm;
- size_t total_len;
unsigned int f_flags;
if (flags != 0) {
@@ -1504,38 +1765,33 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
goto free_iov;
}
- if (!process_madvise_behavior_valid(behavior)) {
- ret = -EINVAL;
- goto release_task;
- }
-
/* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
- if (IS_ERR_OR_NULL(mm)) {
- ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
+ if (IS_ERR(mm)) {
+ ret = PTR_ERR(mm);
goto release_task;
}
/*
+ * We need only perform this check if we are attempting to manipulate a
+ * remote process's address space.
+ */
+ if (mm != current->mm && !process_madvise_remote_valid(behavior)) {
+ ret = -EINVAL;
+ goto release_mm;
+ }
+
+ /*
* Require CAP_SYS_NICE for influencing process performance. Note that
- * only non-destructive hints are currently supported.
+ * only non-destructive hints are currently supported for remote
+ * processes.
*/
if (mm != current->mm && !capable(CAP_SYS_NICE)) {
ret = -EPERM;
goto release_mm;
}
- total_len = iov_iter_count(&iter);
-
- while (iov_iter_count(&iter)) {
- ret = do_madvise(mm, (unsigned long)iter_iov_addr(&iter),
- iter_iov_len(&iter), behavior);
- if (ret < 0)
- break;
- iov_iter_advance(&iter, iter_iov_len(&iter));
- }
-
- ret = (total_len - iov_iter_count(&iter)) ? : ret;
+ ret = vector_madvise(mm, &iter, behavior);
release_mm:
mmput(mm);
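With the remote-only behavior check above, a process_madvise() call on the caller's own mm can now apply any madvise()-valid behavior to a whole vector of ranges at once. A minimal sketch follows; it assumes a libc without a process_madvise() wrapper, so the SYS_process_madvise/SYS_pidfd_open syscall macros and the MADV_GUARD_INSTALL value are taken from the installed headers or defined as assumptions.

/* Illustrative only - not part of this patch. */
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/uio.h>
#include <unistd.h>

#ifndef MADV_GUARD_INSTALL
#define MADV_GUARD_INSTALL 102  /* assumed value */
#endif

int main(void)
{
    long page = sysconf(_SC_PAGESIZE);
    char *a = mmap(NULL, 4 * page, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (a == MAP_FAILED)
        return 1;

    /* Guard the first and last page of the mapping in one call. */
    struct iovec vec[2] = {
        { .iov_base = a,            .iov_len = page },
        { .iov_base = a + 3 * page, .iov_len = page },
    };

    /* A pidfd referring to ourselves: the remote behavior check is
     * skipped because mm == current->mm, so guard install is allowed. */
    int pidfd = syscall(SYS_pidfd_open, getpid(), 0);
    if (pidfd < 0)
        return 1;

    ssize_t done = syscall(SYS_process_madvise, pidfd, vec, 2,
                           MADV_GUARD_INSTALL, 0);
    if (done < 0)
        perror("process_madvise");
    else
        printf("advised %zd bytes\n", done);

    close(pidfd);
    return 0;
}

Any -ERESTARTNOINTR from an individual range is absorbed by vector_madvise(), so userspace only ever sees the total bytes advised or a terminal error.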
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index 86527d8fa7b9..a071fa43d479 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -40,31 +40,6 @@ static struct mem_cgroup_tree soft_limit_tree __read_mostly;
#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
-/* Stuffs for move charges at task migration. */
-/*
- * Types of charges to be moved.
- */
-#define MOVE_ANON 0x1ULL
-#define MOVE_FILE 0x2ULL
-#define MOVE_MASK (MOVE_ANON | MOVE_FILE)
-
-/* "mc" and its members are protected by cgroup_mutex */
-static struct move_charge_struct {
- spinlock_t lock; /* for from, to */
- struct mm_struct *mm;
- struct mem_cgroup *from;
- struct mem_cgroup *to;
- unsigned long flags;
- unsigned long precharge;
- unsigned long moved_charge;
- unsigned long moved_swap;
- struct task_struct *moving_task; /* a task moving charges */
- wait_queue_head_t waitq; /* a waitq for other context */
-} mc = {
- .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
- .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
-};
-
/* for OOM */
struct mem_cgroup_eventfd_list {
struct list_head list;
@@ -426,196 +401,22 @@ unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order,
return nr_reclaimed;
}
-/*
- * A routine for checking "mem" is under move_account() or not.
- *
- * Checking a cgroup is mc.from or mc.to or under hierarchy of
- * moving cgroups. This is for waiting at high-memory pressure
- * caused by "move".
- */
-static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
-{
- struct mem_cgroup *from;
- struct mem_cgroup *to;
- bool ret = false;
- /*
- * Unlike task_move routines, we access mc.to, mc.from not under
- * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
- */
- spin_lock(&mc.lock);
- from = mc.from;
- to = mc.to;
- if (!from)
- goto unlock;
-
- ret = mem_cgroup_is_descendant(from, memcg) ||
- mem_cgroup_is_descendant(to, memcg);
-unlock:
- spin_unlock(&mc.lock);
- return ret;
-}
-
-bool memcg1_wait_acct_move(struct mem_cgroup *memcg)
-{
- if (mc.moving_task && current != mc.moving_task) {
- if (mem_cgroup_under_move(memcg)) {
- DEFINE_WAIT(wait);
- prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
- /* moving charge context might have finished. */
- if (mc.moving_task)
- schedule();
- finish_wait(&mc.waitq, &wait);
- return true;
- }
- }
- return false;
-}
-
-/**
- * folio_memcg_lock - Bind a folio to its memcg.
- * @folio: The folio.
- *
- * This function prevents unlocked LRU folios from being moved to
- * another cgroup.
- *
- * It ensures lifetime of the bound memcg. The caller is responsible
- * for the lifetime of the folio.
- */
-void folio_memcg_lock(struct folio *folio)
-{
- struct mem_cgroup *memcg;
- unsigned long flags;
-
- /*
- * The RCU lock is held throughout the transaction. The fast
- * path can get away without acquiring the memcg->move_lock
- * because page moving starts with an RCU grace period.
- */
- rcu_read_lock();
-
- if (mem_cgroup_disabled())
- return;
-again:
- memcg = folio_memcg(folio);
- if (unlikely(!memcg))
- return;
-
-#ifdef CONFIG_PROVE_LOCKING
- local_irq_save(flags);
- might_lock(&memcg->move_lock);
- local_irq_restore(flags);
-#endif
-
- if (atomic_read(&memcg->moving_account) <= 0)
- return;
-
- spin_lock_irqsave(&memcg->move_lock, flags);
- if (memcg != folio_memcg(folio)) {
- spin_unlock_irqrestore(&memcg->move_lock, flags);
- goto again;
- }
-
- /*
- * When charge migration first begins, we can have multiple
- * critical sections holding the fast-path RCU lock and one
- * holding the slowpath move_lock. Track the task who has the
- * move_lock for folio_memcg_unlock().
- */
- memcg->move_lock_task = current;
- memcg->move_lock_flags = flags;
-}
-
-static void __folio_memcg_unlock(struct mem_cgroup *memcg)
-{
- if (memcg && memcg->move_lock_task == current) {
- unsigned long flags = memcg->move_lock_flags;
-
- memcg->move_lock_task = NULL;
- memcg->move_lock_flags = 0;
-
- spin_unlock_irqrestore(&memcg->move_lock, flags);
- }
-
- rcu_read_unlock();
-}
-
-/**
- * folio_memcg_unlock - Release the binding between a folio and its memcg.
- * @folio: The folio.
- *
- * This releases the binding created by folio_memcg_lock(). This does
- * not change the accounting of this folio to its memcg, but it does
- * permit others to change it.
- */
-void folio_memcg_unlock(struct folio *folio)
-{
- __folio_memcg_unlock(folio_memcg(folio));
-}
-
-#ifdef CONFIG_SWAP
-/**
- * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
- * @entry: swap entry to be moved
- * @from: mem_cgroup which the entry is moved from
- * @to: mem_cgroup which the entry is moved to
- *
- * It succeeds only when the swap_cgroup's record for this entry is the same
- * as the mem_cgroup's id of @from.
- *
- * Returns 0 on success, -EINVAL on failure.
- *
- * The caller must have charged to @to, IOW, called page_counter_charge() about
- * both res and memsw, and called css_get().
- */
-static int mem_cgroup_move_swap_account(swp_entry_t entry,
- struct mem_cgroup *from, struct mem_cgroup *to)
-{
- unsigned short old_id, new_id;
-
- old_id = mem_cgroup_id(from);
- new_id = mem_cgroup_id(to);
-
- if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
- mod_memcg_state(from, MEMCG_SWAP, -1);
- mod_memcg_state(to, MEMCG_SWAP, 1);
- return 0;
- }
- return -EINVAL;
-}
-#else
-static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
- struct mem_cgroup *from, struct mem_cgroup *to)
-{
- return -EINVAL;
-}
-#endif
-
static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
- return mem_cgroup_from_css(css)->move_charge_at_immigrate;
+ return 0;
}
#ifdef CONFIG_MMU
static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
struct cftype *cft, u64 val)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
-
pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. "
"Please report your usecase to linux-mm@kvack.org if you "
"depend on this functionality.\n");
- if (val & ~MOVE_MASK)
+ if (val != 0)
return -EINVAL;
-
- /*
- * No kind of locking is needed in here, because ->can_attach() will
- * check this value once in the beginning of the process, and then carry
- * on with stale data. This means that changes to this value will only
- * affect task migrations starting after the change.
- */
- memcg->move_charge_at_immigrate = val;
return 0;
}
#else
@@ -626,785 +427,6 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
}
#endif
-#ifdef CONFIG_MMU
-/* Handlers for move charge at task migration. */
-static int mem_cgroup_do_precharge(unsigned long count)
-{
- int ret;
-
- /* Try a single bulk charge without reclaim first, kswapd may wake */
- ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
- if (!ret) {
- mc.precharge += count;
- return ret;
- }
-
- /* Try charges one by one with reclaim, but do not retry */
- while (count--) {
- ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
- if (ret)
- return ret;
- mc.precharge++;
- cond_resched();
- }
- return 0;
-}
-
-union mc_target {
- struct folio *folio;
- swp_entry_t ent;
-};
-
-enum mc_target_type {
- MC_TARGET_NONE = 0,
- MC_TARGET_PAGE,
- MC_TARGET_SWAP,
- MC_TARGET_DEVICE,
-};
-
-static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
- unsigned long addr, pte_t ptent)
-{
- struct page *page = vm_normal_page(vma, addr, ptent);
-
- if (!page)
- return NULL;
- if (PageAnon(page)) {
- if (!(mc.flags & MOVE_ANON))
- return NULL;
- } else {
- if (!(mc.flags & MOVE_FILE))
- return NULL;
- }
- get_page(page);
-
- return page;
-}
-
-#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
-static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
- pte_t ptent, swp_entry_t *entry)
-{
- struct page *page = NULL;
- swp_entry_t ent = pte_to_swp_entry(ptent);
-
- if (!(mc.flags & MOVE_ANON))
- return NULL;
-
- /*
- * Handle device private pages that are not accessible by the CPU, but
- * stored as special swap entries in the page table.
- */
- if (is_device_private_entry(ent)) {
- page = pfn_swap_entry_to_page(ent);
- if (!get_page_unless_zero(page))
- return NULL;
- return page;
- }
-
- if (non_swap_entry(ent))
- return NULL;
-
- /*
- * Because swap_cache_get_folio() updates some statistics counter,
- * we call find_get_page() with swapper_space directly.
- */
- page = find_get_page(swap_address_space(ent), swap_cache_index(ent));
- entry->val = ent.val;
-
- return page;
-}
-#else
-static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
- pte_t ptent, swp_entry_t *entry)
-{
- return NULL;
-}
-#endif
-
-static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
- unsigned long addr, pte_t ptent)
-{
- unsigned long index;
- struct folio *folio;
-
- if (!vma->vm_file) /* anonymous vma */
- return NULL;
- if (!(mc.flags & MOVE_FILE))
- return NULL;
-
- /* folio is moved even if it's not RSS of this task(page-faulted). */
- /* shmem/tmpfs may report page out on swap: account for that too. */
- index = linear_page_index(vma, addr);
- folio = filemap_get_incore_folio(vma->vm_file->f_mapping, index);
- if (IS_ERR(folio))
- return NULL;
- return folio_file_page(folio, index);
-}
-
-static void memcg1_check_events(struct mem_cgroup *memcg, int nid);
-static void memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages);
-
-/**
- * mem_cgroup_move_account - move account of the folio
- * @folio: The folio.
- * @compound: charge the page as compound or small page
- * @from: mem_cgroup which the folio is moved from.
- * @to: mem_cgroup which the folio is moved to. @from != @to.
- *
- * The folio must be locked and not on the LRU.
- *
- * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
- * from old cgroup.
- */
-static int mem_cgroup_move_account(struct folio *folio,
- bool compound,
- struct mem_cgroup *from,
- struct mem_cgroup *to)
-{
- struct lruvec *from_vec, *to_vec;
- struct pglist_data *pgdat;
- unsigned int nr_pages = compound ? folio_nr_pages(folio) : 1;
- int nid, ret;
-
- VM_BUG_ON(from == to);
- VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
- VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
- VM_BUG_ON(compound && !folio_test_large(folio));
-
- ret = -EINVAL;
- if (folio_memcg(folio) != from)
- goto out;
-
- pgdat = folio_pgdat(folio);
- from_vec = mem_cgroup_lruvec(from, pgdat);
- to_vec = mem_cgroup_lruvec(to, pgdat);
-
- folio_memcg_lock(folio);
-
- if (folio_test_anon(folio)) {
- if (folio_mapped(folio)) {
- __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
- __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
- if (folio_test_pmd_mappable(folio)) {
- __mod_lruvec_state(from_vec, NR_ANON_THPS,
- -nr_pages);
- __mod_lruvec_state(to_vec, NR_ANON_THPS,
- nr_pages);
- }
- }
- } else {
- __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
- __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);
-
- if (folio_test_swapbacked(folio)) {
- __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
- __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
- }
-
- if (folio_mapped(folio)) {
- __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
- __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
- }
-
- if (folio_test_dirty(folio)) {
- struct address_space *mapping = folio_mapping(folio);
-
- if (mapping_can_writeback(mapping)) {
- __mod_lruvec_state(from_vec, NR_FILE_DIRTY,
- -nr_pages);
- __mod_lruvec_state(to_vec, NR_FILE_DIRTY,
- nr_pages);
- }
- }
- }
-
-#ifdef CONFIG_SWAP
- if (folio_test_swapcache(folio)) {
- __mod_lruvec_state(from_vec, NR_SWAPCACHE, -nr_pages);
- __mod_lruvec_state(to_vec, NR_SWAPCACHE, nr_pages);
- }
-#endif
- if (folio_test_writeback(folio)) {
- __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
- __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
- }
-
- /*
- * All state has been migrated, let's switch to the new memcg.
- *
- * It is safe to change page's memcg here because the page
- * is referenced, charged, isolated, and locked: we can't race
- * with (un)charging, migration, LRU putback, or anything else
- * that would rely on a stable page's memory cgroup.
- *
- * Note that folio_memcg_lock is a memcg lock, not a page lock,
- * to save space. As soon as we switch page's memory cgroup to a
- * new memcg that isn't locked, the above state can change
- * concurrently again. Make sure we're truly done with it.
- */
- smp_mb();
-
- css_get(&to->css);
- css_put(&from->css);
-
- /* Warning should never happen, so don't worry about refcount non-0 */
- WARN_ON_ONCE(folio_unqueue_deferred_split(folio));
- folio->memcg_data = (unsigned long)to;
-
- __folio_memcg_unlock(from);
-
- ret = 0;
- nid = folio_nid(folio);
-
- local_irq_disable();
- memcg1_charge_statistics(to, nr_pages);
- memcg1_check_events(to, nid);
- memcg1_charge_statistics(from, -nr_pages);
- memcg1_check_events(from, nid);
- local_irq_enable();
-out:
- return ret;
-}
-
-/**
- * get_mctgt_type - get target type of moving charge
- * @vma: the vma the pte to be checked belongs
- * @addr: the address corresponding to the pte to be checked
- * @ptent: the pte to be checked
- * @target: the pointer the target page or swap ent will be stored(can be NULL)
- *
- * Context: Called with pte lock held.
- * Return:
- * * MC_TARGET_NONE - If the pte is not a target for move charge.
- * * MC_TARGET_PAGE - If the page corresponding to this pte is a target for
- * move charge. If @target is not NULL, the folio is stored in target->folio
- * with extra refcnt taken (Caller should release it).
- * * MC_TARGET_SWAP - If the swap entry corresponding to this pte is a
- * target for charge migration. If @target is not NULL, the entry is
- * stored in target->ent.
- * * MC_TARGET_DEVICE - Like MC_TARGET_PAGE but page is device memory and
- * thus not on the lru. For now such page is charged like a regular page
- * would be as it is just special memory taking the place of a regular page.
- * See Documentations/vm/hmm.txt and include/linux/hmm.h
- */
-static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
- unsigned long addr, pte_t ptent, union mc_target *target)
-{
- struct page *page = NULL;
- struct folio *folio;
- enum mc_target_type ret = MC_TARGET_NONE;
- swp_entry_t ent = { .val = 0 };
-
- if (pte_present(ptent))
- page = mc_handle_present_pte(vma, addr, ptent);
- else if (pte_none_mostly(ptent))
- /*
- * PTE markers should be treated as a none pte here, separated
- * from other swap handling below.
- */
- page = mc_handle_file_pte(vma, addr, ptent);
- else if (is_swap_pte(ptent))
- page = mc_handle_swap_pte(vma, ptent, &ent);
-
- if (page)
- folio = page_folio(page);
- if (target && page) {
- if (!folio_trylock(folio)) {
- folio_put(folio);
- return ret;
- }
- /*
- * page_mapped() must be stable during the move. This
- * pte is locked, so if it's present, the page cannot
- * become unmapped. If it isn't, we have only partial
- * control over the mapped state: the page lock will
- * prevent new faults against pagecache and swapcache,
- * so an unmapped page cannot become mapped. However,
- * if the page is already mapped elsewhere, it can
- * unmap, and there is nothing we can do about it.
- * Alas, skip moving the page in this case.
- */
- if (!pte_present(ptent) && page_mapped(page)) {
- folio_unlock(folio);
- folio_put(folio);
- return ret;
- }
- }
-
- if (!page && !ent.val)
- return ret;
- if (page) {
- /*
- * Do only loose check w/o serialization.
- * mem_cgroup_move_account() checks the page is valid or
- * not under LRU exclusion.
- */
- if (folio_memcg(folio) == mc.from) {
- ret = MC_TARGET_PAGE;
- if (folio_is_device_private(folio) ||
- folio_is_device_coherent(folio))
- ret = MC_TARGET_DEVICE;
- if (target)
- target->folio = folio;
- }
- if (!ret || !target) {
- if (target)
- folio_unlock(folio);
- folio_put(folio);
- }
- }
- /*
- * There is a swap entry and a page doesn't exist or isn't charged.
- * But we cannot move a tail-page in a THP.
- */
- if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
- mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
- ret = MC_TARGET_SWAP;
- if (target)
- target->ent = ent;
- }
- return ret;
-}
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-/*
- * We don't consider PMD mapped swapping or file mapped pages because THP does
- * not support them for now.
- * Caller should make sure that pmd_trans_huge(pmd) is true.
- */
-static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
- unsigned long addr, pmd_t pmd, union mc_target *target)
-{
- struct page *page = NULL;
- struct folio *folio;
- enum mc_target_type ret = MC_TARGET_NONE;
-
- if (unlikely(is_swap_pmd(pmd))) {
- VM_BUG_ON(thp_migration_supported() &&
- !is_pmd_migration_entry(pmd));
- return ret;
- }
- page = pmd_page(pmd);
- VM_BUG_ON_PAGE(!page || !PageHead(page), page);
- folio = page_folio(page);
- if (!(mc.flags & MOVE_ANON))
- return ret;
- if (folio_memcg(folio) == mc.from) {
- ret = MC_TARGET_PAGE;
- if (target) {
- folio_get(folio);
- if (!folio_trylock(folio)) {
- folio_put(folio);
- return MC_TARGET_NONE;
- }
- target->folio = folio;
- }
- }
- return ret;
-}
-#else
-static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
- unsigned long addr, pmd_t pmd, union mc_target *target)
-{
- return MC_TARGET_NONE;
-}
-#endif
-
-static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
- unsigned long addr, unsigned long end,
- struct mm_walk *walk)
-{
- struct vm_area_struct *vma = walk->vma;
- pte_t *pte;
- spinlock_t *ptl;
-
- ptl = pmd_trans_huge_lock(pmd, vma);
- if (ptl) {
- /*
- * Note their can not be MC_TARGET_DEVICE for now as we do not
- * support transparent huge page with MEMORY_DEVICE_PRIVATE but
- * this might change.
- */
- if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
- mc.precharge += HPAGE_PMD_NR;
- spin_unlock(ptl);
- return 0;
- }
-
- pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
- if (!pte)
- return 0;
- for (; addr != end; pte++, addr += PAGE_SIZE)
- if (get_mctgt_type(vma, addr, ptep_get(pte), NULL))
- mc.precharge++; /* increment precharge temporarily */
- pte_unmap_unlock(pte - 1, ptl);
- cond_resched();
-
- return 0;
-}
-
-static const struct mm_walk_ops precharge_walk_ops = {
- .pmd_entry = mem_cgroup_count_precharge_pte_range,
- .walk_lock = PGWALK_RDLOCK,
-};
-
-static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
-{
- unsigned long precharge;
-
- mmap_read_lock(mm);
- walk_page_range(mm, 0, ULONG_MAX, &precharge_walk_ops, NULL);
- mmap_read_unlock(mm);
-
- precharge = mc.precharge;
- mc.precharge = 0;
-
- return precharge;
-}
-
-static int mem_cgroup_precharge_mc(struct mm_struct *mm)
-{
- unsigned long precharge = mem_cgroup_count_precharge(mm);
-
- VM_BUG_ON(mc.moving_task);
- mc.moving_task = current;
- return mem_cgroup_do_precharge(precharge);
-}
-
-/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
-static void __mem_cgroup_clear_mc(void)
-{
- struct mem_cgroup *from = mc.from;
- struct mem_cgroup *to = mc.to;
-
- /* we must uncharge all the leftover precharges from mc.to */
- if (mc.precharge) {
- mem_cgroup_cancel_charge(mc.to, mc.precharge);
- mc.precharge = 0;
- }
- /*
- * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
- * we must uncharge here.
- */
- if (mc.moved_charge) {
- mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
- mc.moved_charge = 0;
- }
- /* we must fixup refcnts and charges */
- if (mc.moved_swap) {
- /* uncharge swap account from the old cgroup */
- if (!mem_cgroup_is_root(mc.from))
- page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
-
- mem_cgroup_id_put_many(mc.from, mc.moved_swap);
-
- /*
- * we charged both to->memory and to->memsw, so we
- * should uncharge to->memory.
- */
- if (!mem_cgroup_is_root(mc.to))
- page_counter_uncharge(&mc.to->memory, mc.moved_swap);
-
- mc.moved_swap = 0;
- }
- memcg1_oom_recover(from);
- memcg1_oom_recover(to);
- wake_up_all(&mc.waitq);
-}
-
-static void mem_cgroup_clear_mc(void)
-{
- struct mm_struct *mm = mc.mm;
-
- /*
- * we must clear moving_task before waking up waiters at the end of
- * task migration.
- */
- mc.moving_task = NULL;
- __mem_cgroup_clear_mc();
- spin_lock(&mc.lock);
- mc.from = NULL;
- mc.to = NULL;
- mc.mm = NULL;
- spin_unlock(&mc.lock);
-
- mmput(mm);
-}
-
-int memcg1_can_attach(struct cgroup_taskset *tset)
-{
- struct cgroup_subsys_state *css;
- struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */
- struct mem_cgroup *from;
- struct task_struct *leader, *p;
- struct mm_struct *mm;
- unsigned long move_flags;
- int ret = 0;
-
- /* charge immigration isn't supported on the default hierarchy */
- if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
- return 0;
-
- /*
- * Multi-process migrations only happen on the default hierarchy
- * where charge immigration is not used. Perform charge
- * immigration if @tset contains a leader and whine if there are
- * multiple.
- */
- p = NULL;
- cgroup_taskset_for_each_leader(leader, css, tset) {
- WARN_ON_ONCE(p);
- p = leader;
- memcg = mem_cgroup_from_css(css);
- }
- if (!p)
- return 0;
-
- /*
- * We are now committed to this value whatever it is. Changes in this
- * tunable will only affect upcoming migrations, not the current one.
- * So we need to save it, and keep it going.
- */
- move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
- if (!move_flags)
- return 0;
-
- from = mem_cgroup_from_task(p);
-
- VM_BUG_ON(from == memcg);
-
- mm = get_task_mm(p);
- if (!mm)
- return 0;
- /* We move charges only when we move a owner of the mm */
- if (mm->owner == p) {
- VM_BUG_ON(mc.from);
- VM_BUG_ON(mc.to);
- VM_BUG_ON(mc.precharge);
- VM_BUG_ON(mc.moved_charge);
- VM_BUG_ON(mc.moved_swap);
-
- spin_lock(&mc.lock);
- mc.mm = mm;
- mc.from = from;
- mc.to = memcg;
- mc.flags = move_flags;
- spin_unlock(&mc.lock);
- /* We set mc.moving_task later */
-
- ret = mem_cgroup_precharge_mc(mm);
- if (ret)
- mem_cgroup_clear_mc();
- } else {
- mmput(mm);
- }
- return ret;
-}
-
-void memcg1_cancel_attach(struct cgroup_taskset *tset)
-{
- if (mc.to)
- mem_cgroup_clear_mc();
-}
-
-static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
- unsigned long addr, unsigned long end,
- struct mm_walk *walk)
-{
- int ret = 0;
- struct vm_area_struct *vma = walk->vma;
- pte_t *pte;
- spinlock_t *ptl;
- enum mc_target_type target_type;
- union mc_target target;
- struct folio *folio;
- bool tried_split_before = false;
-
-retry_pmd:
- ptl = pmd_trans_huge_lock(pmd, vma);
- if (ptl) {
- if (mc.precharge < HPAGE_PMD_NR) {
- spin_unlock(ptl);
- return 0;
- }
- target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
- if (target_type == MC_TARGET_PAGE) {
- folio = target.folio;
- /*
- * Deferred split queue locking depends on memcg,
- * and unqueue is unsafe unless folio refcount is 0:
- * split or skip if on the queue? first try to split.
- */
- if (!list_empty(&folio->_deferred_list)) {
- spin_unlock(ptl);
- if (!tried_split_before)
- split_folio(folio);
- folio_unlock(folio);
- folio_put(folio);
- if (tried_split_before)
- return 0;
- tried_split_before = true;
- goto retry_pmd;
- }
- /*
- * So long as that pmd lock is held, the folio cannot
- * be racily added to the _deferred_list, because
- * __folio_remove_rmap() will find !partially_mapped.
- */
- if (folio_isolate_lru(folio)) {
- if (!mem_cgroup_move_account(folio, true,
- mc.from, mc.to)) {
- mc.precharge -= HPAGE_PMD_NR;
- mc.moved_charge += HPAGE_PMD_NR;
- }
- folio_putback_lru(folio);
- }
- folio_unlock(folio);
- folio_put(folio);
- } else if (target_type == MC_TARGET_DEVICE) {
- folio = target.folio;
- if (!mem_cgroup_move_account(folio, true,
- mc.from, mc.to)) {
- mc.precharge -= HPAGE_PMD_NR;
- mc.moved_charge += HPAGE_PMD_NR;
- }
- folio_unlock(folio);
- folio_put(folio);
- }
- spin_unlock(ptl);
- return 0;
- }
-
-retry:
- pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
- if (!pte)
- return 0;
- for (; addr != end; addr += PAGE_SIZE) {
- pte_t ptent = ptep_get(pte++);
- bool device = false;
- swp_entry_t ent;
-
- if (!mc.precharge)
- break;
-
- switch (get_mctgt_type(vma, addr, ptent, &target)) {
- case MC_TARGET_DEVICE:
- device = true;
- fallthrough;
- case MC_TARGET_PAGE:
- folio = target.folio;
- /*
- * We can have a part of the split pmd here. Moving it
- * can be done but it would be too convoluted so simply
- * ignore such a partial THP and keep it in original
- * memcg. There should be somebody mapping the head.
- */
- if (folio_test_large(folio))
- goto put;
- if (!device && !folio_isolate_lru(folio))
- goto put;
- if (!mem_cgroup_move_account(folio, false,
- mc.from, mc.to)) {
- mc.precharge--;
- /* we uncharge from mc.from later. */
- mc.moved_charge++;
- }
- if (!device)
- folio_putback_lru(folio);
-put: /* get_mctgt_type() gets & locks the page */
- folio_unlock(folio);
- folio_put(folio);
- break;
- case MC_TARGET_SWAP:
- ent = target.ent;
- if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
- mc.precharge--;
- mem_cgroup_id_get_many(mc.to, 1);
- /* we fixup other refcnts and charges later. */
- mc.moved_swap++;
- }
- break;
- default:
- break;
- }
- }
- pte_unmap_unlock(pte - 1, ptl);
- cond_resched();
-
- if (addr != end) {
- /*
- * We have consumed all precharges we got in can_attach().
- * We try charge one by one, but don't do any additional
- * charges to mc.to if we have failed in charge once in attach()
- * phase.
- */
- ret = mem_cgroup_do_precharge(1);
- if (!ret)
- goto retry;
- }
-
- return ret;
-}
-
-static const struct mm_walk_ops charge_walk_ops = {
- .pmd_entry = mem_cgroup_move_charge_pte_range,
- .walk_lock = PGWALK_RDLOCK,
-};
-
-static void mem_cgroup_move_charge(void)
-{
- lru_add_drain_all();
- /*
- * Signal folio_memcg_lock() to take the memcg's move_lock
- * while we're moving its pages to another memcg. Then wait
- * for already started RCU-only updates to finish.
- */
- atomic_inc(&mc.from->moving_account);
- synchronize_rcu();
-retry:
- if (unlikely(!mmap_read_trylock(mc.mm))) {
- /*
- * Someone who are holding the mmap_lock might be waiting in
- * waitq. So we cancel all extra charges, wake up all waiters,
- * and retry. Because we cancel precharges, we might not be able
- * to move enough charges, but moving charge is a best-effort
- * feature anyway, so it wouldn't be a big problem.
- */
- __mem_cgroup_clear_mc();
- cond_resched();
- goto retry;
- }
- /*
- * When we have consumed all precharges and failed in doing
- * additional charge, the page walk just aborts.
- */
- walk_page_range(mc.mm, 0, ULONG_MAX, &charge_walk_ops, NULL);
- mmap_read_unlock(mc.mm);
- atomic_dec(&mc.from->moving_account);
-}
-
-void memcg1_move_task(void)
-{
- if (mc.to) {
- mem_cgroup_move_charge();
- mem_cgroup_clear_mc();
- }
-}
-
-#else /* !CONFIG_MMU */
-int memcg1_can_attach(struct cgroup_taskset *tset)
-{
- return 0;
-}
-void memcg1_cancel_attach(struct cgroup_taskset *tset)
-{
-}
-void memcg1_move_task(void)
-{
-}
-#endif
-
static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
{
struct mem_cgroup_threshold_ary *t;
@@ -2072,7 +1094,6 @@ void memcg1_memcg_init(struct mem_cgroup *memcg)
{
INIT_LIST_HEAD(&memcg->oom_notify);
mutex_init(&memcg->thresholds_lock);
- spin_lock_init(&memcg->move_lock);
INIT_LIST_HEAD(&memcg->event_list);
spin_lock_init(&memcg->event_list_lock);
}
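With the charge-moving machinery removed, the v1 move_charge_at_immigrate knob above degrades to a stub that only accepts zero. A minimal sketch of what a legacy user now observes; the cgroup mount path is hypothetical.

/* Illustrative only - not part of this patch. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    /* hypothetical v1 hierarchy path */
    const char *knob =
        "/sys/fs/cgroup/memory/test/memory.move_charge_at_immigrate";
    int fd = open(knob, O_WRONLY);

    if (fd < 0) {
        perror("open");
        return 1;
    }
    /* Writing "0" still succeeds (with a deprecation warning in dmesg);
     * any non-zero value now returns -EINVAL. */
    if (write(fd, "3", 1) < 0)
        perror("write 3 (EINVAL expected)");
    if (write(fd, "0", 1) < 0)
        perror("write 0");
    close(fd);
    return 0;
}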
diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h
index c0672e25bcdb..0e3b82951d91 100644
--- a/mm/memcontrol-v1.h
+++ b/mm/memcontrol-v1.h
@@ -80,12 +80,7 @@ static inline void memcg1_soft_limit_reset(struct mem_cgroup *memcg)
WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX);
}
-bool memcg1_wait_acct_move(struct mem_cgroup *memcg);
-
struct cgroup_taskset;
-int memcg1_can_attach(struct cgroup_taskset *tset);
-void memcg1_cancel_attach(struct cgroup_taskset *tset);
-void memcg1_move_task(void);
void memcg1_css_offline(struct mem_cgroup *memcg);
/* for encoding cft->private value on file */
@@ -130,7 +125,6 @@ static inline void memcg1_free_events(struct mem_cgroup *memcg) {}
static inline void memcg1_memcg_init(struct mem_cgroup *memcg) {}
static inline void memcg1_remove_from_trees(struct mem_cgroup *memcg) {}
static inline void memcg1_soft_limit_reset(struct mem_cgroup *memcg) {}
-static inline bool memcg1_wait_acct_move(struct mem_cgroup *memcg) { return false; }
static inline void memcg1_css_offline(struct mem_cgroup *memcg) {}
static inline bool memcg1_oom_prepare(struct mem_cgroup *memcg, bool *locked) { return true; }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 53db98d2c4a1..7b3503d12aaf 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -71,6 +71,10 @@
#include <linux/uaccess.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/memcg.h>
+#undef CREATE_TRACE_POINTS
+
#include <trace/events/vmscan.h>
struct cgroup_subsys memory_cgrp_subsys __read_mostly;
@@ -114,6 +118,7 @@ struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
return container_of(vmpr, struct mem_cgroup, vmpressure);
}
+#define SEQ_BUF_SIZE SZ_4K
#define CURRENT_OBJCG_UPDATE_BIT 0
#define CURRENT_OBJCG_UPDATE_FLAG (1UL << CURRENT_OBJCG_UPDATE_BIT)
@@ -310,6 +315,9 @@ static const unsigned int memcg_node_stat_items[] = {
PGDEMOTE_KSWAPD,
PGDEMOTE_DIRECT,
PGDEMOTE_KHUGEPAGED,
+#ifdef CONFIG_HUGETLB_PAGE
+ NR_HUGETLB,
+#endif
};
static const unsigned int memcg_stat_items[] = {
@@ -418,6 +426,8 @@ static const unsigned int memcg_vm_event_stat[] = {
PGPGIN,
PGPGOUT,
#endif
+ PSWPIN,
+ PSWPOUT,
PGSCAN_KSWAPD,
PGSCAN_DIRECT,
PGSCAN_KHUGEPAGED,
@@ -588,8 +598,16 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
}
}
-static void do_flush_stats(struct mem_cgroup *memcg)
+static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force)
{
+ bool needs_flush = memcg_vmstats_needs_flush(memcg->vmstats);
+
+ trace_memcg_flush_stats(memcg, atomic64_read(&memcg->vmstats->stats_updates),
+ force, needs_flush);
+
+ if (!force && !needs_flush)
+ return;
+
if (mem_cgroup_is_root(memcg))
WRITE_ONCE(flush_last_time, jiffies_64);
@@ -613,8 +631,7 @@ void mem_cgroup_flush_stats(struct mem_cgroup *memcg)
if (!memcg)
memcg = root_mem_cgroup;
- if (memcg_vmstats_needs_flush(memcg->vmstats))
- do_flush_stats(memcg);
+ __mem_cgroup_flush_stats(memcg, false);
}
void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg)
@@ -630,7 +647,7 @@ static void flush_memcg_stats_dwork(struct work_struct *w)
* Deliberately ignore memcg_vmstats_needs_flush() here so that flushing
* in latency-sensitive paths is as cheap as possible.
*/
- do_flush_stats(root_mem_cgroup);
+ __mem_cgroup_flush_stats(root_mem_cgroup, true);
queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
}
@@ -684,7 +701,9 @@ void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
return;
__this_cpu_add(memcg->vmstats_percpu->state[i], val);
- memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
+ val = memcg_state_val_in_pages(idx, val);
+ memcg_rstat_updated(memcg, val);
+ trace_mod_memcg_state(memcg, idx, val);
}
/* idx can be of type enum memcg_stat_item or node_stat_item. */
@@ -743,7 +762,9 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
/* Update lruvec */
__this_cpu_add(pn->lruvec_stats_percpu->state[i], val);
- memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
+ val = memcg_state_val_in_pages(idx, val);
+ memcg_rstat_updated(memcg, val);
+ trace_mod_memcg_lruvec_state(memcg, idx, val);
memcg_stats_unlock();
}
@@ -834,6 +855,7 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
memcg_stats_lock();
__this_cpu_add(memcg->vmstats_percpu->events[i], count);
memcg_rstat_updated(memcg, count);
+ trace_count_memcg_events(memcg, idx, count);
memcg_stats_unlock();
}
@@ -1181,7 +1203,6 @@ void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
* These functions are safe to use under any of the following conditions:
* - folio locked
* - folio_test_lru false
- * - folio_memcg_lock()
* - folio frozen (refcount of 0)
*
* Return: The lruvec this folio is on with its lock held.
@@ -1203,7 +1224,6 @@ struct lruvec *folio_lruvec_lock(struct folio *folio)
* These functions are safe to use under any of the following conditions:
* - folio locked
* - folio_test_lru false
- * - folio_memcg_lock()
* - folio frozen (refcount of 0)
*
* Return: The lruvec this folio is on with its lock held and interrupts
@@ -1227,7 +1247,6 @@ struct lruvec *folio_lruvec_lock_irq(struct folio *folio)
* These functions are safe to use under any of the following conditions:
* - folio locked
* - folio_test_lru false
- * - folio_memcg_lock()
* - folio frozen (refcount of 0)
*
* Return: The lruvec this folio is on with its lock held and interrupts
@@ -1350,6 +1369,9 @@ static const struct memory_stat memory_stats[] = {
{ "unevictable", NR_UNEVICTABLE },
{ "slab_reclaimable", NR_SLAB_RECLAIMABLE_B },
{ "slab_unreclaimable", NR_SLAB_UNRECLAIMABLE_B },
+#ifdef CONFIG_HUGETLB_PAGE
+ { "hugetlb", NR_HUGETLB },
+#endif
/* The memory events */
{ "workingset_refault_anon", WORKINGSET_REFAULT_ANON },
@@ -1445,6 +1467,11 @@ static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
u64 size;
+#ifdef CONFIG_HUGETLB_PAGE
+ if (unlikely(memory_stats[i].idx == NR_HUGETLB) &&
+ !(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING))
+ continue;
+#endif
size = memcg_page_state_output(memcg, memory_stats[i].idx);
seq_buf_printf(s, "%s %llu\n", memory_stats[i].name, size);
@@ -1520,7 +1547,7 @@ void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *
void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
{
/* Use static buffer, for the caller is holding oom_lock. */
- static char buf[PAGE_SIZE];
+ static char buf[SEQ_BUF_SIZE];
struct seq_buf s;
lockdep_assert_held(&oom_lock);
@@ -1546,7 +1573,7 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
pr_info("Memory cgroup stats for ");
pr_cont_cgroup_path(memcg->css.cgroup);
pr_cont(":");
- seq_buf_init(&s, buf, sizeof(buf));
+ seq_buf_init(&s, buf, SEQ_BUF_SIZE);
memory_stat_format(memcg, &s);
seq_buf_do_printk(&s, KERN_INFO);
}
@@ -2234,12 +2261,6 @@ retry:
*/
if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
goto retry;
- /*
- * At task move, charge accounts can be doubly counted. So, it's
- * better to wait until the end of task_move if something is going on.
- */
- if (memcg1_wait_acct_move(mem_over_limit))
- goto retry;
if (nr_retries--)
goto retry;
@@ -2373,9 +2394,7 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
*
* - the page lock
* - LRU isolation
- * - folio_memcg_lock()
* - exclusive reference
- * - mem_cgroup_trylock_pages()
*/
folio->memcg_data = (unsigned long)memcg;
}
@@ -3102,15 +3121,13 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
if (!parent)
parent = root_mem_cgroup;
- memcg_reparent_objcgs(memcg, parent);
+ memcg_reparent_list_lrus(memcg, parent);
/*
- * After we have finished memcg_reparent_objcgs(), all list_lrus
- * corresponding to this cgroup are guaranteed to remain empty.
- * The ordering is imposed by list_lru_node->lock taken by
- * memcg_reparent_list_lrus().
+ * Objcg's reparenting must come after list_lru's, to make sure the
+ * list_lru helpers won't use the parent's list_lru until the child is
+ * drained.
*/
- memcg_reparent_list_lrus(memcg, parent);
+ memcg_reparent_objcgs(memcg, parent);
}
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -3733,68 +3750,90 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
memcg_wb_domain_size_changed(memcg);
}
-static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
+struct aggregate_control {
+ /* pointer to the aggregated (CPU and subtree aggregated) counters */
+ long *aggregate;
+ /* pointer to the non-hierarchical (CPU aggregated) counters */
+ long *local;
+ /* pointer to the pending child counters during tree propagation */
+ long *pending;
+ /* pointer to the parent's pending counters, could be NULL */
+ long *ppending;
+ /* pointer to the percpu counters to be aggregated */
+ long *cstat;
+ /* pointer to the percpu counters of the last aggregation */
+ long *cstat_prev;
+ /* size of the above counters */
+ int size;
+};
+
+static void mem_cgroup_stat_aggregate(struct aggregate_control *ac)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- struct mem_cgroup *parent = parent_mem_cgroup(memcg);
- struct memcg_vmstats_percpu *statc;
+ int i;
long delta, delta_cpu, v;
- int i, nid;
-
- statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
- for (i = 0; i < MEMCG_VMSTAT_SIZE; i++) {
+ for (i = 0; i < ac->size; i++) {
/*
* Collect the aggregated propagation counts of groups
* below us. We're in a per-cpu loop here and this is
* a global counter, so the first cycle will get them.
*/
- delta = memcg->vmstats->state_pending[i];
+ delta = ac->pending[i];
if (delta)
- memcg->vmstats->state_pending[i] = 0;
+ ac->pending[i] = 0;
/* Add CPU changes on this level since the last flush */
delta_cpu = 0;
- v = READ_ONCE(statc->state[i]);
- if (v != statc->state_prev[i]) {
- delta_cpu = v - statc->state_prev[i];
+ v = READ_ONCE(ac->cstat[i]);
+ if (v != ac->cstat_prev[i]) {
+ delta_cpu = v - ac->cstat_prev[i];
delta += delta_cpu;
- statc->state_prev[i] = v;
+ ac->cstat_prev[i] = v;
}
/* Aggregate counts on this level and propagate upwards */
if (delta_cpu)
- memcg->vmstats->state_local[i] += delta_cpu;
+ ac->local[i] += delta_cpu;
if (delta) {
- memcg->vmstats->state[i] += delta;
- if (parent)
- parent->vmstats->state_pending[i] += delta;
+ ac->aggregate[i] += delta;
+ if (ac->ppending)
+ ac->ppending[i] += delta;
}
}
+}
- for (i = 0; i < NR_MEMCG_EVENTS; i++) {
- delta = memcg->vmstats->events_pending[i];
- if (delta)
- memcg->vmstats->events_pending[i] = 0;
-
- delta_cpu = 0;
- v = READ_ONCE(statc->events[i]);
- if (v != statc->events_prev[i]) {
- delta_cpu = v - statc->events_prev[i];
- delta += delta_cpu;
- statc->events_prev[i] = v;
- }
+static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+ struct memcg_vmstats_percpu *statc;
+ struct aggregate_control ac;
+ int nid;
- if (delta_cpu)
- memcg->vmstats->events_local[i] += delta_cpu;
+ statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
- if (delta) {
- memcg->vmstats->events[i] += delta;
- if (parent)
- parent->vmstats->events_pending[i] += delta;
- }
- }
+ ac = (struct aggregate_control) {
+ .aggregate = memcg->vmstats->state,
+ .local = memcg->vmstats->state_local,
+ .pending = memcg->vmstats->state_pending,
+ .ppending = parent ? parent->vmstats->state_pending : NULL,
+ .cstat = statc->state,
+ .cstat_prev = statc->state_prev,
+ .size = MEMCG_VMSTAT_SIZE,
+ };
+ mem_cgroup_stat_aggregate(&ac);
+
+ ac = (struct aggregate_control) {
+ .aggregate = memcg->vmstats->events,
+ .local = memcg->vmstats->events_local,
+ .pending = memcg->vmstats->events_pending,
+ .ppending = parent ? parent->vmstats->events_pending : NULL,
+ .cstat = statc->events,
+ .cstat_prev = statc->events_prev,
+ .size = NR_MEMCG_EVENTS,
+ };
+ mem_cgroup_stat_aggregate(&ac);
for_each_node_state(nid, N_MEMORY) {
struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
@@ -3807,28 +3846,17 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu);
- for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; i++) {
- delta = lstats->state_pending[i];
- if (delta)
- lstats->state_pending[i] = 0;
-
- delta_cpu = 0;
- v = READ_ONCE(lstatc->state[i]);
- if (v != lstatc->state_prev[i]) {
- delta_cpu = v - lstatc->state_prev[i];
- delta += delta_cpu;
- lstatc->state_prev[i] = v;
- }
-
- if (delta_cpu)
- lstats->state_local[i] += delta_cpu;
+ ac = (struct aggregate_control) {
+ .aggregate = lstats->state,
+ .local = lstats->state_local,
+ .pending = lstats->state_pending,
+ .ppending = plstats ? plstats->state_pending : NULL,
+ .cstat = lstatc->state,
+ .cstat_prev = lstatc->state_prev,
+ .size = NR_MEMCG_NODE_STAT_ITEMS,
+ };
+ mem_cgroup_stat_aggregate(&ac);
- if (delta) {
- lstats->state[i] += delta;
- if (plstats)
- plstats->state_pending[i] += delta;
- }
- }
}
WRITE_ONCE(statc->stats_updates, 0);
/* We are in a per-cpu loop here, only do the atomic write once */
@@ -4189,12 +4217,12 @@ static int memory_events_local_show(struct seq_file *m, void *v)
int memory_stat_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
- char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ char *buf = kmalloc(SEQ_BUF_SIZE, GFP_KERNEL);
struct seq_buf s;
if (!buf)
return -ENOMEM;
- seq_buf_init(&s, buf, PAGE_SIZE);
+ seq_buf_init(&s, buf, SEQ_BUF_SIZE);
memory_stat_format(memcg, &s);
seq_puts(m, buf);
kfree(buf);
@@ -4433,9 +4461,6 @@ struct cgroup_subsys memory_cgrp_subsys = {
.exit = mem_cgroup_exit,
.dfl_cftypes = memory_files,
#ifdef CONFIG_MEMCG_V1
- .can_attach = memcg1_can_attach,
- .cancel_attach = memcg1_cancel_attach,
- .post_attach = memcg1_move_task,
.legacy_cftypes = mem_cgroup_legacy_files,
#endif
.early_init = 0,
@@ -5277,11 +5302,8 @@ bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
break;
}
- /*
- * mem_cgroup_flush_stats() ignores small changes. Use
- * do_flush_stats() directly to get accurate stats for charging.
- */
- do_flush_stats(memcg);
+ /* Force flush to get accurate stats for charging */
+ __mem_cgroup_flush_stats(memcg, true);
pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE;
if (pages < max)
continue;
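The rstat flush refactor above folds three near-identical loops into one helper driven by struct aggregate_control. The following is a minimal user-space model of that propagation, with hypothetical names and a single-CPU view, just to show how per-CPU deltas land in the local counters while hierarchical deltas accumulate into the parent's pending array.

/* Illustrative model only - not the kernel code. */
#include <stdio.h>

#define NSTATS 3

struct level {
    long aggregate[NSTATS];  /* CPU + subtree aggregated */
    long local[NSTATS];      /* CPU aggregated only */
    long pending[NSTATS];    /* deltas queued by children */
    long cstat[NSTATS];      /* current per-CPU snapshot */
    long cstat_prev[NSTATS]; /* snapshot at the last flush */
};

static void aggregate(struct level *lvl, struct level *parent)
{
    for (int i = 0; i < NSTATS; i++) {
        long delta = lvl->pending[i];   /* what children propagated */

        lvl->pending[i] = 0;

        long delta_cpu = lvl->cstat[i] - lvl->cstat_prev[i];
        lvl->cstat_prev[i] = lvl->cstat[i];
        delta += delta_cpu;

        if (delta_cpu)
            lvl->local[i] += delta_cpu;
        if (delta) {
            lvl->aggregate[i] += delta;
            if (parent)
                parent->pending[i] += delta;
        }
    }
}

int main(void)
{
    struct level parent = {0}, child = {0};

    child.cstat[0] = 5;          /* e.g. 5 pages charged on this CPU */
    aggregate(&child, &parent);  /* child flush queues the delta upward */
    aggregate(&parent, NULL);    /* parent flush picks it up */

    printf("child local %ld, parent aggregate %ld\n",
           child.local[0], parent.aggregate[0]);
    return 0;
}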
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 96ce31e5a203..a7b8ccd29b6f 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -100,7 +100,7 @@ static ssize_t _name##_show(struct device *dev, \
{ \
struct memory_failure_stats *mf_stats = \
&NODE_DATA(dev->id)->mf_stats; \
- return sprintf(buf, "%lu\n", mf_stats->_name); \
+ return sysfs_emit(buf, "%lu\n", mf_stats->_name); \
} \
static DEVICE_ATTR_RO(_name)
@@ -445,7 +445,7 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
* Schedule a process for later kill.
* Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
*/
-static void __add_to_kill(struct task_struct *tsk, struct page *p,
+static void __add_to_kill(struct task_struct *tsk, const struct page *p,
struct vm_area_struct *vma, struct list_head *to_kill,
unsigned long addr)
{
@@ -461,7 +461,7 @@ static void __add_to_kill(struct task_struct *tsk, struct page *p,
if (is_zone_device_page(p))
tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
else
- tk->size_shift = page_shift(compound_head(p));
+ tk->size_shift = folio_shift(page_folio(p));
/*
* Send SIGKILL if "tk->addr == -EFAULT". Also, as
@@ -486,7 +486,7 @@ static void __add_to_kill(struct task_struct *tsk, struct page *p,
list_add_tail(&tk->nd, to_kill);
}
-static void add_to_kill_anon_file(struct task_struct *tsk, struct page *p,
+static void add_to_kill_anon_file(struct task_struct *tsk, const struct page *p,
struct vm_area_struct *vma, struct list_head *to_kill,
unsigned long addr)
{
@@ -509,7 +509,7 @@ static bool task_in_to_kill_list(struct list_head *to_kill,
return false;
}
-void add_to_kill_ksm(struct task_struct *tsk, struct page *p,
+void add_to_kill_ksm(struct task_struct *tsk, const struct page *p,
struct vm_area_struct *vma, struct list_head *to_kill,
unsigned long addr)
{
@@ -606,8 +606,9 @@ struct task_struct *task_early_kill(struct task_struct *tsk, int force_early)
/*
* Collect processes when the error hit an anonymous page.
*/
-static void collect_procs_anon(struct folio *folio, struct page *page,
- struct list_head *to_kill, int force_early)
+static void collect_procs_anon(const struct folio *folio,
+ const struct page *page, struct list_head *to_kill,
+ int force_early)
{
struct task_struct *tsk;
struct anon_vma *av;
@@ -617,7 +618,7 @@ static void collect_procs_anon(struct folio *folio, struct page *page,
if (av == NULL) /* Not actually mapped anymore */
return;
- pgoff = page_to_pgoff(page);
+ pgoff = page_pgoff(folio, page);
rcu_read_lock();
for_each_process(tsk) {
struct vm_area_struct *vma;
@@ -643,8 +644,9 @@ static void collect_procs_anon(struct folio *folio, struct page *page,
/*
* Collect processes when the error hit a file mapped page.
*/
-static void collect_procs_file(struct folio *folio, struct page *page,
- struct list_head *to_kill, int force_early)
+static void collect_procs_file(const struct folio *folio,
+ const struct page *page, struct list_head *to_kill,
+ int force_early)
{
struct vm_area_struct *vma;
struct task_struct *tsk;
@@ -653,7 +655,7 @@ static void collect_procs_file(struct folio *folio, struct page *page,
i_mmap_lock_read(mapping);
rcu_read_lock();
- pgoff = page_to_pgoff(page);
+ pgoff = page_pgoff(folio, page);
for_each_process(tsk) {
struct task_struct *t = task_early_kill(tsk, force_early);
unsigned long addr;
@@ -671,7 +673,7 @@ static void collect_procs_file(struct folio *folio, struct page *page,
*/
if (vma->vm_mm != t->mm)
continue;
- addr = page_address_in_vma(page, vma);
+ addr = page_address_in_vma(folio, page, vma);
add_to_kill_anon_file(t, page, vma, to_kill, addr);
}
}
@@ -680,7 +682,7 @@ static void collect_procs_file(struct folio *folio, struct page *page,
}
#ifdef CONFIG_FS_DAX
-static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p,
+static void add_to_kill_fsdax(struct task_struct *tsk, const struct page *p,
struct vm_area_struct *vma,
struct list_head *to_kill, pgoff_t pgoff)
{
@@ -691,7 +693,7 @@ static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p,
/*
* Collect processes when the error hit a fsdax page.
*/
-static void collect_procs_fsdax(struct page *page,
+static void collect_procs_fsdax(const struct page *page,
struct address_space *mapping, pgoff_t pgoff,
struct list_head *to_kill, bool pre_remove)
{
@@ -725,7 +727,7 @@ static void collect_procs_fsdax(struct page *page,
/*
* Collect the processes who have the corrupted page mapped to kill.
*/
-static void collect_procs(struct folio *folio, struct page *page,
+static void collect_procs(const struct folio *folio, const struct page *page,
struct list_head *tokill, int force_early)
{
if (!folio->mapping)
diff --git a/mm/memory.c b/mm/memory.c
index bdf77a3ec47b..75c2dfd04f72 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1,4 +1,3 @@
-
// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/memory.c
@@ -44,7 +43,6 @@
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/sched/mm.h>
-#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/hugetlb.h>
@@ -1061,8 +1059,7 @@ static inline struct folio *folio_prealloc(struct mm_struct *src_mm,
if (need_zero)
new_folio = vma_alloc_zeroed_movable_folio(vma, addr);
else
- new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma,
- addr, false);
+ new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr);
if (!new_folio)
return NULL;
@@ -1085,6 +1082,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
struct mm_struct *src_mm = src_vma->vm_mm;
pte_t *orig_src_pte, *orig_dst_pte;
pte_t *src_pte, *dst_pte;
+ pmd_t dummy_pmdval;
pte_t ptent;
spinlock_t *src_ptl, *dst_ptl;
int progress, max_nr, ret = 0;
@@ -1110,7 +1108,15 @@ again:
ret = -ENOMEM;
goto out;
}
- src_pte = pte_offset_map_nolock(src_mm, src_pmd, addr, &src_ptl);
+
+ /*
+ * We already hold the exclusive mmap_lock; copy_pte_range() and
+ * retract_page_tables() exclude each other via vma->anon_vma, so the
+ * PTE page is stable and there is no need to get the pmdval and do a
+ * pmd_same() check.
+ */
+ src_pte = pte_offset_map_rw_nolock(src_mm, src_pmd, addr, &dummy_pmdval,
+ &src_ptl);
if (!src_pte) {
pte_unmap_unlock(dst_pte, dst_ptl);
/* ret == 0 */
@@ -1449,7 +1455,7 @@ static inline bool should_zap_folio(struct zap_details *details,
return !folio_test_anon(folio);
}
-static inline bool zap_drop_file_uffd_wp(struct zap_details *details)
+static inline bool zap_drop_markers(struct zap_details *details)
{
if (!details)
return false;
@@ -1470,7 +1476,7 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
if (vma_is_anonymous(vma))
return;
- if (zap_drop_file_uffd_wp(details))
+ if (zap_drop_markers(details))
return;
for (;;) {
@@ -1665,7 +1671,15 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
* drop the marker if explicitly requested.
*/
if (!vma_is_anonymous(vma) &&
- !zap_drop_file_uffd_wp(details))
+ !zap_drop_markers(details))
+ continue;
+ } else if (is_guard_swp_entry(entry)) {
+ /*
+ * Ordinary zapping should not remove guard PTE
+ * markers. Only do so if we should remove PTE markers
+ * in general.
+ */
+ if (!zap_drop_markers(details))
continue;
} else if (is_hwpoison_entry(entry) ||
is_poisoned_swp_entry(entry)) {
@@ -3997,6 +4011,10 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
if (marker & PTE_MARKER_POISONED)
return VM_FAULT_HWPOISON;
+ /* Hitting a guard page is always a fatal condition. */
+ if (marker & PTE_MARKER_GUARD)
+ return VM_FAULT_SIGSEGV;
+
if (pte_marker_entry_uffd_wp(entry))
return pte_marker_handle_uffd_wp(vmf);
@@ -4010,8 +4028,7 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf)
struct folio *folio;
swp_entry_t entry;
- folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma,
- vmf->address, false);
+ folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address);
if (!folio)
return NULL;
@@ -4167,7 +4184,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
gfp = vma_thp_gfp_mask(vma);
while (orders) {
addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
- folio = vma_alloc_folio(gfp, order, vma, addr, true);
+ folio = vma_alloc_folio(gfp, order, vma, addr);
if (folio) {
if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
gfp, entry))
@@ -4706,7 +4723,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
gfp = vma_thp_gfp_mask(vma);
while (orders) {
addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
- folio = vma_alloc_folio(gfp, order, vma, addr, true);
+ folio = vma_alloc_folio(gfp, order, vma, addr);
if (folio) {
if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
@@ -4714,7 +4731,15 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
goto next;
}
folio_throttle_swaprate(folio, gfp);
- folio_zero_user(folio, vmf->address);
+ /*
+ * When a folio is not zeroed during allocation
+ * (__GFP_ZERO not used), folio_zero_user() is used
+ * to make sure that the page corresponding to the
+ * faulting address will be hot in the cache after
+ * zeroing.
+ */
+ if (!alloc_zeroed())
+ folio_zero_user(folio, vmf->address);
return folio;
}
next:
@@ -5743,14 +5768,24 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
vmf->pte = NULL;
vmf->flags &= ~FAULT_FLAG_ORIG_PTE_VALID;
} else {
+ pmd_t dummy_pmdval;
+
/*
* A regular pmd is established and it can't morph into a huge
* pmd by anon khugepaged, since that takes mmap_lock in write
* mode; but shmem or file collapse to THP could still morph
* it into a huge pmd: just retry later if so.
+ *
+ * Use the maywrite version to indicate that vmf->pte may be
+ * modified, but since we will use pte_same() to detect the
+ * change of the !pte_none() entry, there is no need to recheck
+ * the pmdval. Here we choose to pass a dummy variable instead
+ * of NULL, which helps new users think about why this place is
+ * special.
*/
- vmf->pte = pte_offset_map_nolock(vmf->vma->vm_mm, vmf->pmd,
- vmf->address, &vmf->ptl);
+ vmf->pte = pte_offset_map_rw_nolock(vmf->vma->vm_mm, vmf->pmd,
+ vmf->address, &dummy_pmdval,
+ &vmf->ptl);
if (unlikely(!vmf->pte))
return 0;
vmf->orig_pte = ptep_get_lockless(vmf->pte);
@@ -6365,7 +6400,7 @@ static inline void pfnmap_lockdep_assert(struct vm_area_struct *vma)
struct address_space *mapping = file ? file->f_mapping : NULL;
if (mapping)
- lockdep_assert(lockdep_is_held(&vma->vm_file->f_mapping->i_mmap_rwsem) ||
+ lockdep_assert(lockdep_is_held(&mapping->i_mmap_rwsem) ||
lockdep_is_held(&vma->vm_mm->mmap_lock));
else
lockdep_assert(lockdep_is_held(&vma->vm_mm->mmap_lock));
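The two mm/memory.c hunks above share one calling convention worth spelling out: pte_offset_map_rw_nolock() always wants a pmdval output, and a caller that can otherwise prove the PTE page is stable (here, holding the mmap_lock exclusively, or relying on a later pte_same() check) may pass a throwaway variable and skip the pmd_same() recheck. A minimal sketch, assuming a caller that holds the mmap_lock for write; the helper itself is illustrative, not part of the patch:

    /* Illustrative helper only, modelled on copy_pte_range() above. */
    static pte_t *map_pte_mmap_write_locked(struct mm_struct *mm, pmd_t *pmd,
                                            unsigned long addr, spinlock_t **ptlp)
    {
            pmd_t dummy_pmdval;

            mmap_assert_write_locked(mm);
            /*
             * The exclusive mmap_lock keeps the PTE page stable, so the
             * pmdval output is ignored and no pmd_same() recheck is done
             * after taking *ptlp.
             */
            return pte_offset_map_rw_nolock(mm, pmd, addr, &dummy_pmdval, ptlp);
    }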
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 621ae1015106..c43b4e7fb298 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1681,7 +1681,7 @@ struct range __weak arch_get_mappable_range(void)
struct range mhp_get_pluggable_range(bool need_mapping)
{
- const u64 max_phys = PHYSMEM_END;
+ const u64 max_phys = DIRECT_MAP_PHYSMEM_END;
struct range mhp_range;
if (need_mapping) {
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b646fab3e45e..bb37cd1a51d8 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -8,7 +8,7 @@
* NUMA policy allows the user to give hints in which node(s) memory should
* be allocated.
*
- * Support four policies per VMA and per process:
+ * Support six policies per VMA and per process:
*
* The VMA policy has priority over the process policy for a page fault.
*
@@ -1367,7 +1367,7 @@ static long do_mbind(unsigned long start, unsigned long len,
if (!list_entry_is_head(folio, &pagelist, lru)) {
vma_iter_init(&vmi, mm, start);
for_each_vma_range(vmi, vma, end) {
- addr = page_address_in_vma(
+ addr = page_address_in_vma(folio,
folio_page(folio, 0), vma);
if (addr != -EFAULT)
break;
@@ -2290,7 +2290,6 @@ struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
* @order: Order of the folio.
* @vma: Pointer to VMA.
* @addr: Virtual address of the allocation. Must be inside @vma.
- * @hugepage: Unused (was: For hugepages try only preferred node if possible).
*
* Allocate a folio for a specific address in @vma, using the appropriate
* NUMA policy. The caller must hold the mmap_lock of the mm_struct of the
@@ -2301,7 +2300,7 @@ struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
* Return: The folio on success or NULL if allocation fails.
*/
struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma,
- unsigned long addr, bool hugepage)
+ unsigned long addr)
{
struct mempolicy *pol;
pgoff_t ilx;
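For reference, a hedged sketch of what call sites look like after the signature change above: the unused hugepage flag is gone, so callers simply drop their trailing true/false argument. The surrounding fault-handler variables are assumed context for illustration only:

    /* Illustrative call site, mirroring alloc_anon_folio()/alloc_swap_folio(). */
    gfp_t gfp = vma_thp_gfp_mask(vma);
    unsigned long addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);

    /* Was: vma_alloc_folio(gfp, order, vma, addr, true); */
    folio = vma_alloc_folio(gfp, order, vma, addr);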
diff --git a/mm/migrate.c b/mm/migrate.c
index 47a8d102ae7a..2ce6b4b814df 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -695,6 +695,7 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
if (folio_test_idle(folio))
folio_set_idle(newfolio);
+ folio_migrate_refs(newfolio, folio);
/*
* Copy NUMA information to the new page, to prevent over-eager
* future migrations of this same page.
@@ -1732,7 +1733,7 @@ static int migrate_pages_batch(struct list_head *from,
list_for_each_entry_safe(folio, folio2, from, lru) {
is_large = folio_test_large(folio);
- is_thp = is_large && folio_test_pmd_mappable(folio);
+ is_thp = folio_test_pmd_mappable(folio);
nr_pages = folio_nr_pages(folio);
cond_resched();
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 4ba5607aaf19..1c205b0a86ed 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -83,8 +83,7 @@ void __init mminit_verify_pageflags_layout(void)
unsigned long or_mask, add_mask;
shift = BITS_PER_LONG;
- width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
- - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH;
+ width = shift - NR_NON_PAGEFLAG_BITS;
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n",
SECTIONS_WIDTH,
@@ -2639,7 +2638,7 @@ void __init mm_core_init(void)
BUILD_BUG_ON(MAX_ZONELISTS > 2);
build_all_zonelists(NULL);
page_alloc_init_cpuhp();
-
+ alloc_tag_sec_init();
/*
* page_ext requires contiguous pages,
* bigger than MAX_PAGE_ORDER unless SPARSEMEM.
diff --git a/mm/mmap.c b/mm/mmap.c
index 79d541f1502b..386429f7db5a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -577,22 +577,6 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
}
#endif /* __ARCH_WANT_SYS_OLD_MMAP */
-/*
- * We account for memory if it's a private writeable mapping,
- * not hugepages and VM_NORESERVE wasn't set.
- */
-static inline bool accountable_mapping(struct file *file, vm_flags_t vm_flags)
-{
- /*
- * hugetlb has its own accounting separate from the core VM
- * VM_HUGETLB may not be set yet so we cannot check for that flag.
- */
- if (file && is_file_hugepages(file))
- return false;
-
- return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
-}
-
/**
* unmapped_area() - Find an area between the low_limit and the high_limit with
* the correct alignment and offset, all from @info. Note: current->mm is used
@@ -776,6 +760,8 @@ generic_get_unmapped_area(struct file *filp, unsigned long addr,
info.low_limit = mm->mmap_base;
info.high_limit = mmap_end;
info.start_gap = stack_guard_placement(vm_flags);
+ if (filp && is_file_hugepages(filp))
+ info.align_mask = huge_page_mask_align(filp);
return vm_unmapped_area(&info);
}
@@ -826,6 +812,8 @@ generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
info.low_limit = PAGE_SIZE;
info.high_limit = arch_get_mmap_base(addr, mm->mmap_base);
info.start_gap = stack_guard_placement(vm_flags);
+ if (filp && is_file_hugepages(filp))
+ info.align_mask = huge_page_mask_align(filp);
addr = vm_unmapped_area(&info);
/*
@@ -1051,6 +1039,8 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
if (!(vma->vm_flags & VM_GROWSUP))
return -EFAULT;
+ mmap_assert_write_locked(mm);
+
/* Guard against exceeding limits of the address space. */
address &= PAGE_MASK;
if (address >= (TASK_SIZE & PAGE_MASK))
@@ -1086,11 +1076,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
/* Lock the VMA before expanding to prevent concurrent page faults */
vma_start_write(vma);
- /*
- * vma->vm_start/vm_end cannot change under us because the caller
- * is required to hold the mmap_lock in read mode. We need the
- * anon_vma lock to serialize against concurrent expand_stacks.
- */
+ /* We update the anon VMA tree. */
anon_vma_lock_write(vma->anon_vma);
/* Somebody else might have raced and expanded it already */
@@ -1104,16 +1090,6 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
error = acct_stack_growth(vma, size, grow);
if (!error) {
- /*
- * We only hold a shared mmap_lock lock here, so
- * we need to protect against concurrent vma
- * expansions. anon_vma_lock_write() doesn't
- * help here, as we don't guarantee that all
- * growable vmas in a mm share the same root
- * anon vma. So, we reuse mm->page_table_lock
- * to guard against concurrent vma expansions.
- */
- spin_lock(&mm->page_table_lock);
if (vma->vm_flags & VM_LOCKED)
mm->locked_vm += grow;
vm_stat_account(mm, vma->vm_flags, grow);
@@ -1122,7 +1098,6 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
/* Overwrite old entry in mtree. */
vma_iter_store(&vmi, vma);
anon_vma_interval_tree_post_update_vma(vma);
- spin_unlock(&mm->page_table_lock);
perf_event_mmap(vma);
}
@@ -1149,6 +1124,8 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
if (!(vma->vm_flags & VM_GROWSDOWN))
return -EFAULT;
+ mmap_assert_write_locked(mm);
+
address &= PAGE_MASK;
if (address < mmap_min_addr || address < FIRST_USER_ADDRESS)
return -EPERM;
@@ -1178,11 +1155,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
/* Lock the VMA before expanding to prevent concurrent page faults */
vma_start_write(vma);
- /*
- * vma->vm_start/vm_end cannot change under us because the caller
- * is required to hold the mmap_lock in read mode. We need the
- * anon_vma lock to serialize against concurrent expand_stacks.
- */
+ /* We update the anon VMA tree. */
anon_vma_lock_write(vma->anon_vma);
/* Somebody else might have raced and expanded it already */
@@ -1196,16 +1169,6 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
if (grow <= vma->vm_pgoff) {
error = acct_stack_growth(vma, size, grow);
if (!error) {
- /*
- * We only hold a shared mmap_lock lock here, so
- * we need to protect against concurrent vma
- * expansions. anon_vma_lock_write() doesn't
- * help here, as we don't guarantee that all
- * growable vmas in a mm share the same root
- * anon vma. So, we reuse mm->page_table_lock
- * to guard against concurrent vma expansions.
- */
- spin_lock(&mm->page_table_lock);
if (vma->vm_flags & VM_LOCKED)
mm->locked_vm += grow;
vm_stat_account(mm, vma->vm_flags, grow);
@@ -1215,7 +1178,6 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
/* Overwrite old entry in mtree. */
vma_iter_store(&vmi, vma);
anon_vma_interval_tree_post_update_vma(vma);
- spin_unlock(&mm->page_table_lock);
perf_event_mmap(vma);
}
@@ -1358,224 +1320,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
return do_vmi_munmap(&vmi, mm, start, len, uf, false);
}
-static unsigned long __mmap_region(struct file *file, unsigned long addr,
- unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
- struct list_head *uf)
-{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma = NULL;
- pgoff_t pglen = PHYS_PFN(len);
- unsigned long charged = 0;
- struct vma_munmap_struct vms;
- struct ma_state mas_detach;
- struct maple_tree mt_detach;
- unsigned long end = addr + len;
- int error;
- VMA_ITERATOR(vmi, mm, addr);
- VMG_STATE(vmg, mm, &vmi, addr, end, vm_flags, pgoff);
-
- vmg.file = file;
- /* Find the first overlapping VMA */
- vma = vma_find(&vmi, end);
- init_vma_munmap(&vms, &vmi, vma, addr, end, uf, /* unlock = */ false);
- if (vma) {
- mt_init_flags(&mt_detach, vmi.mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
- mt_on_stack(mt_detach);
- mas_init(&mas_detach, &mt_detach, /* addr = */ 0);
- /* Prepare to unmap any existing mapping in the area */
- error = vms_gather_munmap_vmas(&vms, &mas_detach);
- if (error)
- goto gather_failed;
-
- vmg.next = vms.next;
- vmg.prev = vms.prev;
- vma = NULL;
- } else {
- vmg.next = vma_iter_next_rewind(&vmi, &vmg.prev);
- }
-
- /* Check against address space limit. */
- if (!may_expand_vm(mm, vm_flags, pglen - vms.nr_pages)) {
- error = -ENOMEM;
- goto abort_munmap;
- }
-
- /*
- * Private writable mapping: check memory availability
- */
- if (accountable_mapping(file, vm_flags)) {
- charged = pglen;
- charged -= vms.nr_accounted;
- if (charged) {
- error = security_vm_enough_memory_mm(mm, charged);
- if (error)
- goto abort_munmap;
- }
-
- vms.nr_accounted = 0;
- vm_flags |= VM_ACCOUNT;
- vmg.flags = vm_flags;
- }
-
- /*
- * clear PTEs while the vma is still in the tree so that rmap
- * cannot race with the freeing later in the truncate scenario.
- * This is also needed for mmap_file(), which is why vm_ops
- * close function is called.
- */
- vms_clean_up_area(&vms, &mas_detach);
- vma = vma_merge_new_range(&vmg);
- if (vma)
- goto expanded;
- /*
- * Determine the object being mapped and call the appropriate
- * specific mapper. the address has already been validated, but
- * not unmapped, but the maps are removed from the list.
- */
- vma = vm_area_alloc(mm);
- if (!vma) {
- error = -ENOMEM;
- goto unacct_error;
- }
-
- vma_iter_config(&vmi, addr, end);
- vma_set_range(vma, addr, end, pgoff);
- vm_flags_init(vma, vm_flags);
- vma->vm_page_prot = vm_get_page_prot(vm_flags);
-
- if (vma_iter_prealloc(&vmi, vma)) {
- error = -ENOMEM;
- goto free_vma;
- }
-
- if (file) {
- vma->vm_file = get_file(file);
- error = mmap_file(file, vma);
- if (error)
- goto unmap_and_free_file_vma;
-
- /* Drivers cannot alter the address of the VMA. */
- WARN_ON_ONCE(addr != vma->vm_start);
- /*
- * Drivers should not permit writability when previously it was
- * disallowed.
- */
- VM_WARN_ON_ONCE(vm_flags != vma->vm_flags &&
- !(vm_flags & VM_MAYWRITE) &&
- (vma->vm_flags & VM_MAYWRITE));
-
- vma_iter_config(&vmi, addr, end);
- /*
- * If vm_flags changed after mmap_file(), we should try merge
- * vma again as we may succeed this time.
- */
- if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) {
- struct vm_area_struct *merge;
-
- vmg.flags = vma->vm_flags;
- /* If this fails, state is reset ready for a reattempt. */
- merge = vma_merge_new_range(&vmg);
-
- if (merge) {
- /*
- * ->mmap() can change vma->vm_file and fput
- * the original file. So fput the vma->vm_file
- * here or we would add an extra fput for file
- * and cause general protection fault
- * ultimately.
- */
- fput(vma->vm_file);
- vm_area_free(vma);
- vma = merge;
- /* Update vm_flags to pick up the change. */
- vm_flags = vma->vm_flags;
- goto file_expanded;
- }
- vma_iter_config(&vmi, addr, end);
- }
-
- vm_flags = vma->vm_flags;
- } else if (vm_flags & VM_SHARED) {
- error = shmem_zero_setup(vma);
- if (error)
- goto free_iter_vma;
- } else {
- vma_set_anonymous(vma);
- }
-
-#ifdef CONFIG_SPARC64
- /* TODO: Fix SPARC ADI! */
- WARN_ON_ONCE(!arch_validate_flags(vm_flags));
-#endif
-
- /* Lock the VMA since it is modified after insertion into VMA tree */
- vma_start_write(vma);
- vma_iter_store(&vmi, vma);
- mm->map_count++;
- vma_link_file(vma);
-
- /*
- * vma_merge_new_range() calls khugepaged_enter_vma() too, the below
- * call covers the non-merge case.
- */
- khugepaged_enter_vma(vma, vma->vm_flags);
-
-file_expanded:
- file = vma->vm_file;
- ksm_add_vma(vma);
-expanded:
- perf_event_mmap(vma);
-
- /* Unmap any existing mapping in the area */
- vms_complete_munmap_vmas(&vms, &mas_detach);
-
- vm_stat_account(mm, vm_flags, pglen);
- if (vm_flags & VM_LOCKED) {
- if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
- is_vm_hugetlb_page(vma) ||
- vma == get_gate_vma(current->mm))
- vm_flags_clear(vma, VM_LOCKED_MASK);
- else
- mm->locked_vm += pglen;
- }
-
- if (file)
- uprobe_mmap(vma);
-
- /*
- * New (or expanded) vma always get soft dirty status.
- * Otherwise user-space soft-dirty page tracker won't
- * be able to distinguish situation when vma area unmapped,
- * then new mapped in-place (which must be aimed as
- * a completely new data area).
- */
- vm_flags_set(vma, VM_SOFTDIRTY);
-
- vma_set_page_prot(vma);
-
- return addr;
-
-unmap_and_free_file_vma:
- fput(vma->vm_file);
- vma->vm_file = NULL;
-
- vma_iter_set(&vmi, vma->vm_end);
- /* Undo any partial mapping done by a device driver. */
- unmap_region(&vmi.mas, vma, vmg.prev, vmg.next);
-free_iter_vma:
- vma_iter_free(&vmi);
-free_vma:
- vm_area_free(vma);
-unacct_error:
- if (charged)
- vm_unacct_memory(charged);
-
-abort_munmap:
- vms_abort_munmap_vmas(&vms, &mas_detach);
-gather_failed:
- return error;
-}
-
unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
struct list_head *uf)
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index 368b840e7508..f186d57df2c6 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -19,43 +19,23 @@ EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);
#ifdef CONFIG_MEMCG
-static atomic_t reg_refcount;
-
/*
* Size of the buffer for memcg path names. Ignoring stack trace support,
* trace_events_hist.c uses MAX_FILTER_STR_VAL for this, so we also use it.
*/
#define MEMCG_PATH_BUF_SIZE MAX_FILTER_STR_VAL
-int trace_mmap_lock_reg(void)
-{
- atomic_inc(&reg_refcount);
- return 0;
-}
-
-void trace_mmap_lock_unreg(void)
-{
- atomic_dec(&reg_refcount);
-}
-
-#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
- do { \
- char buf[MEMCG_PATH_BUF_SIZE]; \
- get_mm_memcg_path(mm, buf, sizeof(buf)); \
- trace_mmap_lock_##type(mm, buf, ##__VA_ARGS__); \
+#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
+ do { \
+ if (trace_mmap_lock_##type##_enabled()) { \
+ char buf[MEMCG_PATH_BUF_SIZE]; \
+ get_mm_memcg_path(mm, buf, sizeof(buf)); \
+ trace_mmap_lock_##type(mm, buf, ##__VA_ARGS__); \
+ } \
} while (0)
#else /* !CONFIG_MEMCG */
-int trace_mmap_lock_reg(void)
-{
- return 0;
-}
-
-void trace_mmap_lock_unreg(void)
-{
-}
-
#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
trace_mmap_lock_##type(mm, "", ##__VA_ARGS__)
@@ -65,16 +45,13 @@ void trace_mmap_lock_unreg(void)
#ifdef CONFIG_MEMCG
/*
* Write the given mm_struct's memcg path to a buffer. If the path cannot be
- * determined or the trace event is being unregistered, empty string is written.
+ * determined, empty string is written.
*/
static void get_mm_memcg_path(struct mm_struct *mm, char *buf, size_t buflen)
{
struct mem_cgroup *memcg;
buf[0] = '\0';
- /* No need to get path if no trace event is registered. */
- if (!atomic_read(&reg_refcount))
- return;
memcg = get_mem_cgroup_from_mm(mm);
if (memcg == NULL)
return;
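The mmap_lock tracing rework above drops the open-coded registration refcount in favour of the generated trace_<event>_enabled() test, which is backed by a static key and effectively free while the tracepoint is off. The same pattern in isolation, with placeholder event and helper names (not real mmap_lock symbols):

    if (trace_my_event_enabled()) {         /* static branch, ~free when off */
            char buf[MEMCG_PATH_BUF_SIZE];

            build_expensive_argument(mm, buf, sizeof(buf)); /* hypothetical helper */
            trace_my_event(mm, buf);
    }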
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 6f450af3252e..516b1d847e2c 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -236,9 +236,11 @@ static long change_pte_range(struct mmu_gather *tlb,
} else if (is_pte_marker_entry(entry)) {
/*
* Ignore error swap entries unconditionally,
- * because any access should sigbus anyway.
+ * because any access should sigbus/sigsegv
+ * anyway.
*/
- if (is_poisoned_swp_entry(entry))
+ if (is_poisoned_swp_entry(entry) ||
+ is_guard_swp_entry(entry))
continue;
/*
* If this is uffd-wp pte marker and we'd like
diff --git a/mm/mremap.c b/mm/mremap.c
index dee98ff2bbd6..60473413836b 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -140,6 +140,7 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
{
struct mm_struct *mm = vma->vm_mm;
pte_t *old_pte, *new_pte, pte;
+ pmd_t dummy_pmdval;
spinlock_t *old_ptl, *new_ptl;
bool force_flush = false;
unsigned long len = old_end - old_addr;
@@ -175,7 +176,15 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
err = -EAGAIN;
goto out;
}
- new_pte = pte_offset_map_nolock(mm, new_pmd, new_addr, &new_ptl);
+ /*
+ * Now new_pte is none, so the hpage_collapse_scan_file() path cannot
+ * find this by traversing file->f_mapping, so there is no concurrency
+ * with retract_page_tables(). In addition, we already hold the
+ * exclusive mmap_lock, so this new_pte page is stable and there is no
+ * need to get the pmdval and do a pmd_same() check.
+ */
+ new_pte = pte_offset_map_rw_nolock(mm, new_pmd, new_addr, &dummy_pmdval,
+ &new_ptl);
if (!new_pte) {
pte_unmap_unlock(old_pte, old_ptl);
err = -EAGAIN;
@@ -817,17 +826,24 @@ static unsigned long move_vma(struct vm_area_struct *vma,
return new_addr;
}
-static struct vm_area_struct *vma_to_resize(unsigned long addr,
+/*
+ * resize_is_valid() - Ensure the vma can be resized to the new length at the given
+ * address.
+ *
+ * @vma: The vma to resize
+ * @addr: The old address
+ * @old_len: The current size
+ * @new_len: The desired size
+ * @flags: The vma flags
+ *
+ * Return 0 on success, error otherwise.
+ */
+static int resize_is_valid(struct vm_area_struct *vma, unsigned long addr,
unsigned long old_len, unsigned long new_len, unsigned long flags)
{
struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
unsigned long pgoff;
- vma = vma_lookup(mm, addr);
- if (!vma)
- return ERR_PTR(-EFAULT);
-
/*
* !old_len is a special case where an attempt is made to 'duplicate'
* a mapping. This makes no sense for private mappings as it will
@@ -838,39 +854,53 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
*/
if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) {
pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. This is not supported.\n", current->comm, current->pid);
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
}
if ((flags & MREMAP_DONTUNMAP) &&
(vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)))
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
/* We can't remap across vm area boundaries */
if (old_len > vma->vm_end - addr)
- return ERR_PTR(-EFAULT);
+ return -EFAULT;
if (new_len == old_len)
- return vma;
+ return 0;
/* Need to be careful about a growing mapping */
pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
pgoff += vma->vm_pgoff;
if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
- return ERR_PTR(-EFAULT);
+ return -EFAULT;
if (!mlock_future_ok(mm, vma->vm_flags, new_len - old_len))
- return ERR_PTR(-EAGAIN);
+ return -EAGAIN;
if (!may_expand_vm(mm, vma->vm_flags,
(new_len - old_len) >> PAGE_SHIFT))
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
- return vma;
+ return 0;
}
+/*
+ * mremap_to() - remap a vma to a new location
+ * @addr: The old address
+ * @old_len: The old size
+ * @new_addr: The target address
+ * @new_len: The new size
+ * @locked: If the returned vma is locked (VM_LOCKED)
+ * @flags: the mremap flags
+ * @uf: The mremap userfaultfd context
+ * @uf_unmap_early: The userfaultfd unmap early context
+ * @uf_unmap: The userfaultfd unmap context
+ *
+ * Returns: The new address of the vma or an error.
+ */
static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
unsigned long new_addr, unsigned long new_len, bool *locked,
unsigned long flags, struct vm_userfaultfd_ctx *uf,
@@ -879,18 +909,18 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
- unsigned long ret = -EINVAL;
+ unsigned long ret;
unsigned long map_flags = 0;
if (offset_in_page(new_addr))
- goto out;
+ return -EINVAL;
if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
- goto out;
+ return -EINVAL;
/* Ensure the old/new locations do not overlap */
if (addr + old_len > new_addr && new_addr + new_len > addr)
- goto out;
+ return -EINVAL;
/*
* move_vma() need us to stay 4 maps below the threshold, otherwise
@@ -917,27 +947,28 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
*/
ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
if (ret)
- goto out;
+ return ret;
}
if (old_len > new_len) {
ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap);
if (ret)
- goto out;
+ return ret;
old_len = new_len;
}
- vma = vma_to_resize(addr, old_len, new_len, flags);
- if (IS_ERR(vma)) {
- ret = PTR_ERR(vma);
- goto out;
- }
+ vma = vma_lookup(mm, addr);
+ if (!vma)
+ return -EFAULT;
+
+ ret = resize_is_valid(vma, addr, old_len, new_len, flags);
+ if (ret)
+ return ret;
/* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */
if (flags & MREMAP_DONTUNMAP &&
!may_expand_vm(mm, vma->vm_flags, old_len >> PAGE_SHIFT)) {
- ret = -ENOMEM;
- goto out;
+ return -ENOMEM;
}
if (flags & MREMAP_FIXED)
@@ -950,17 +981,14 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
((addr - vma->vm_start) >> PAGE_SHIFT),
map_flags);
if (IS_ERR_VALUE(ret))
- goto out;
+ return ret;
/* We got a new mapping */
if (!(flags & MREMAP_FIXED))
new_addr = ret;
- ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf,
- uf_unmap);
-
-out:
- return ret;
+ return move_vma(vma, addr, old_len, new_len, new_addr, locked, flags,
+ uf, uf_unmap);
}
static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
@@ -1105,11 +1133,9 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
/*
* Ok, we need to grow..
*/
- vma = vma_to_resize(addr, old_len, new_len, flags);
- if (IS_ERR(vma)) {
- ret = PTR_ERR(vma);
+ ret = resize_is_valid(vma, addr, old_len, new_len, flags);
+ if (ret)
goto out;
- }
/* old_len exactly to the end of the area..
*/
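The net effect of the mremap.c hunks above is that lookup and validation are now separate steps, and errors propagate as plain -errno values rather than ERR_PTR-encoded VMAs. A condensed sketch of the new caller flow, trimmed from mremap_to() above for illustration:

    vma = vma_lookup(mm, addr);
    if (!vma)
            return -EFAULT;

    ret = resize_is_valid(vma, addr, old_len, new_len, flags);
    if (ret)
            return ret;     /* plain error code, no PTR_ERR() decoding */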
diff --git a/mm/mseal.c b/mm/mseal.c
index ece977bd21e1..81d6e980e8a9 100644
--- a/mm/mseal.c
+++ b/mm/mseal.c
@@ -30,6 +30,7 @@ static bool is_madv_discard(int behavior)
case MADV_REMOVE:
case MADV_DONTFORK:
case MADV_WIPEONFORK:
+ case MADV_GUARD_INSTALL:
return true;
}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 4d7a0004df2c..1c485beb0b93 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -24,7 +24,6 @@
#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
-#include <linux/sched/coredump.h>
#include <linux/sched/task.h>
#include <linux/sched/debug.h>
#include <linux/swap.h>
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 72a5d8836425..fdb89ce85fff 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -917,7 +917,9 @@ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc,
unsigned long thresh)
{
struct wb_domain *dom = dtc_dom(dtc);
+ struct bdi_writeback *wb = dtc->wb;
u64 wb_thresh;
+ u64 wb_max_thresh;
unsigned long numerator, denominator;
unsigned long wb_min_ratio, wb_max_ratio;
@@ -931,11 +933,28 @@ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc,
wb_thresh *= numerator;
wb_thresh = div64_ul(wb_thresh, denominator);
- wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio);
+ wb_min_max_ratio(wb, &wb_min_ratio, &wb_max_ratio);
wb_thresh += (thresh * wb_min_ratio) / (100 * BDI_RATIO_SCALE);
- if (wb_thresh > (thresh * wb_max_ratio) / (100 * BDI_RATIO_SCALE))
- wb_thresh = thresh * wb_max_ratio / (100 * BDI_RATIO_SCALE);
+ wb_max_thresh = thresh * wb_max_ratio / (100 * BDI_RATIO_SCALE);
+ if (wb_thresh > wb_max_thresh)
+ wb_thresh = wb_max_thresh;
+
+ /*
+ * With the strictlimit flag, the wb_thresh is treated as
+ * a hard limit in balance_dirty_pages() and wb_position_ratio().
+ * It's possible that wb_thresh is close to zero, not because
+ * the device is slow, but because it has been inactive.
+ * To prevent occasional writes from being blocked, we raise wb_thresh.
+ */
+ if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
+ unsigned long limit = hard_dirty_limit(dom, dtc->thresh);
+ u64 wb_scale_thresh = 0;
+
+ if (limit > dtc->dirty)
+ wb_scale_thresh = (limit - dtc->dirty) / 100;
+ wb_thresh = max(wb_thresh, min(wb_scale_thresh, wb_max_thresh / 4));
+ }
return wb_thresh;
}
@@ -2724,8 +2743,6 @@ EXPORT_SYMBOL(noop_dirty_folio);
/*
* Helper function for set_page_dirty family.
*
- * Caller must hold folio_memcg_lock().
- *
* NOTE: This relies on being atomic wrt interrupts.
*/
static void folio_account_dirtied(struct folio *folio,
@@ -2758,7 +2775,6 @@ static void folio_account_dirtied(struct folio *folio,
/*
* Helper function for deaccounting dirty page without writeback.
*
- * Caller must hold folio_memcg_lock().
*/
void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb)
{
@@ -2776,9 +2792,8 @@ void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb)
* If warn is true, then emit a warning if the folio is not uptodate and has
* not been truncated.
*
- * The caller must hold folio_memcg_lock(). It is the caller's
- * responsibility to prevent the folio from being truncated while
- * this function is in progress, although it may have been truncated
+ * It is the caller's responsibility to prevent the folio from being truncated
+ * while this function is in progress, although it may have been truncated
* before this function is called. Most callers have the folio locked.
* A few have the folio blocked from truncation through other means (e.g.
* zap_vma_pages() has it mapped and is holding the page table lock).
@@ -2822,14 +2837,10 @@ void __folio_mark_dirty(struct folio *folio, struct address_space *mapping,
*/
bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio)
{
- folio_memcg_lock(folio);
- if (folio_test_set_dirty(folio)) {
- folio_memcg_unlock(folio);
+ if (folio_test_set_dirty(folio))
return false;
- }
__folio_mark_dirty(folio, mapping, !folio_test_private(folio));
- folio_memcg_unlock(folio);
if (mapping->host) {
/* !PageAnon && !swapper_space */
@@ -2956,14 +2967,12 @@ void __folio_cancel_dirty(struct folio *folio)
struct bdi_writeback *wb;
struct wb_lock_cookie cookie = {};
- folio_memcg_lock(folio);
wb = unlocked_inode_to_wb_begin(inode, &cookie);
if (folio_test_clear_dirty(folio))
folio_account_cleaned(folio, wb);
unlocked_inode_to_wb_end(inode, &cookie);
- folio_memcg_unlock(folio);
} else {
folio_clear_dirty(folio);
}
@@ -3074,7 +3083,6 @@ bool __folio_end_writeback(struct folio *folio)
struct address_space *mapping = folio_mapping(folio);
bool ret;
- folio_memcg_lock(folio);
if (mapping && mapping_use_writeback_tags(mapping)) {
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -3105,7 +3113,6 @@ bool __folio_end_writeback(struct folio *folio)
lruvec_stat_mod_folio(folio, NR_WRITEBACK, -nr);
zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
node_stat_mod_folio(folio, NR_WRITTEN, nr);
- folio_memcg_unlock(folio);
return ret;
}
@@ -3118,7 +3125,6 @@ void __folio_start_writeback(struct folio *folio, bool keep_write)
VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio);
- folio_memcg_lock(folio);
if (mapping && mapping_use_writeback_tags(mapping)) {
XA_STATE(xas, &mapping->i_pages, folio_index(folio));
struct inode *inode = mapping->host;
@@ -3159,7 +3165,6 @@ void __folio_start_writeback(struct folio *folio, bool keep_write)
lruvec_stat_mod_folio(folio, NR_WRITEBACK, nr);
zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr);
- folio_memcg_unlock(folio);
access_ret = arch_make_folio_accessible(folio);
/*
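A worked example of the strictlimit floor introduced in __wb_calc_thresh() above, with assumed numbers: if the domain hard limit is 10000 pages, 4000 pages are currently dirty and wb_max_thresh is 1200 pages, then:

    wb_scale_thresh = (10000 - 4000) / 100  = 60
    floor           = min(60, 1200 / 4)     = 60
    wb_thresh       = max(wb_thresh, 60)

so a long-idle bdi gets a threshold of at least 60 pages instead of one near zero, and an occasional write is not immediately throttled.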
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 119a3a52f96e..1cb4b8c8886d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5377,7 +5377,7 @@ void __ref build_all_zonelists(pg_data_t *pgdat)
pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n",
nr_online_nodes,
- page_group_by_mobility_disabled ? "off" : "on",
+ str_off_on(page_group_by_mobility_disabled),
vm_total_pages);
#ifdef CONFIG_NUMA
pr_info("Policy zone: %s\n", zone_names[policy_zone]);
diff --git a/mm/page_io.c b/mm/page_io.c
index 01749b99fb54..4b4ea8e49cf6 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -277,6 +277,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
swap_zeromap_folio_clear(folio);
}
if (zswap_store(folio)) {
+ count_mthp_stat(folio_order(folio), MTHP_STAT_ZSWPOUT);
folio_unlock(folio);
return 0;
}
@@ -296,8 +297,9 @@ static inline void count_swpout_vm_event(struct folio *folio)
count_memcg_folio_events(folio, THP_SWPOUT, 1);
count_vm_event(THP_SWPOUT);
}
- count_mthp_stat(folio_order(folio), MTHP_STAT_SWPOUT);
#endif
+ count_mthp_stat(folio_order(folio), MTHP_STAT_SWPOUT);
+ count_memcg_folio_events(folio, PSWPOUT, folio_nr_pages(folio));
count_vm_events(PSWPOUT, folio_nr_pages(folio));
}
@@ -493,6 +495,8 @@ static void sio_read_complete(struct kiocb *iocb, long ret)
for (p = 0; p < sio->pages; p++) {
struct folio *folio = page_folio(sio->bvec[p].bv_page);
+ count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN);
+ count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio));
folio_mark_uptodate(folio);
folio_unlock(folio);
}
@@ -586,6 +590,8 @@ static void swap_read_folio_bdev_sync(struct folio *folio,
* attempt to access it in the page fault retry time check.
*/
get_task_struct(current);
+ count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN);
+ count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio));
count_vm_events(PSWPIN, folio_nr_pages(folio));
submit_bio_wait(&bio);
__end_swap_bio_read(&bio);
@@ -601,6 +607,8 @@ static void swap_read_folio_bdev_async(struct folio *folio,
bio->bi_iter.bi_sector = swap_folio_sector(folio);
bio->bi_end_io = end_swap_bio_read;
bio_add_folio_nofail(bio, folio, folio_size(folio), 0);
+ count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN);
+ count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio));
count_vm_events(PSWPIN, folio_nr_pages(folio));
submit_bio(bio);
}
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index ae5cc42aa208..81839a9e74f1 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -13,7 +13,8 @@ static inline bool not_found(struct page_vma_mapped_walk *pvmw)
return false;
}
-static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
+static bool map_pte(struct page_vma_mapped_walk *pvmw, pmd_t *pmdvalp,
+ spinlock_t **ptlp)
{
pte_t ptent;
@@ -25,6 +26,7 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
return !!pvmw->pte;
}
+again:
/*
* It is important to return the ptl corresponding to pte,
* in case *pvmw->pmd changes underneath us; so we need to
@@ -32,8 +34,8 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
* proceeds to loop over next ptes, and finds a match later.
* Though, in most cases, page lock already protects this.
*/
- pvmw->pte = pte_offset_map_nolock(pvmw->vma->vm_mm, pvmw->pmd,
- pvmw->address, ptlp);
+ pvmw->pte = pte_offset_map_rw_nolock(pvmw->vma->vm_mm, pvmw->pmd,
+ pvmw->address, pmdvalp, ptlp);
if (!pvmw->pte)
return false;
@@ -67,8 +69,13 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
} else if (!pte_present(ptent)) {
return false;
}
+ spin_lock(*ptlp);
+ if (unlikely(!pmd_same(*pmdvalp, pmdp_get_lockless(pvmw->pmd)))) {
+ pte_unmap_unlock(pvmw->pte, *ptlp);
+ goto again;
+ }
pvmw->ptl = *ptlp;
- spin_lock(pvmw->ptl);
+
return true;
}
@@ -278,7 +285,7 @@ restart:
step_forward(pvmw, PMD_SIZE);
continue;
}
- if (!map_pte(pvmw, &ptl)) {
+ if (!map_pte(pvmw, &pmde, &ptl)) {
if (!pvmw->pte)
goto restart;
goto next_pte;
@@ -305,8 +312,13 @@ next_pte:
} while (pte_none(ptep_get(pvmw->pte)));
if (!pvmw->ptl) {
+ spin_lock(ptl);
+ if (unlikely(!pmd_same(pmde, pmdp_get_lockless(pvmw->pmd)))) {
+ pte_unmap_unlock(pvmw->pte, ptl);
+ pvmw->pte = NULL;
+ goto restart;
+ }
pvmw->ptl = ptl;
- spin_lock(pvmw->ptl);
}
goto this_pte;
} while (pvmw->address < end);
@@ -325,10 +337,10 @@ next_pte:
* outside the VMA or not present, returns -EFAULT.
* Only valid for normal file or anonymous VMAs.
*/
-unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
+unsigned long page_mapped_in_vma(const struct page *page,
+ struct vm_area_struct *vma)
{
- struct folio *folio = page_folio(page);
- pgoff_t pgoff = folio->index + folio_page_idx(folio, page);
+ const struct folio *folio = page_folio(page);
struct page_vma_mapped_walk pvmw = {
.pfn = page_to_pfn(page),
.nr_pages = 1,
@@ -336,7 +348,7 @@ unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
.flags = PVMW_SYNC,
};
- pvmw.address = vma_address(vma, pgoff, 1);
+ pvmw.address = vma_address(vma, page_pgoff(folio, page), 1);
if (pvmw.address == -EFAULT)
goto out;
if (!page_vma_mapped_walk(&pvmw))
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 5f9f01532e67..e478777c86e1 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -3,9 +3,14 @@
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>
+#include <linux/mmu_context.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <asm/tlbflush.h>
+
+#include "internal.h"
+
/*
* We want to know the real level where a entry is located ignoring any
* folding of levels which may be happening. For example if p4d is folded then
@@ -29,9 +34,23 @@ static int walk_pte_range_inner(pte_t *pte, unsigned long addr,
int err = 0;
for (;;) {
- err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
- if (err)
- break;
+ if (ops->install_pte && pte_none(ptep_get(pte))) {
+ pte_t new_pte;
+
+ err = ops->install_pte(addr, addr + PAGE_SIZE, &new_pte,
+ walk);
+ if (err)
+ break;
+
+ set_pte_at(walk->mm, addr, pte, new_pte);
+ /* Non-present before, so for arches that need it. */
+ if (!WARN_ON_ONCE(walk->no_vma))
+ update_mmu_cache(walk->vma, addr, pte);
+ } else {
+ err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
+ if (err)
+ break;
+ }
if (addr >= end - PAGE_SIZE)
break;
addr += PAGE_SIZE;
@@ -81,6 +100,8 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
pmd_t *pmd;
unsigned long next;
const struct mm_walk_ops *ops = walk->ops;
+ bool has_handler = ops->pte_entry;
+ bool has_install = ops->install_pte;
int err = 0;
int depth = real_depth(3);
@@ -89,11 +110,14 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
again:
next = pmd_addr_end(addr, end);
if (pmd_none(*pmd)) {
- if (ops->pte_hole)
+ if (has_install)
+ err = __pte_alloc(walk->mm, pmd);
+ else if (ops->pte_hole)
err = ops->pte_hole(addr, next, depth, walk);
if (err)
break;
- continue;
+ if (!has_install)
+ continue;
}
walk->action = ACTION_SUBTREE;
@@ -109,18 +133,25 @@ again:
if (walk->action == ACTION_AGAIN)
goto again;
-
- /*
- * Check this here so we only break down trans_huge
- * pages when we _need_ to
- */
- if ((!walk->vma && (pmd_leaf(*pmd) || !pmd_present(*pmd))) ||
- walk->action == ACTION_CONTINUE ||
- !(ops->pte_entry))
+ if (walk->action == ACTION_CONTINUE)
continue;
+ if (!has_handler) { /* No handlers for lower page tables. */
+ if (!has_install)
+ continue; /* Nothing to do. */
+ /*
+ * We are ONLY installing, so avoid unnecessarily
+ * splitting a present huge page.
+ */
+ if (pmd_present(*pmd) &&
+ (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)))
+ continue;
+ }
+
if (walk->vma)
split_huge_pmd(walk->vma, pmd, addr);
+ else if (pmd_leaf(*pmd) || !pmd_present(*pmd))
+ continue; /* Nothing to do. */
err = walk_pte_range(pmd, addr, next, walk);
if (err)
@@ -140,6 +171,8 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
pud_t *pud;
unsigned long next;
const struct mm_walk_ops *ops = walk->ops;
+ bool has_handler = ops->pmd_entry || ops->pte_entry;
+ bool has_install = ops->install_pte;
int err = 0;
int depth = real_depth(2);
@@ -148,11 +181,14 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
again:
next = pud_addr_end(addr, end);
if (pud_none(*pud)) {
- if (ops->pte_hole)
+ if (has_install)
+ err = __pmd_alloc(walk->mm, pud, addr);
+ else if (ops->pte_hole)
err = ops->pte_hole(addr, next, depth, walk);
if (err)
break;
- continue;
+ if (!has_install)
+ continue;
}
walk->action = ACTION_SUBTREE;
@@ -164,14 +200,26 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
if (walk->action == ACTION_AGAIN)
goto again;
-
- if ((!walk->vma && (pud_leaf(*pud) || !pud_present(*pud))) ||
- walk->action == ACTION_CONTINUE ||
- !(ops->pmd_entry || ops->pte_entry))
+ if (walk->action == ACTION_CONTINUE)
continue;
+ if (!has_handler) { /* No handlers for lower page tables. */
+ if (!has_install)
+ continue; /* Nothing to do. */
+ /*
+ * We are ONLY installing, so avoid unnecessarily
+ * splitting a present huge page.
+ */
+ if (pud_present(*pud) &&
+ (pud_trans_huge(*pud) || pud_devmap(*pud)))
+ continue;
+ }
+
if (walk->vma)
split_huge_pud(walk->vma, pud, addr);
+ else if (pud_leaf(*pud) || !pud_present(*pud))
+ continue; /* Nothing to do. */
+
if (pud_none(*pud))
goto again;
@@ -189,6 +237,8 @@ static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
p4d_t *p4d;
unsigned long next;
const struct mm_walk_ops *ops = walk->ops;
+ bool has_handler = ops->pud_entry || ops->pmd_entry || ops->pte_entry;
+ bool has_install = ops->install_pte;
int err = 0;
int depth = real_depth(1);
@@ -196,18 +246,21 @@ static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
do {
next = p4d_addr_end(addr, end);
if (p4d_none_or_clear_bad(p4d)) {
- if (ops->pte_hole)
+ if (has_install)
+ err = __pud_alloc(walk->mm, p4d, addr);
+ else if (ops->pte_hole)
err = ops->pte_hole(addr, next, depth, walk);
if (err)
break;
- continue;
+ if (!has_install)
+ continue;
}
if (ops->p4d_entry) {
err = ops->p4d_entry(p4d, addr, next, walk);
if (err)
break;
}
- if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
+ if (has_handler || has_install)
err = walk_pud_range(p4d, addr, next, walk);
if (err)
break;
@@ -222,6 +275,9 @@ static int walk_pgd_range(unsigned long addr, unsigned long end,
pgd_t *pgd;
unsigned long next;
const struct mm_walk_ops *ops = walk->ops;
+ bool has_handler = ops->p4d_entry || ops->pud_entry || ops->pmd_entry ||
+ ops->pte_entry;
+ bool has_install = ops->install_pte;
int err = 0;
if (walk->pgd)
@@ -231,18 +287,21 @@ static int walk_pgd_range(unsigned long addr, unsigned long end,
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd)) {
- if (ops->pte_hole)
+ if (has_install)
+ err = __p4d_alloc(walk->mm, pgd, addr);
+ else if (ops->pte_hole)
err = ops->pte_hole(addr, next, 0, walk);
if (err)
break;
- continue;
+ if (!has_install)
+ continue;
}
if (ops->pgd_entry) {
err = ops->pgd_entry(pgd, addr, next, walk);
if (err)
break;
}
- if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry || ops->pte_entry)
+ if (has_handler || has_install)
err = walk_p4d_range(pgd, addr, next, walk);
if (err)
break;
@@ -334,6 +393,11 @@ static int __walk_page_range(unsigned long start, unsigned long end,
int err = 0;
struct vm_area_struct *vma = walk->vma;
const struct mm_walk_ops *ops = walk->ops;
+ bool is_hugetlb = is_vm_hugetlb_page(vma);
+
+ /* We do not support hugetlb PTE installation. */
+ if (ops->install_pte && is_hugetlb)
+ return -EINVAL;
if (ops->pre_vma) {
err = ops->pre_vma(start, end, walk);
@@ -341,7 +405,7 @@ static int __walk_page_range(unsigned long start, unsigned long end,
return err;
}
- if (is_vm_hugetlb_page(vma)) {
+ if (is_hugetlb) {
if (ops->hugetlb_entry)
err = walk_hugetlb_range(start, end, walk);
} else
@@ -380,47 +444,14 @@ static inline void process_vma_walk_lock(struct vm_area_struct *vma,
#endif
}
-/**
- * walk_page_range - walk page table with caller specific callbacks
- * @mm: mm_struct representing the target process of page table walk
- * @start: start address of the virtual address range
- * @end: end address of the virtual address range
- * @ops: operation to call during the walk
- * @private: private data for callbacks' usage
- *
- * Recursively walk the page table tree of the process represented by @mm
- * within the virtual address range [@start, @end). During walking, we can do
- * some caller-specific works for each entry, by setting up pmd_entry(),
- * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these
- * callbacks, the associated entries/pages are just ignored.
- * The return values of these callbacks are commonly defined like below:
- *
- * - 0 : succeeded to handle the current entry, and if you don't reach the
- * end address yet, continue to walk.
- * - >0 : succeeded to handle the current entry, and return to the caller
- * with caller specific value.
- * - <0 : failed to handle the current entry, and return to the caller
- * with error code.
- *
- * Before starting to walk page table, some callers want to check whether
- * they really want to walk over the current vma, typically by checking
- * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
- * purpose.
- *
- * If operations need to be staged before and committed after a vma is walked,
- * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(),
- * since it is intended to handle commit-type operations, can't return any
- * errors.
- *
- * struct mm_walk keeps current values of some common data like vma and pmd,
- * which are useful for the access from callbacks. If you want to pass some
- * caller-specific data to callbacks, @private should be helpful.
+/*
+ * See the comment for walk_page_range(); this performs the heavy lifting of the
+ * operation, but sets no restrictions on how the walk proceeds.
*
- * Locking:
- * Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock,
- * because these function traverse vma list and/or access to vma's data.
+ * We usually restrict the ability to install PTEs, but this functionality is
+ * available to internal memory management code and provided in mm/internal.h.
*/
-int walk_page_range(struct mm_struct *mm, unsigned long start,
+int walk_page_range_mm(struct mm_struct *mm, unsigned long start,
unsigned long end, const struct mm_walk_ops *ops,
void *private)
{
@@ -479,6 +510,80 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
return err;
}
+/*
+ * Determine if the walk operations specified are permitted to be used for a
+ * page table walk.
+ *
+ * This check is performed on all functions which are parameterised by walk
+ * operations and exposed in include/linux/pagewalk.h.
+ *
+ * Internal memory management code can use the walk_page_range_mm() function to
+ * be able to use all page walking operations.
+ */
+static bool check_ops_valid(const struct mm_walk_ops *ops)
+{
+ /*
+ * The installation of PTEs is solely under the control of memory
+ * management logic and subject to many subtle locking, security and
+ * cache considerations so we cannot permit other users to do so, and
+ * certainly not for exported symbols.
+ */
+ if (ops->install_pte)
+ return false;
+
+ return true;
+}
+
+/**
+ * walk_page_range - walk page table with caller specific callbacks
+ * @mm: mm_struct representing the target process of page table walk
+ * @start: start address of the virtual address range
+ * @end: end address of the virtual address range
+ * @ops: operation to call during the walk
+ * @private: private data for callbacks' usage
+ *
+ * Recursively walk the page table tree of the process represented by @mm
+ * within the virtual address range [@start, @end). During walking, we can do
+ * some caller-specific works for each entry, by setting up pmd_entry(),
+ * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these
+ * callbacks, the associated entries/pages are just ignored.
+ * The return values of these callbacks are commonly defined like below:
+ *
+ * - 0 : succeeded to handle the current entry, and if you don't reach the
+ * end address yet, continue to walk.
+ * - >0 : succeeded to handle the current entry, and return to the caller
+ * with caller specific value.
+ * - <0 : failed to handle the current entry, and return to the caller
+ * with error code.
+ *
+ * Before starting to walk page table, some callers want to check whether
+ * they really want to walk over the current vma, typically by checking
+ * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
+ * purpose.
+ *
+ * If operations need to be staged before and committed after a vma is walked,
+ * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(),
+ * since it is intended to handle commit-type operations, can't return any
+ * errors.
+ *
+ * struct mm_walk keeps current values of some common data like vma and pmd,
+ * which are useful for the access from callbacks. If you want to pass some
+ * caller-specific data to callbacks, @private should be helpful.
+ *
+ * Locking:
+ * Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock,
+ * because these function traverse vma list and/or access to vma's data.
+ */
+int walk_page_range(struct mm_struct *mm, unsigned long start,
+ unsigned long end, const struct mm_walk_ops *ops,
+ void *private)
+{
+ if (!check_ops_valid(ops))
+ return -EINVAL;
+
+ return walk_page_range_mm(mm, start, end, ops, private);
+}
+
/**
* walk_page_range_novma - walk a range of pagetables not backed by a vma
* @mm: mm_struct representing the target process of page table walk
@@ -494,7 +599,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
* walking the kernel pages tables or page tables for firmware.
*
* Note: Be careful to walk the kernel pages tables, the caller may be need to
- * take other effective approache (mmap lock may be insufficient) to prevent
+ * take other effective approaches (mmap lock may be insufficient) to prevent
* the intermediate kernel page tables belonging to the specified address range
* from being freed (e.g. memory hot-remove).
*/
@@ -513,6 +618,8 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
if (start >= end || !walk.mm)
return -EINVAL;
+ if (!check_ops_valid(ops))
+ return -EINVAL;
/*
* 1) For walking the user virtual address space:
@@ -556,6 +663,8 @@ int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
return -EINVAL;
if (start < vma->vm_start || end > vma->vm_end)
return -EINVAL;
+ if (!check_ops_valid(ops))
+ return -EINVAL;
process_mm_walk_lock(walk.mm, ops->walk_lock);
process_vma_walk_lock(vma, ops->walk_lock);
@@ -574,6 +683,8 @@ int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
if (!walk.mm)
return -EINVAL;
+ if (!check_ops_valid(ops))
+ return -EINVAL;
process_mm_walk_lock(walk.mm, ops->walk_lock);
process_vma_walk_lock(vma, ops->walk_lock);
@@ -623,6 +734,9 @@ int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
unsigned long start_addr, end_addr;
int err = 0;
+ if (!check_ops_valid(ops))
+ return -EINVAL;
+
lockdep_assert_held(&mapping->i_mmap_rwsem);
vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
first_index + nr - 1) {
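To make the new install_pte plumbing above concrete, here is a hedged sketch of an internal-only walker that fills empty PTE slots with a marker. Only the install_pte hook, walk_page_range_mm() and check_ops_valid() come from the hunks above; the callback name and the use of a guard marker are assumptions modelled on the guard-page series:

    /* Hypothetical internal user; real callers live in mm/ and use mm/internal.h. */
    static int install_marker_pte(unsigned long addr, unsigned long next,
                                  pte_t *ptep, struct mm_walk *walk)
    {
            /* Hand back the desired PTE value; the walker does set_pte_at(). */
            *ptep = make_pte_marker(PTE_MARKER_GUARD);
            return 0;
    }

    static const struct mm_walk_ops install_ops = {
            .install_pte = install_marker_pte,
            .walk_lock   = PGWALK_WRLOCK,
    };

    /*
     * Must go through walk_page_range_mm(): the exported walkers reject any
     * ops with ->install_pte via check_ops_valid().
     */
    err = walk_page_range_mm(vma->vm_mm, start, end, &install_ops, NULL);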
diff --git a/mm/percpu.c b/mm/percpu.c
index da21680ff294..d8dd31a2e407 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -253,13 +253,13 @@ static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
/* set the pointer to a chunk in a page struct */
static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
{
- page->index = (unsigned long)pcpu;
+ page->private = (unsigned long)pcpu;
}
/* obtain pointer to a chunk from a page struct */
static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
{
- return (struct pcpu_chunk *)page->index;
+ return (struct pcpu_chunk *)page->private;
}
static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
@@ -1864,6 +1864,10 @@ restart:
area_found:
pcpu_stats_area_alloc(chunk, size);
+
+ if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
+ pcpu_schedule_balance_work();
+
spin_unlock_irqrestore(&pcpu_lock, flags);
/* populate if not all pages are already there */
@@ -1891,9 +1895,6 @@ area_found:
mutex_unlock(&pcpu_alloc_mutex);
}
- if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
- pcpu_schedule_balance_work();
-
/* clear the areas and return address relative to base address */
for_each_possible_cpu(cpu)
memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index a78a4adf711a..5297dcc38c37 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -305,8 +305,8 @@ nomap:
return NULL;
}
-pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
- unsigned long addr, spinlock_t **ptlp)
+pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, spinlock_t **ptlp)
{
pmd_t pmdval;
pte_t *pte;
@@ -317,6 +317,19 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
return pte;
}
+pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, pmd_t *pmdvalp,
+ spinlock_t **ptlp)
+{
+ pte_t *pte;
+
+ VM_WARN_ON_ONCE(!pmdvalp);
+ pte = __pte_offset_map(pmd, addr, pmdvalp);
+ if (likely(pte))
+ *ptlp = pte_lockptr(mm, pmdvalp);
+ return pte;
+}
+
/*
* pte_offset_map_lock(mm, pmd, addr, ptlp), and its internal implementation
* __pte_offset_map_lock() below, is usually called with the pmd pointer for
@@ -347,14 +360,28 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
* and disconnected table. Until pte_unmap(pte) unmaps and rcu_read_unlock()s
* afterwards.
*
- * pte_offset_map_nolock(mm, pmd, addr, ptlp), above, is like pte_offset_map();
+ * pte_offset_map_ro_nolock(mm, pmd, addr, ptlp), above, is like pte_offset_map();
* but when successful, it also outputs a pointer to the spinlock in ptlp - as
* pte_offset_map_lock() does, but in this case without locking it. This helps
* the caller to avoid a later pte_lockptr(mm, *pmd), which might by that time
- * act on a changed *pmd: pte_offset_map_nolock() provides the correct spinlock
- * pointer for the page table that it returns. In principle, the caller should
- * recheck *pmd once the lock is taken; in practice, no callsite needs that -
- * either the mmap_lock for write, or pte_same() check on contents, is enough.
+ * act on a changed *pmd: pte_offset_map_ro_nolock() provides the correct spinlock
+ * pointer for the page table that it returns. Even after grabbing the spinlock,
+ * we might be looking either at a page table that is still mapped or one that
+ * was unmapped and is about to get freed. But for R/O access this is sufficient.
+ * So it is only applicable for read-only cases where any modification operations
+ * to the page table are not allowed even if the corresponding spinlock is held
+ * afterwards.
+ *
+ * pte_offset_map_rw_nolock(mm, pmd, addr, pmdvalp, ptlp), above, is like
+ * pte_offset_map_ro_nolock(); but when successful, it also outputs the pmdval.
+ * It is applicable for may-write cases where modifications to the page table
+ * may happen after the corresponding spinlock is held. But users must make
+ * sure the page table is stable, e.g. by checking pte_same() or by checking
+ * pmd_same() against the output pmdval, before performing any write
+ * operations.
+ *
+ * Note: "RO" / "RW" expresses the intended semantics, not that the *kmap* will
+ * be read-only/read-write protected.
*
* Note that free_pgtables(), used after unmapping detached vmas, or when
* exiting the whole mm, does not take page table lock before freeing a page
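A compact sketch of the two conventions the comment above documents, in one illustrative (not real) function: the R/O variant may only read through the returned pte, while the maywrite variant must prove stability, typically with the pmd_same() pattern used by map_pte() earlier in this series, before modifying anything:

    static void nolock_example(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
    {
            spinlock_t *ptl;
            pmd_t pmdval;
            pte_t *pte;

            /* R/O: read the entry, never modify the page table. */
            pte = pte_offset_map_ro_nolock(mm, pmd, addr, &ptl);
            if (pte)
                    pte_unmap(pte);

            /* Maywrite: lock, then prove the page table did not change. */
            pte = pte_offset_map_rw_nolock(mm, pmd, addr, &pmdval, &ptl);
            if (!pte)
                    return;
            spin_lock(ptl);
            if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
                    pte_unmap_unlock(pte, ptl);     /* table changed: retry or bail */
                    return;
            }
            /* ... safe to write to *pte here ... */
            pte_unmap_unlock(pte, ptl);
    }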
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index b308e96cd05a..656d3e88755b 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -201,8 +201,8 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter,
}
mm = mm_access(task, PTRACE_MODE_ATTACH_REALCREDS);
- if (!mm || IS_ERR(mm)) {
- rc = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
+ if (IS_ERR(mm)) {
+ rc = PTR_ERR(mm);
/*
* Explicitly map EACCES to EPERM as EPERM is a more
* appropriate error code for process_vw_readv/writev
diff --git a/mm/readahead.c b/mm/readahead.c
index 9a807727d809..8f1cf599b572 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -206,9 +206,9 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
unsigned long nr_to_read, unsigned long lookahead_size)
{
struct address_space *mapping = ractl->mapping;
- unsigned long ra_folio_index, index = readahead_index(ractl);
+ unsigned long index = readahead_index(ractl);
gfp_t gfp_mask = readahead_gfp_mask(mapping);
- unsigned long mark, i = 0;
+ unsigned long mark = ULONG_MAX, i = 0;
unsigned int min_nrpages = mapping_min_folio_nrpages(mapping);
/*
@@ -232,9 +232,14 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
* index that only has lookahead or "async_region" to set the
* readahead flag.
*/
- ra_folio_index = round_up(readahead_index(ractl) + nr_to_read - lookahead_size,
- min_nrpages);
- mark = ra_folio_index - index;
+ if (lookahead_size <= nr_to_read) {
+ unsigned long ra_folio_index;
+
+ ra_folio_index = round_up(readahead_index(ractl) +
+ nr_to_read - lookahead_size,
+ min_nrpages);
+ mark = ra_folio_index - index;
+ }
nr_to_read += readahead_index(ractl) - index;
ractl->_index = index;
diff --git a/mm/rmap.c b/mm/rmap.c
index 73d5998677d4..c6c4d4ea29a7 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -32,7 +32,6 @@
* swap_lock (in swap_duplicate, swap_info_get)
* mmlist_lock (in mmput, drain_mmlist and others)
* mapping->private_lock (in block_dirty_folio)
- * folio_lock_memcg move_lock (in block_dirty_folio)
* i_pages lock (widely used)
* lruvec->lru_lock (in folio_lruvec_lock_irq)
* inode->i_lock (in set_page_dirty's __mark_inode_dirty)
@@ -497,7 +496,7 @@ void __init anon_vma_init(void)
* concurrently without folio lock protection). See folio_lock_anon_vma_read()
* which has already covered that, and comment above remap_pages().
*/
-struct anon_vma *folio_get_anon_vma(struct folio *folio)
+struct anon_vma *folio_get_anon_vma(const struct folio *folio)
{
struct anon_vma *anon_vma = NULL;
unsigned long anon_mapping;
@@ -541,7 +540,7 @@ out:
* reference like with folio_get_anon_vma() and then block on the mutex
* on !rwc->try_lock case.
*/
-struct anon_vma *folio_lock_anon_vma_read(struct folio *folio,
+struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio,
struct rmap_walk_control *rwc)
{
struct anon_vma *anon_vma = NULL;
@@ -768,15 +767,27 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
}
#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
-/*
- * At what user virtual address is page expected in vma?
- * Caller should check the page is actually part of the vma.
+/**
+ * page_address_in_vma - The virtual address of a page in this VMA.
+ * @folio: The folio containing the page.
+ * @page: The page within the folio.
+ * @vma: The VMA we need to know the address in.
+ *
+ * Calculates the user virtual address of this page in the specified VMA.
+ * It is the caller's responsibility to check that the page is actually
+ * within the VMA. There may not currently be a PTE pointing at this
+ * page, but if a page fault occurs at this address, this is the page
+ * which will be accessed.
+ *
+ * Context: Caller should hold a reference to the folio. Caller should
+ * hold a lock (eg the i_mmap_lock or the mmap_lock) which keeps the
+ * VMA from being altered.
+ *
+ * Return: The virtual address corresponding to this page in the VMA.
*/
-unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
+unsigned long page_address_in_vma(const struct folio *folio,
+ const struct page *page, const struct vm_area_struct *vma)
{
- struct folio *folio = page_folio(page);
- pgoff_t pgoff;
-
if (folio_test_anon(folio)) {
struct anon_vma *page__anon_vma = folio_anon_vma(folio);
/*
@@ -792,9 +803,8 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
return -EFAULT;
}
- /* The !page__anon_vma above handles KSM folios */
- pgoff = folio->index + folio_page_idx(folio, page);
- return vma_address(vma, pgoff, 1);
+ /* KSM folios don't reach here because of the !page__anon_vma check */
+ return vma_address(vma, page_pgoff(folio, page), 1);
}
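The return value documented in the kernel-doc above comes down to the usual linear offset arithmetic. A simplified sketch follows; vma_address() itself is not shown in this hunk and also accounts for the span in pages, so this is an approximation, not the real helper:

/* Simplified sketch of the address math behind page_address_in_vma(). */
static unsigned long example_vma_address(const struct vm_area_struct *vma,
					 pgoff_t pgoff)
{
	unsigned long address;

	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
	if (address < vma->vm_start || address >= vma->vm_end)
		return -EFAULT;		/* page is not covered by this VMA */
	return address;
}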
/*
@@ -1261,8 +1271,9 @@ static void __folio_set_anon(struct folio *folio, struct vm_area_struct *vma,
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
*/
-static void __page_check_anon_rmap(struct folio *folio, struct page *page,
- struct vm_area_struct *vma, unsigned long address)
+static void __page_check_anon_rmap(const struct folio *folio,
+ const struct page *page, struct vm_area_struct *vma,
+ unsigned long address)
{
/*
* The page's anon-rmap details (mapping and index) are guaranteed to
@@ -1277,7 +1288,7 @@ static void __page_check_anon_rmap(struct folio *folio, struct page *page,
*/
VM_BUG_ON_FOLIO(folio_anon_vma(folio)->root != vma->anon_vma->root,
folio);
- VM_BUG_ON_PAGE(page_to_pgoff(page) != linear_page_index(vma, address),
+ VM_BUG_ON_PAGE(page_pgoff(folio, page) != linear_page_index(vma, address),
page);
}
@@ -2559,7 +2570,7 @@ void __put_anon_vma(struct anon_vma *anon_vma)
anon_vma_free(root);
}
-static struct anon_vma *rmap_walk_anon_lock(struct folio *folio,
+static struct anon_vma *rmap_walk_anon_lock(const struct folio *folio,
struct rmap_walk_control *rwc)
{
struct anon_vma *anon_vma;
diff --git a/mm/shmem.c b/mm/shmem.c
index c7881e16f4be..ccb9629a0f70 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -141,6 +141,7 @@ static unsigned long huge_shmem_orders_always __read_mostly;
static unsigned long huge_shmem_orders_madvise __read_mostly;
static unsigned long huge_shmem_orders_inherit __read_mostly;
static unsigned long huge_shmem_orders_within_size __read_mostly;
+static bool shmem_orders_configured __initdata;
#endif
#ifdef CONFIG_TMPFS
@@ -553,17 +554,15 @@ static bool shmem_confirm_swap(struct address_space *mapping,
static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;
-static bool __shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
- loff_t write_end, bool shmem_huge_force,
- struct vm_area_struct *vma,
- unsigned long vm_flags)
+static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
+ loff_t write_end, bool shmem_huge_force,
+ unsigned long vm_flags)
{
- struct mm_struct *mm = vma ? vma->vm_mm : NULL;
loff_t i_size;
- if (!S_ISREG(inode->i_mode))
+ if (HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER)
return false;
- if (mm && ((vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &mm->flags)))
+ if (!S_ISREG(inode->i_mode))
return false;
if (shmem_huge == SHMEM_HUGE_DENY)
return false;
@@ -581,7 +580,7 @@ static bool __shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
return true;
fallthrough;
case SHMEM_HUGE_ADVISE:
- if (mm && (vm_flags & VM_HUGEPAGE))
+ if (vm_flags & VM_HUGEPAGE)
return true;
fallthrough;
default:
@@ -589,35 +588,39 @@ static bool __shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
}
}
-static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
- loff_t write_end, bool shmem_huge_force,
- struct vm_area_struct *vma, unsigned long vm_flags)
+static int shmem_parse_huge(const char *str)
{
- if (HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER)
- return false;
+ int huge;
- return __shmem_huge_global_enabled(inode, index, write_end,
- shmem_huge_force, vma, vm_flags);
-}
+ if (!str)
+ return -EINVAL;
-#if defined(CONFIG_SYSFS)
-static int shmem_parse_huge(const char *str)
-{
if (!strcmp(str, "never"))
- return SHMEM_HUGE_NEVER;
- if (!strcmp(str, "always"))
- return SHMEM_HUGE_ALWAYS;
- if (!strcmp(str, "within_size"))
- return SHMEM_HUGE_WITHIN_SIZE;
- if (!strcmp(str, "advise"))
- return SHMEM_HUGE_ADVISE;
- if (!strcmp(str, "deny"))
- return SHMEM_HUGE_DENY;
- if (!strcmp(str, "force"))
- return SHMEM_HUGE_FORCE;
- return -EINVAL;
+ huge = SHMEM_HUGE_NEVER;
+ else if (!strcmp(str, "always"))
+ huge = SHMEM_HUGE_ALWAYS;
+ else if (!strcmp(str, "within_size"))
+ huge = SHMEM_HUGE_WITHIN_SIZE;
+ else if (!strcmp(str, "advise"))
+ huge = SHMEM_HUGE_ADVISE;
+ else if (!strcmp(str, "deny"))
+ huge = SHMEM_HUGE_DENY;
+ else if (!strcmp(str, "force"))
+ huge = SHMEM_HUGE_FORCE;
+ else
+ return -EINVAL;
+
+ if (!has_transparent_hugepage() &&
+ huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY)
+ return -EINVAL;
+
+ /* Do not override huge allocation policy with non-PMD sized mTHP */
+ if (huge == SHMEM_HUGE_FORCE &&
+ huge_shmem_orders_inherit != BIT(HPAGE_PMD_ORDER))
+ return -EINVAL;
+
+ return huge;
}
-#endif
#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
static const char *shmem_format_huge(int huge)
@@ -777,8 +780,8 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
}
static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
- loff_t write_end, bool shmem_huge_force,
- struct vm_area_struct *vma, unsigned long vm_flags)
+ loff_t write_end, bool shmem_huge_force,
+ unsigned long vm_flags)
{
return false;
}
@@ -1173,7 +1176,7 @@ static int shmem_getattr(struct mnt_idmap *idmap,
STATX_ATTR_NODUMP);
generic_fillattr(idmap, request_mask, inode, stat);
- if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0))
+ if (shmem_huge_global_enabled(inode, 0, 0, false, 0))
stat->blksize = HPAGE_PMD_SIZE;
if (request_mask & STATX_BTIME) {
@@ -1658,6 +1661,23 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+bool shmem_hpage_pmd_enabled(void)
+{
+ if (shmem_huge == SHMEM_HUGE_DENY)
+ return false;
+ if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_always))
+ return true;
+ if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_madvise))
+ return true;
+ if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_within_size))
+ return true;
+ if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_inherit) &&
+ shmem_huge != SHMEM_HUGE_NEVER)
+ return true;
+
+ return false;
+}
+
unsigned long shmem_allowable_huge_orders(struct inode *inode,
struct vm_area_struct *vma, pgoff_t index,
loff_t write_end, bool shmem_huge_force)
@@ -1673,7 +1693,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode,
return 0;
global_huge = shmem_huge_global_enabled(inode, index, write_end,
- shmem_huge_force, vma, vm_flags);
+ shmem_huge_force, vm_flags);
if (!vma || !vma_is_anon_shmem(vma)) {
/*
* For tmpfs, we now only support PMD sized THP if huge page
@@ -2879,7 +2899,10 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap,
cache_no_acl(inode);
if (sbinfo->noswap)
mapping_set_unevictable(inode->i_mapping);
- mapping_set_large_folios(inode->i_mapping);
+
+ /* Don't consider 'deny' for emergencies and 'force' for testing */
+ if (sbinfo->huge)
+ mapping_set_large_folios(inode->i_mapping);
switch (mode & S_IFMT) {
default:
@@ -3140,27 +3163,19 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
unsigned long offset;
int error = 0;
ssize_t retval = 0;
- loff_t *ppos = &iocb->ki_pos;
-
- index = *ppos >> PAGE_SHIFT;
- offset = *ppos & ~PAGE_MASK;
for (;;) {
struct folio *folio = NULL;
struct page *page = NULL;
- pgoff_t end_index;
unsigned long nr, ret;
- loff_t i_size = i_size_read(inode);
+ loff_t end_offset, i_size = i_size_read(inode);
+ bool fallback_page_copy = false;
+ size_t fsize;
- end_index = i_size >> PAGE_SHIFT;
- if (index > end_index)
+ if (unlikely(iocb->ki_pos >= i_size))
break;
- if (index == end_index) {
- nr = i_size & ~PAGE_MASK;
- if (nr <= offset)
- break;
- }
+ index = iocb->ki_pos >> PAGE_SHIFT;
error = shmem_get_folio(inode, index, 0, &folio, SGP_READ);
if (error) {
if (error == -EINVAL)
@@ -3176,24 +3191,29 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
error = -EIO;
break;
}
+
+ if (folio_test_large(folio) &&
+ folio_test_has_hwpoisoned(folio))
+ fallback_page_copy = true;
}
/*
* We must evaluate after, since reads (unlike writes)
* are called without i_rwsem protection against truncate
*/
- nr = PAGE_SIZE;
i_size = i_size_read(inode);
- end_index = i_size >> PAGE_SHIFT;
- if (index == end_index) {
- nr = i_size & ~PAGE_MASK;
- if (nr <= offset) {
- if (folio)
- folio_put(folio);
- break;
- }
+ if (unlikely(iocb->ki_pos >= i_size)) {
+ if (folio)
+ folio_put(folio);
+ break;
}
- nr -= offset;
+ end_offset = min_t(loff_t, i_size, iocb->ki_pos + to->count);
+ if (folio && likely(!fallback_page_copy))
+ fsize = folio_size(folio);
+ else
+ fsize = PAGE_SIZE;
+ offset = iocb->ki_pos & (fsize - 1);
+ nr = min_t(loff_t, end_offset - iocb->ki_pos, fsize - offset);
if (folio) {
/*
@@ -3201,10 +3221,15 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
* virtual addresses, take care about potential aliasing
* before reading the page on the kernel side.
*/
- if (mapping_writably_mapped(mapping))
- flush_dcache_page(page);
+ if (mapping_writably_mapped(mapping)) {
+ if (likely(!fallback_page_copy))
+ flush_dcache_folio(folio);
+ else
+ flush_dcache_page(page);
+ }
+
/*
- * Mark the page accessed if we read the beginning.
+ * Mark the folio accessed if we read the beginning.
*/
if (!offset)
folio_mark_accessed(folio);
@@ -3212,9 +3237,11 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
* Ok, we have the page, and it's up-to-date, so
* now we can copy it to user space...
*/
- ret = copy_page_to_iter(page, offset, nr, to);
+ if (likely(!fallback_page_copy))
+ ret = copy_folio_to_iter(folio, offset, nr, to);
+ else
+ ret = copy_page_to_iter(page, offset, nr, to);
folio_put(folio);
-
} else if (user_backed_iter(to)) {
/*
* Copy to user tends to be so well optimized, but
@@ -3232,9 +3259,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
}
retval += ret;
- offset += ret;
- index += offset >> PAGE_SHIFT;
- offset &= ~PAGE_MASK;
+ iocb->ki_pos += ret;
if (!iov_iter_count(to))
break;
@@ -3245,7 +3270,6 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
cond_resched();
}
- *ppos = ((loff_t) index << PAGE_SHIFT) + offset;
file_accessed(file);
return retval ? retval : error;
}
@@ -3334,11 +3358,16 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
len = min_t(size_t, len, npages * PAGE_SIZE);
do {
+ bool fallback_page_splice = false;
+ struct page *page = NULL;
+ pgoff_t index;
+ size_t size;
+
if (*ppos >= i_size_read(inode))
break;
- error = shmem_get_folio(inode, *ppos / PAGE_SIZE, 0, &folio,
- SGP_READ);
+ index = *ppos >> PAGE_SHIFT;
+ error = shmem_get_folio(inode, index, 0, &folio, SGP_READ);
if (error) {
if (error == -EINVAL)
error = 0;
@@ -3347,12 +3376,15 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
if (folio) {
folio_unlock(folio);
- if (folio_test_hwpoison(folio) ||
- (folio_test_large(folio) &&
- folio_test_has_hwpoisoned(folio))) {
+ page = folio_file_page(folio, index);
+ if (PageHWPoison(page)) {
error = -EIO;
break;
}
+
+ if (folio_test_large(folio) &&
+ folio_test_has_hwpoisoned(folio))
+ fallback_page_splice = true;
}
/*
@@ -3366,7 +3398,17 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
isize = i_size_read(inode);
if (unlikely(*ppos >= isize))
break;
- part = min_t(loff_t, isize - *ppos, len);
+ /*
+ * Fall back to PAGE_SIZE splice if the large folio has hwpoisoned
+ * pages.
+ */
+ size = len;
+ if (unlikely(fallback_page_splice)) {
+ size_t offset = *ppos & ~PAGE_MASK;
+
+ size = umin(size, PAGE_SIZE - offset);
+ }
+ part = min_t(loff_t, isize - *ppos, size);
if (folio) {
/*
@@ -3374,8 +3416,12 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
* virtual addresses, take care about potential aliasing
* before reading the page on the kernel side.
*/
- if (mapping_writably_mapped(mapping))
- flush_dcache_folio(folio);
+ if (mapping_writably_mapped(mapping)) {
+ if (likely(!fallback_page_splice))
+ flush_dcache_folio(folio);
+ else
+ flush_dcache_page(page);
+ }
folio_mark_accessed(folio);
/*
* Ok, we have the page, and it's up-to-date, so we can
@@ -5224,7 +5270,8 @@ void __init shmem_init(void)
* Default to setting PMD-sized THP to inherit the global setting and
* disable all other multi-size THPs.
*/
- huge_shmem_orders_inherit = BIT(HPAGE_PMD_ORDER);
+ if (!shmem_orders_configured)
+ huge_shmem_orders_inherit = BIT(HPAGE_PMD_ORDER);
#endif
return;
@@ -5267,7 +5314,7 @@ static ssize_t shmem_enabled_store(struct kobject *kobj,
struct kobj_attribute *attr, const char *buf, size_t count)
{
char tmp[16];
- int huge;
+ int huge, err;
if (count + 1 > sizeof(tmp))
return -EINVAL;
@@ -5278,20 +5325,14 @@ static ssize_t shmem_enabled_store(struct kobject *kobj,
huge = shmem_parse_huge(tmp);
if (huge == -EINVAL)
- return -EINVAL;
- if (!has_transparent_hugepage() &&
- huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY)
- return -EINVAL;
-
- /* Do not override huge allocation policy with non-PMD sized mTHP */
- if (huge == SHMEM_HUGE_FORCE &&
- huge_shmem_orders_inherit != BIT(HPAGE_PMD_ORDER))
- return -EINVAL;
+ return huge;
shmem_huge = huge;
if (shmem_huge > SHMEM_HUGE_DENY)
SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
- return count;
+
+ err = start_stop_khugepaged();
+ return err ? err : count;
}
struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled);
@@ -5368,6 +5409,12 @@ static ssize_t thpsize_shmem_enabled_store(struct kobject *kobj,
ret = -EINVAL;
}
+ if (ret > 0) {
+ int err = start_stop_khugepaged();
+
+ if (err)
+ ret = err;
+ }
return ret;
}
@@ -5375,6 +5422,126 @@ struct kobj_attribute thpsize_shmem_enabled_attr =
__ATTR(shmem_enabled, 0644, thpsize_shmem_enabled_show, thpsize_shmem_enabled_store);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
+
+static int __init setup_transparent_hugepage_shmem(char *str)
+{
+ int huge;
+
+ huge = shmem_parse_huge(str);
+ if (huge == -EINVAL) {
+ pr_warn("transparent_hugepage_shmem= cannot parse, ignored\n");
+ return huge;
+ }
+
+ shmem_huge = huge;
+ return 1;
+}
+__setup("transparent_hugepage_shmem=", setup_transparent_hugepage_shmem);
+
+static char str_dup[PAGE_SIZE] __initdata;
+static int __init setup_thp_shmem(char *str)
+{
+ char *token, *range, *policy, *subtoken;
+ unsigned long always, inherit, madvise, within_size;
+ char *start_size, *end_size;
+ int start, end, nr;
+ char *p;
+
+ if (!str || strlen(str) + 1 > PAGE_SIZE)
+ goto err;
+ strscpy(str_dup, str);
+
+ always = huge_shmem_orders_always;
+ inherit = huge_shmem_orders_inherit;
+ madvise = huge_shmem_orders_madvise;
+ within_size = huge_shmem_orders_within_size;
+ p = str_dup;
+ while ((token = strsep(&p, ";")) != NULL) {
+ range = strsep(&token, ":");
+ policy = token;
+
+ if (!policy)
+ goto err;
+
+ while ((subtoken = strsep(&range, ",")) != NULL) {
+ if (strchr(subtoken, '-')) {
+ start_size = strsep(&subtoken, "-");
+ end_size = subtoken;
+
+ start = get_order_from_str(start_size,
+ THP_ORDERS_ALL_FILE_DEFAULT);
+ end = get_order_from_str(end_size,
+ THP_ORDERS_ALL_FILE_DEFAULT);
+ } else {
+ start_size = end_size = subtoken;
+ start = end = get_order_from_str(subtoken,
+ THP_ORDERS_ALL_FILE_DEFAULT);
+ }
+
+ if (start == -EINVAL) {
+ pr_err("invalid size %s in thp_shmem boot parameter\n",
+ start_size);
+ goto err;
+ }
+
+ if (end == -EINVAL) {
+ pr_err("invalid size %s in thp_shmem boot parameter\n",
+ end_size);
+ goto err;
+ }
+
+ if (start < 0 || end < 0 || start > end)
+ goto err;
+
+ nr = end - start + 1;
+ if (!strcmp(policy, "always")) {
+ bitmap_set(&always, start, nr);
+ bitmap_clear(&inherit, start, nr);
+ bitmap_clear(&madvise, start, nr);
+ bitmap_clear(&within_size, start, nr);
+ } else if (!strcmp(policy, "advise")) {
+ bitmap_set(&madvise, start, nr);
+ bitmap_clear(&inherit, start, nr);
+ bitmap_clear(&always, start, nr);
+ bitmap_clear(&within_size, start, nr);
+ } else if (!strcmp(policy, "inherit")) {
+ bitmap_set(&inherit, start, nr);
+ bitmap_clear(&madvise, start, nr);
+ bitmap_clear(&always, start, nr);
+ bitmap_clear(&within_size, start, nr);
+ } else if (!strcmp(policy, "within_size")) {
+ bitmap_set(&within_size, start, nr);
+ bitmap_clear(&inherit, start, nr);
+ bitmap_clear(&madvise, start, nr);
+ bitmap_clear(&always, start, nr);
+ } else if (!strcmp(policy, "never")) {
+ bitmap_clear(&inherit, start, nr);
+ bitmap_clear(&madvise, start, nr);
+ bitmap_clear(&always, start, nr);
+ bitmap_clear(&within_size, start, nr);
+ } else {
+ pr_err("invalid policy %s in thp_shmem boot parameter\n", policy);
+ goto err;
+ }
+ }
+ }
+
+ huge_shmem_orders_always = always;
+ huge_shmem_orders_madvise = madvise;
+ huge_shmem_orders_inherit = inherit;
+ huge_shmem_orders_within_size = within_size;
+ shmem_orders_configured = true;
+ return 1;
+
+err:
+ pr_warn("thp_shmem=%s: error parsing string, ignoring setting\n", str);
+ return 0;
+}
+__setup("thp_shmem=", setup_thp_shmem);
+
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
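Based purely on the parsing logic above, the two new boot parameters would be used along these lines (the values are illustrative; get_order_from_str() is not shown in this hunk, so the exact size strings it accepts are an assumption):

    transparent_hugepage_shmem=within_size
    thp_shmem=64K-256K:within_size;2M:advise

That is, thp_shmem= takes semicolon-separated entries, each a comma-separated list of sizes or size ranges followed by a colon and one of always, advise, inherit, within_size or never, while transparent_hugepage_shmem= takes the same policy keywords accepted by shmem_parse_huge() (including deny and force).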
+
#else /* !CONFIG_SHMEM */
/*
diff --git a/mm/show_mem.c b/mm/show_mem.c
index ec885a398fa0..43afb56abbd3 100644
--- a/mm/show_mem.c
+++ b/mm/show_mem.c
@@ -285,8 +285,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
#endif
K(node_page_state(pgdat, NR_PAGETABLE)),
K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
- pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
- "yes" : "no");
+ str_yes_no(pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES));
}
for_each_populated_zone(zone) {
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index c0388b2e959d..cec67c5f37d8 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -184,10 +184,6 @@ static void * __meminit vmemmap_alloc_block_zero(unsigned long size, int node)
return p;
}
-void __weak __meminit kernel_pte_init(void *addr)
-{
-}
-
pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node)
{
pmd_t *pmd = pmd_offset(pud, addr);
@@ -201,10 +197,6 @@ pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node)
return pmd;
}
-void __weak __meminit pmd_init(void *addr)
-{
-}
-
pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node)
{
pud_t *pud = pud_offset(p4d, addr);
@@ -218,10 +210,6 @@ pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node)
return pud;
}
-void __weak __meminit pud_init(void *addr)
-{
-}
-
p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
{
p4d_t *p4d = p4d_offset(pgd, addr);
diff --git a/mm/sparse.c b/mm/sparse.c
index dc38539f8560..13b6624d3562 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -129,7 +129,7 @@ static inline int sparse_early_nid(struct mem_section *section)
static void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
unsigned long *end_pfn)
{
- unsigned long max_sparsemem_pfn = (PHYSMEM_END + 1) >> PAGE_SHIFT;
+ unsigned long max_sparsemem_pfn = (DIRECT_MAP_PHYSMEM_END + 1) >> PAGE_SHIFT;
/*
* Sanity checks - do not allow an architecture to pass
@@ -720,19 +720,19 @@ static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
static void free_map_bootmem(struct page *memmap)
{
unsigned long maps_section_nr, removing_section_nr, i;
- unsigned long magic, nr_pages;
+ unsigned long type, nr_pages;
struct page *page = virt_to_page(memmap);
nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
>> PAGE_SHIFT;
for (i = 0; i < nr_pages; i++, page++) {
- magic = page->index;
+ type = bootmem_type(page);
- BUG_ON(magic == NODE_INFO);
+ BUG_ON(type == NODE_INFO);
maps_section_nr = pfn_to_section_nr(page_to_pfn(page));
- removing_section_nr = page_private(page);
+ removing_section_nr = bootmem_info(page);
/*
* When this function is called, the removing section is
diff --git a/mm/swap.c b/mm/swap.c
index 59f30a981c6f..10decd9dffa1 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -113,37 +113,6 @@ void __folio_put(struct folio *folio)
}
EXPORT_SYMBOL(__folio_put);
-/**
- * put_pages_list() - release a list of pages
- * @pages: list of pages threaded on page->lru
- *
- * Release a list of pages which are strung together on page.lru.
- */
-void put_pages_list(struct list_head *pages)
-{
- struct folio_batch fbatch;
- struct folio *folio, *next;
-
- folio_batch_init(&fbatch);
- list_for_each_entry_safe(folio, next, pages, lru) {
- if (!folio_put_testzero(folio))
- continue;
- if (folio_test_hugetlb(folio)) {
- free_huge_folio(folio);
- continue;
- }
- /* LRU flag must be clear because it's passed using the lru */
- if (folio_batch_add(&fbatch, folio) > 0)
- continue;
- free_unref_folios(&fbatch);
- }
-
- if (fbatch.nr)
- free_unref_folios(&fbatch);
- INIT_LIST_HEAD(pages);
-}
-EXPORT_SYMBOL(put_pages_list);
-
typedef void (*move_fn_t)(struct lruvec *lruvec, struct folio *folio);
static void lru_add(struct lruvec *lruvec, struct folio *folio)
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 4669f29cf555..e0c0321b8ff7 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -889,8 +889,7 @@ struct folio *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
static ssize_t vma_ra_enabled_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sysfs_emit(buf, "%s\n",
- enable_vma_readahead ? "true" : "false");
+ return sysfs_emit(buf, "%s\n", str_true_false(enable_vma_readahead));
}
static ssize_t vma_ra_enabled_store(struct kobject *kobj,
struct kobj_attribute *attr,
diff --git a/mm/truncate.c b/mm/truncate.c
index 09fa809f921d..7c304d2f0052 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -23,42 +23,28 @@
#include <linux/rmap.h>
#include "internal.h"
-/*
- * Regular page slots are stabilized by the page lock even without the tree
- * itself locked. These unlocked entries need verification under the tree
- * lock.
- */
-static inline void __clear_shadow_entry(struct address_space *mapping,
- pgoff_t index, void *entry)
-{
- XA_STATE(xas, &mapping->i_pages, index);
-
- xas_set_update(&xas, workingset_update_node);
- if (xas_load(&xas) != entry)
- return;
- xas_store(&xas, NULL);
-}
-
static void clear_shadow_entries(struct address_space *mapping,
- struct folio_batch *fbatch, pgoff_t *indices)
+ unsigned long start, unsigned long max)
{
- int i;
+ XA_STATE(xas, &mapping->i_pages, start);
+ struct folio *folio;
/* Handled by shmem itself, or for DAX we do nothing. */
if (shmem_mapping(mapping) || dax_mapping(mapping))
return;
- spin_lock(&mapping->host->i_lock);
- xa_lock_irq(&mapping->i_pages);
+ xas_set_update(&xas, workingset_update_node);
- for (i = 0; i < folio_batch_count(fbatch); i++) {
- struct folio *folio = fbatch->folios[i];
+ spin_lock(&mapping->host->i_lock);
+ xas_lock_irq(&xas);
+ /* Clear all shadow entries from start to max */
+ xas_for_each(&xas, folio, max) {
if (xa_is_value(folio))
- __clear_shadow_entry(mapping, indices[i], folio);
+ xas_store(&xas, NULL);
}
- xa_unlock_irq(&mapping->i_pages);
+ xas_unlock_irq(&xas);
if (mapping_shrinkable(mapping))
inode_add_lru(mapping->host);
spin_unlock(&mapping->host->i_lock);
@@ -68,54 +54,53 @@ static void clear_shadow_entries(struct address_space *mapping,
* Unconditionally remove exceptional entries. Usually called from truncate
* path. Note that the folio_batch may be altered by this function by removing
* exceptional entries similar to what folio_batch_remove_exceptionals() does.
+ * Please note that indices[] has entries in ascending order as guaranteed by
+ * either find_get_entries() or find_lock_entries().
*/
static void truncate_folio_batch_exceptionals(struct address_space *mapping,
struct folio_batch *fbatch, pgoff_t *indices)
{
+ XA_STATE(xas, &mapping->i_pages, indices[0]);
+ int nr = folio_batch_count(fbatch);
+ struct folio *folio;
int i, j;
- bool dax;
/* Handled by shmem itself */
if (shmem_mapping(mapping))
return;
- for (j = 0; j < folio_batch_count(fbatch); j++)
+ for (j = 0; j < nr; j++)
if (xa_is_value(fbatch->folios[j]))
break;
- if (j == folio_batch_count(fbatch))
+ if (j == nr)
return;
- dax = dax_mapping(mapping);
- if (!dax) {
- spin_lock(&mapping->host->i_lock);
- xa_lock_irq(&mapping->i_pages);
+ if (dax_mapping(mapping)) {
+ for (i = j; i < nr; i++) {
+ if (xa_is_value(fbatch->folios[i]))
+ dax_delete_mapping_entry(mapping, indices[i]);
+ }
+ goto out;
}
- for (i = j; i < folio_batch_count(fbatch); i++) {
- struct folio *folio = fbatch->folios[i];
- pgoff_t index = indices[i];
-
- if (!xa_is_value(folio)) {
- fbatch->folios[j++] = folio;
- continue;
- }
+ xas_set(&xas, indices[j]);
+ xas_set_update(&xas, workingset_update_node);
- if (unlikely(dax)) {
- dax_delete_mapping_entry(mapping, index);
- continue;
- }
+ spin_lock(&mapping->host->i_lock);
+ xas_lock_irq(&xas);
- __clear_shadow_entry(mapping, index, folio);
+ xas_for_each(&xas, folio, indices[nr-1]) {
+ if (xa_is_value(folio))
+ xas_store(&xas, NULL);
}
- if (!dax) {
- xa_unlock_irq(&mapping->i_pages);
- if (mapping_shrinkable(mapping))
- inode_add_lru(mapping->host);
- spin_unlock(&mapping->host->i_lock);
- }
- fbatch->nr = j;
+ xas_unlock_irq(&xas);
+ if (mapping_shrinkable(mapping))
+ inode_add_lru(mapping->host);
+ spin_unlock(&mapping->host->i_lock);
+out:
+ folio_batch_remove_exceptionals(fbatch);
}
/**
@@ -477,11 +462,13 @@ unsigned long mapping_try_invalidate(struct address_space *mapping,
unsigned long ret;
unsigned long count = 0;
int i;
- bool xa_has_values = false;
folio_batch_init(&fbatch);
while (find_lock_entries(mapping, &index, end, &fbatch, indices)) {
- for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ bool xa_has_values = false;
+ int nr = folio_batch_count(&fbatch);
+
+ for (i = 0; i < nr; i++) {
struct folio *folio = fbatch.folios[i];
/* We rely upon deletion not changing folio->index */
@@ -508,7 +495,7 @@ unsigned long mapping_try_invalidate(struct address_space *mapping,
}
if (xa_has_values)
- clear_shadow_entries(mapping, &fbatch, indices);
+ clear_shadow_entries(mapping, indices[0], indices[nr-1]);
folio_batch_remove_exceptionals(&fbatch);
folio_batch_release(&fbatch);
@@ -604,7 +591,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
int ret = 0;
int ret2 = 0;
int did_range_unmap = 0;
- bool xa_has_values = false;
if (mapping_empty(mapping))
return 0;
@@ -612,7 +598,10 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
folio_batch_init(&fbatch);
index = start;
while (find_get_entries(mapping, &index, end, &fbatch, indices)) {
- for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ bool xa_has_values = false;
+ int nr = folio_batch_count(&fbatch);
+
+ for (i = 0; i < nr; i++) {
struct folio *folio = fbatch.folios[i];
/* We rely upon deletion not changing folio->index */
@@ -658,7 +647,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
}
if (xa_has_values)
- clear_shadow_entries(mapping, &fbatch, indices);
+ clear_shadow_entries(mapping, indices[0], indices[nr-1]);
folio_batch_remove_exceptionals(&fbatch);
folio_batch_release(&fbatch);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index ce13c4062647..60a0be33766f 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -251,7 +251,7 @@ static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
if (!*foliop) {
ret = -ENOMEM;
folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, dst_vma,
- dst_addr, false);
+ dst_addr);
if (!folio)
goto out;
@@ -1135,7 +1135,7 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
spinlock_t *src_ptl, *dst_ptl;
pte_t *src_pte = NULL;
pte_t *dst_pte = NULL;
-
+ pmd_t dummy_pmdval;
struct folio *src_folio = NULL;
struct anon_vma *src_anon_vma = NULL;
struct mmu_notifier_range range;
@@ -1146,7 +1146,14 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
src_addr, src_addr + PAGE_SIZE);
mmu_notifier_invalidate_range_start(&range);
retry:
- dst_pte = pte_offset_map_nolock(mm, dst_pmd, dst_addr, &dst_ptl);
+ /*
+ * Use the maywrite version to indicate that dst_pte will be modified,
+ * but since we will use pte_same() to detect the change of the pte
+ * entry, there is no need to get pmdval, so just pass a dummy variable
+ * to it.
+ */
+ dst_pte = pte_offset_map_rw_nolock(mm, dst_pmd, dst_addr, &dummy_pmdval,
+ &dst_ptl);
/* Retry if a huge pmd materialized from under us */
if (unlikely(!dst_pte)) {
@@ -1154,7 +1161,9 @@ retry:
goto out;
}
- src_pte = pte_offset_map_nolock(mm, src_pmd, src_addr, &src_ptl);
+ /* same as dst_pte */
+ src_pte = pte_offset_map_rw_nolock(mm, src_pmd, src_addr, &dummy_pmdval,
+ &src_ptl);
/*
* We held the mmap_lock for reading so MADV_DONTNEED
diff --git a/mm/util.c b/mm/util.c
index 4f1275023eb7..60017d2a9e48 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -820,7 +820,7 @@ void *vcalloc_noprof(size_t n, size_t size)
}
EXPORT_SYMBOL(vcalloc_noprof);
-struct anon_vma *folio_anon_vma(struct folio *folio)
+struct anon_vma *folio_anon_vma(const struct folio *folio)
{
unsigned long mapping = (unsigned long)folio->mapping;
diff --git a/mm/vma.c b/mm/vma.c
index 7621384d64cf..8a454a7bbc80 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -7,6 +7,57 @@
#include "vma_internal.h"
#include "vma.h"
+struct mmap_state {
+ struct mm_struct *mm;
+ struct vma_iterator *vmi;
+
+ unsigned long addr;
+ unsigned long end;
+ pgoff_t pgoff;
+ unsigned long pglen;
+ unsigned long flags;
+ struct file *file;
+
+ unsigned long charged;
+ bool retry_merge;
+
+ struct vm_area_struct *prev;
+ struct vm_area_struct *next;
+
+ /* Unmapping state. */
+ struct vma_munmap_struct vms;
+ struct ma_state mas_detach;
+ struct maple_tree mt_detach;
+};
+
+#define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, flags_, file_) \
+ struct mmap_state name = { \
+ .mm = mm_, \
+ .vmi = vmi_, \
+ .addr = addr_, \
+ .end = (addr_) + len_, \
+ .pgoff = pgoff_, \
+ .pglen = PHYS_PFN(len_), \
+ .flags = flags_, \
+ .file = file_, \
+ }
+
+#define VMG_MMAP_STATE(name, map_, vma_) \
+ struct vma_merge_struct name = { \
+ .mm = (map_)->mm, \
+ .vmi = (map_)->vmi, \
+ .start = (map_)->addr, \
+ .end = (map_)->end, \
+ .flags = (map_)->flags, \
+ .pgoff = (map_)->pgoff, \
+ .file = (map_)->file, \
+ .prev = (map_)->prev, \
+ .vma = vma_, \
+ .next = (vma_) ? NULL : (map_)->next, \
+ .state = VMA_MERGE_START, \
+ .merge_flags = VMG_FLAG_DEFAULT, \
+ }
+
static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next)
{
struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev;
@@ -564,7 +615,11 @@ void validate_mm(struct mm_struct *mm)
anon_vma_unlock_read(anon_vma);
}
#endif
- i++;
+ /* Check for an infinite loop */
+ if (++i > mm->map_count + 10) {
+ i = -1;
+ break;
+ }
}
if (i != mm->map_count) {
pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i);
@@ -911,10 +966,7 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
{
struct vm_area_struct *prev = vmg->prev;
struct vm_area_struct *next = vmg->next;
- unsigned long start = vmg->start;
unsigned long end = vmg->end;
- pgoff_t pgoff = vmg->pgoff;
- pgoff_t pglen = PHYS_PFN(end - start);
bool can_merge_left, can_merge_right;
bool just_expand = vmg->merge_flags & VMG_FLAG_JUST_EXPAND;
@@ -936,7 +988,6 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
if (can_merge_right) {
vmg->end = next->vm_end;
vmg->vma = next;
- vmg->pgoff = next->vm_pgoff - pglen;
}
/* If we can merge with the previous VMA, adjust vmg accordingly. */
@@ -970,16 +1021,6 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
return vmg->vma;
}
- /* If expansion failed, reset state. Allows us to retry merge later. */
- if (!just_expand) {
- vmg->vma = NULL;
- vmg->start = start;
- vmg->end = end;
- vmg->pgoff = pgoff;
- if (vmg->vma == prev)
- vma_iter_set(vmg->vmi, start);
- }
-
return NULL;
}
@@ -1103,7 +1144,7 @@ static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
vms->clear_ptes = false;
}
-void vms_clean_up_area(struct vma_munmap_struct *vms,
+static void vms_clean_up_area(struct vma_munmap_struct *vms,
struct ma_state *mas_detach)
{
struct vm_area_struct *vma;
@@ -1126,7 +1167,7 @@ void vms_clean_up_area(struct vma_munmap_struct *vms,
* used for the munmap() and may downgrade the lock - if requested. Everything
* needed to be done once the vma maple tree is updated.
*/
-void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
+static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
struct ma_state *mas_detach)
{
struct vm_area_struct *vma;
@@ -1168,6 +1209,23 @@ void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
}
/*
+ * reattach_vmas() - Undo any munmap work and free resources
+ * @mas_detach: The maple state with the detached maple tree
+ *
+ * Reattach any detached vmas and free up the maple tree used to track the vmas.
+ */
+static void reattach_vmas(struct ma_state *mas_detach)
+{
+ struct vm_area_struct *vma;
+
+ mas_set(mas_detach, 0);
+ mas_for_each(mas_detach, vma, ULONG_MAX)
+ vma_mark_detached(vma, false);
+
+ __mt_destroy(mas_detach->tree);
+}
+
+/*
* vms_gather_munmap_vmas() - Put all VMAs within a range into a maple tree
* for removal at a later date. Handles splitting first and last if necessary
* and marking the vmas as isolated.
@@ -1177,7 +1235,7 @@ void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
*
* Return: 0 on success, error otherwise
*/
-int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
+static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
struct ma_state *mas_detach)
{
struct vm_area_struct *next = NULL;
@@ -1254,7 +1312,7 @@ int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
else if (is_data_mapping(next->vm_flags))
vms->data_vm += nrpages;
- if (unlikely(vms->uf)) {
+ if (vms->uf) {
/*
* If userfaultfd_unmap_prep returns an error the vmas
* will remain split, but userland will get a
@@ -1316,6 +1374,39 @@ map_count_exceeded:
}
/*
+ * init_vma_munmap() - Initializer wrapper for vma_munmap_struct
+ * @vms: The vma munmap struct
+ * @vmi: The vma iterator
+ * @vma: The first vm_area_struct to munmap
+ * @start: The aligned start address to munmap
+ * @end: The aligned end address to munmap
+ * @uf: The userfaultfd list_head
+ * @unlock: Unlock after the operation. Only unlocked on success
+ */
+static void init_vma_munmap(struct vma_munmap_struct *vms,
+ struct vma_iterator *vmi, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, struct list_head *uf,
+ bool unlock)
+{
+ vms->vmi = vmi;
+ vms->vma = vma;
+ if (vma) {
+ vms->start = start;
+ vms->end = end;
+ } else {
+ vms->start = vms->end = 0;
+ }
+ vms->unlock = unlock;
+ vms->uf = uf;
+ vms->vma_count = 0;
+ vms->nr_pages = vms->locked_vm = vms->nr_accounted = 0;
+ vms->exec_vm = vms->stack_vm = vms->data_vm = 0;
+ vms->unmap_start = FIRST_USER_ADDRESS;
+ vms->unmap_end = USER_PGTABLES_CEILING;
+ vms->clear_ptes = false;
+}
+
+/*
* do_vmi_align_munmap() - munmap the aligned region from @start to @end.
* @vmi: The vma iterator
* @vma: The starting vm_area_struct
@@ -2069,3 +2160,321 @@ void mm_drop_all_locks(struct mm_struct *mm)
mutex_unlock(&mm_all_locks_mutex);
}
+
+/*
+ * We account for memory if it's a private writeable mapping,
+ * not hugepages and VM_NORESERVE wasn't set.
+ */
+static bool accountable_mapping(struct file *file, vm_flags_t vm_flags)
+{
+ /*
+ * hugetlb has its own accounting separate from the core VM
+ * VM_HUGETLB may not be set yet so we cannot check for that flag.
+ */
+ if (file && is_file_hugepages(file))
+ return false;
+
+ return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
+}
+
+/*
+ * vms_abort_munmap_vmas() - Undo as much as possible from an aborted munmap()
+ * operation.
+ * @vms: The vma unmap structure
+ * @mas_detach: The maple state with the detached maple tree
+ *
+ * Reattach any detached vmas, free up the maple tree used to track the vmas.
+ * If that's not possible because the ptes are cleared (and vm_ops->closed() may
+ * have been called), then a NULL is written over the vmas and the vmas are
+ * removed (munmap() completed).
+ */
+static void vms_abort_munmap_vmas(struct vma_munmap_struct *vms,
+ struct ma_state *mas_detach)
+{
+ struct ma_state *mas = &vms->vmi->mas;
+
+ if (!vms->nr_pages)
+ return;
+
+ if (vms->clear_ptes)
+ return reattach_vmas(mas_detach);
+
+ /*
+ * Aborting cannot just call the vm_ops open() because they are often
+ * not symmetrical and state data has been lost. Resort to the old
+ * failure method of leaving a gap where the MAP_FIXED mapping failed.
+ */
+ mas_set_range(mas, vms->start, vms->end - 1);
+ mas_store_gfp(mas, NULL, GFP_KERNEL|__GFP_NOFAIL);
+ /* Clean up the insertion of the unfortunate gap */
+ vms_complete_munmap_vmas(vms, mas_detach);
+}
+
+/*
+ * __mmap_prepare() - Prepare to gather any overlapping VMAs that need to be
+ * unmapped once the map operation is completed, check limits, account mapping
+ * and clean up any pre-existing VMAs.
+ *
+ * @map: Mapping state.
+ * @uf: Userfaultfd context list.
+ *
+ * Returns: 0 on success, error code otherwise.
+ */
+static int __mmap_prepare(struct mmap_state *map, struct list_head *uf)
+{
+ int error;
+ struct vma_iterator *vmi = map->vmi;
+ struct vma_munmap_struct *vms = &map->vms;
+
+ /* Find the first overlapping VMA and initialise unmap state. */
+ vms->vma = vma_find(vmi, map->end);
+ init_vma_munmap(vms, vmi, vms->vma, map->addr, map->end, uf,
+ /* unlock = */ false);
+
+ /* OK, we have overlapping VMAs - prepare to unmap them. */
+ if (vms->vma) {
+ mt_init_flags(&map->mt_detach,
+ vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
+ mt_on_stack(map->mt_detach);
+ mas_init(&map->mas_detach, &map->mt_detach, /* addr = */ 0);
+ /* Prepare to unmap any existing mapping in the area */
+ error = vms_gather_munmap_vmas(vms, &map->mas_detach);
+ if (error) {
+ /* On error VMAs will already have been reattached. */
+ vms->nr_pages = 0;
+ return error;
+ }
+
+ map->next = vms->next;
+ map->prev = vms->prev;
+ } else {
+ map->next = vma_iter_next_rewind(vmi, &map->prev);
+ }
+
+ /* Check against address space limit. */
+ if (!may_expand_vm(map->mm, map->flags, map->pglen - vms->nr_pages))
+ return -ENOMEM;
+
+ /* Private writable mapping: check memory availability. */
+ if (accountable_mapping(map->file, map->flags)) {
+ map->charged = map->pglen;
+ map->charged -= vms->nr_accounted;
+ if (map->charged) {
+ error = security_vm_enough_memory_mm(map->mm, map->charged);
+ if (error)
+ return error;
+ }
+
+ vms->nr_accounted = 0;
+ map->flags |= VM_ACCOUNT;
+ }
+
+ /*
+ * Clear PTEs while the vma is still in the tree so that rmap
+ * cannot race with the freeing later in the truncate scenario.
+ * This is also needed for mmap_file(), which is why vm_ops
+ * close function is called.
+ */
+ vms_clean_up_area(vms, &map->mas_detach);
+
+ return 0;
+}
+
+
+static int __mmap_new_file_vma(struct mmap_state *map,
+ struct vm_area_struct *vma)
+{
+ struct vma_iterator *vmi = map->vmi;
+ int error;
+
+ vma->vm_file = get_file(map->file);
+ error = mmap_file(vma->vm_file, vma);
+ if (error) {
+ fput(vma->vm_file);
+ vma->vm_file = NULL;
+
+ vma_iter_set(vmi, vma->vm_end);
+ /* Undo any partial mapping done by a device driver. */
+ unmap_region(&vmi->mas, vma, map->prev, map->next);
+
+ return error;
+ }
+
+ /* Drivers cannot alter the address of the VMA. */
+ WARN_ON_ONCE(map->addr != vma->vm_start);
+ /*
+ * Drivers should not permit writability when previously it was
+ * disallowed.
+ */
+ VM_WARN_ON_ONCE(map->flags != vma->vm_flags &&
+ !(map->flags & VM_MAYWRITE) &&
+ (vma->vm_flags & VM_MAYWRITE));
+
+ /* If the flags change (and are mergeable), let's retry later. */
+ map->retry_merge = vma->vm_flags != map->flags && !(vma->vm_flags & VM_SPECIAL);
+ map->flags = vma->vm_flags;
+
+ return 0;
+}
+
+/*
+ * __mmap_new_vma() - Allocate a new VMA for the region, as merging was not
+ * possible.
+ *
+ * @map: Mapping state.
+ * @vmap: Output pointer for the new VMA.
+ *
+ * Returns: Zero on success, or an error.
+ */
+static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
+{
+ struct vma_iterator *vmi = map->vmi;
+ int error = 0;
+ struct vm_area_struct *vma;
+
+ /*
+ * Determine the object being mapped and call the appropriate
+ * specific mapper. The address has already been validated, but not
+ * unmapped; the old maps, however, have already been removed from the list.
+ */
+ vma = vm_area_alloc(map->mm);
+ if (!vma)
+ return -ENOMEM;
+
+ vma_iter_config(vmi, map->addr, map->end);
+ vma_set_range(vma, map->addr, map->end, map->pgoff);
+ vm_flags_init(vma, map->flags);
+ vma->vm_page_prot = vm_get_page_prot(map->flags);
+
+ if (vma_iter_prealloc(vmi, vma)) {
+ error = -ENOMEM;
+ goto free_vma;
+ }
+
+ if (map->file)
+ error = __mmap_new_file_vma(map, vma);
+ else if (map->flags & VM_SHARED)
+ error = shmem_zero_setup(vma);
+ else
+ vma_set_anonymous(vma);
+
+ if (error)
+ goto free_iter_vma;
+
+#ifdef CONFIG_SPARC64
+ /* TODO: Fix SPARC ADI! */
+ WARN_ON_ONCE(!arch_validate_flags(map->flags));
+#endif
+
+ /* Lock the VMA since it is modified after insertion into VMA tree */
+ vma_start_write(vma);
+ vma_iter_store(vmi, vma);
+ map->mm->map_count++;
+ vma_link_file(vma);
+
+ /*
+ * vma_merge_new_range() calls khugepaged_enter_vma() too, the below
+ * call covers the non-merge case.
+ */
+ khugepaged_enter_vma(vma, map->flags);
+ ksm_add_vma(vma);
+ *vmap = vma;
+ return 0;
+
+free_iter_vma:
+ vma_iter_free(vmi);
+free_vma:
+ vm_area_free(vma);
+ return error;
+}
+
+/*
+ * __mmap_complete() - Unmap any VMAs we overlap, account memory mapping
+ * statistics, handle locking and finalise the VMA.
+ *
+ * @map: Mapping state.
+ * @vma: Merged or newly allocated VMA for the mmap()'d region.
+ */
+static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma)
+{
+ struct mm_struct *mm = map->mm;
+ unsigned long vm_flags = vma->vm_flags;
+
+ perf_event_mmap(vma);
+
+ /* Unmap any existing mapping in the area. */
+ vms_complete_munmap_vmas(&map->vms, &map->mas_detach);
+
+ vm_stat_account(mm, vma->vm_flags, map->pglen);
+ if (vm_flags & VM_LOCKED) {
+ if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
+ is_vm_hugetlb_page(vma) ||
+ vma == get_gate_vma(mm))
+ vm_flags_clear(vma, VM_LOCKED_MASK);
+ else
+ mm->locked_vm += map->pglen;
+ }
+
+ if (vma->vm_file)
+ uprobe_mmap(vma);
+
+ /*
+ * New (or expanded) vma always get soft dirty status.
+ * Otherwise user-space soft-dirty page tracker won't
+ * be able to distinguish situation when vma area unmapped,
+ * then new mapped in-place (which must be aimed as
+ * a completely new data area).
+ */
+ vm_flags_set(vma, VM_SOFTDIRTY);
+
+ vma_set_page_prot(vma);
+}
+
+unsigned long __mmap_region(struct file *file, unsigned long addr,
+ unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
+ struct list_head *uf)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma = NULL;
+ int error;
+ VMA_ITERATOR(vmi, mm, addr);
+ MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file);
+
+ error = __mmap_prepare(&map, uf);
+ if (error)
+ goto abort_munmap;
+
+ /* Attempt to merge with adjacent VMAs... */
+ if (map.prev || map.next) {
+ VMG_MMAP_STATE(vmg, &map, /* vma = */ NULL);
+
+ vma = vma_merge_new_range(&vmg);
+ }
+
+ /* ...but if we can't, allocate a new VMA. */
+ if (!vma) {
+ error = __mmap_new_vma(&map, &vma);
+ if (error)
+ goto unacct_error;
+ }
+
+ /* If flags changed, we might be able to merge, so try again. */
+ if (map.retry_merge) {
+ VMG_MMAP_STATE(vmg, &map, vma);
+
+ vma_iter_config(map.vmi, map.addr, map.end);
+ vma_merge_existing_range(&vmg);
+ }
+
+ __mmap_complete(&map, vma);
+
+ return addr;
+
+ /* Accounting was done by __mmap_prepare(). */
+unacct_error:
+ if (map.charged)
+ vm_unacct_memory(map.charged);
+abort_munmap:
+ vms_abort_munmap_vmas(&map.vms, &map.mas_detach);
+ return error;
+}
diff --git a/mm/vma.h b/mm/vma.h
index d58068c0ff2e..388d34748674 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -165,99 +165,6 @@ static inline int vma_iter_store_gfp(struct vma_iterator *vmi,
return 0;
}
-#ifdef CONFIG_MMU
-/*
- * init_vma_munmap() - Initializer wrapper for vma_munmap_struct
- * @vms: The vma munmap struct
- * @vmi: The vma iterator
- * @vma: The first vm_area_struct to munmap
- * @start: The aligned start address to munmap
- * @end: The aligned end address to munmap
- * @uf: The userfaultfd list_head
- * @unlock: Unlock after the operation. Only unlocked on success
- */
-static inline void init_vma_munmap(struct vma_munmap_struct *vms,
- struct vma_iterator *vmi, struct vm_area_struct *vma,
- unsigned long start, unsigned long end, struct list_head *uf,
- bool unlock)
-{
- vms->vmi = vmi;
- vms->vma = vma;
- if (vma) {
- vms->start = start;
- vms->end = end;
- } else {
- vms->start = vms->end = 0;
- }
- vms->unlock = unlock;
- vms->uf = uf;
- vms->vma_count = 0;
- vms->nr_pages = vms->locked_vm = vms->nr_accounted = 0;
- vms->exec_vm = vms->stack_vm = vms->data_vm = 0;
- vms->unmap_start = FIRST_USER_ADDRESS;
- vms->unmap_end = USER_PGTABLES_CEILING;
- vms->clear_ptes = false;
-}
-#endif
-
-int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
- struct ma_state *mas_detach);
-
-void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
- struct ma_state *mas_detach);
-
-void vms_clean_up_area(struct vma_munmap_struct *vms,
- struct ma_state *mas_detach);
-
-/*
- * reattach_vmas() - Undo any munmap work and free resources
- * @mas_detach: The maple state with the detached maple tree
- *
- * Reattach any detached vmas and free up the maple tree used to track the vmas.
- */
-static inline void reattach_vmas(struct ma_state *mas_detach)
-{
- struct vm_area_struct *vma;
-
- mas_set(mas_detach, 0);
- mas_for_each(mas_detach, vma, ULONG_MAX)
- vma_mark_detached(vma, false);
-
- __mt_destroy(mas_detach->tree);
-}
-
-/*
- * vms_abort_munmap_vmas() - Undo as much as possible from an aborted munmap()
- * operation.
- * @vms: The vma unmap structure
- * @mas_detach: The maple state with the detached maple tree
- *
- * Reattach any detached vmas, free up the maple tree used to track the vmas.
- * If that's not possible because the ptes are cleared (and vm_ops->closed() may
- * have been called), then a NULL is written over the vmas and the vmas are
- * removed (munmap() completed).
- */
-static inline void vms_abort_munmap_vmas(struct vma_munmap_struct *vms,
- struct ma_state *mas_detach)
-{
- struct ma_state *mas = &vms->vmi->mas;
- if (!vms->nr_pages)
- return;
-
- if (vms->clear_ptes)
- return reattach_vmas(mas_detach);
-
- /*
- * Aborting cannot just call the vm_ops open() because they are often
- * not symmetrical and state data has been lost. Resort to the old
- * failure method of leaving a gap where the MAP_FIXED mapping failed.
- */
- mas_set_range(mas, vms->start, vms->end - 1);
- mas_store_gfp(mas, NULL, GFP_KERNEL|__GFP_NOFAIL);
- /* Clean up the insertion of the unfortunate gap */
- vms_complete_munmap_vmas(vms, mas_detach);
-}
-
int
do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
struct mm_struct *mm, unsigned long start,
@@ -336,6 +243,10 @@ bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
int mm_take_all_locks(struct mm_struct *mm);
void mm_drop_all_locks(struct mm_struct *mm);
+unsigned long __mmap_region(struct file *file, unsigned long addr,
+ unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
+ struct list_head *uf);
+
static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma)
{
/*
diff --git a/mm/vma_internal.h b/mm/vma_internal.h
index b930ab12a587..fc5f172a36bd 100644
--- a/mm/vma_internal.h
+++ b/mm/vma_internal.h
@@ -17,8 +17,10 @@
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/huge_mm.h>
+#include <linux/hugetlb.h>
#include <linux/hugetlb_inline.h>
#include <linux/kernel.h>
+#include <linux/ksm.h>
#include <linux/khugepaged.h>
#include <linux/list.h>
#include <linux/maple_tree.h>
@@ -32,11 +34,14 @@
#include <linux/mmu_context.h>
#include <linux/mutex.h>
#include <linux/pagemap.h>
+#include <linux/perf_event.h>
#include <linux/pfn.h>
#include <linux/rcupdate.h>
#include <linux/rmap.h>
#include <linux/rwsem.h>
#include <linux/sched/signal.h>
+#include <linux/security.h>
+#include <linux/shmem_fs.h>
#include <linux/swap.h>
#include <linux/uprobes.h>
#include <linux/userfaultfd_k.h>
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 634162271c00..7ed39d104201 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -653,7 +653,7 @@ int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
* RETURNS:
* 0 on success, -errno on failure.
*/
-static int vmap_pages_range(unsigned long addr, unsigned long end,
+int vmap_pages_range(unsigned long addr, unsigned long end,
pgprot_t prot, struct page **pages, unsigned int page_shift)
{
int err;
@@ -2182,6 +2182,25 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay)
reclaim_list_global(&decay_list);
}
+static void
+kasan_release_vmalloc_node(struct vmap_node *vn)
+{
+ struct vmap_area *va;
+ unsigned long start, end;
+
+ start = list_first_entry(&vn->purge_list, struct vmap_area, list)->va_start;
+ end = list_last_entry(&vn->purge_list, struct vmap_area, list)->va_end;
+
+ list_for_each_entry(va, &vn->purge_list, list) {
+ if (is_vmalloc_or_module_addr((void *) va->va_start))
+ kasan_release_vmalloc(va->va_start, va->va_end,
+ va->va_start, va->va_end,
+ KASAN_VMALLOC_PAGE_RANGE);
+ }
+
+ kasan_release_vmalloc(start, end, start, end, KASAN_VMALLOC_TLB_FLUSH);
+}
+
static void purge_vmap_node(struct work_struct *work)
{
struct vmap_node *vn = container_of(work,
@@ -2190,20 +2209,17 @@ static void purge_vmap_node(struct work_struct *work)
struct vmap_area *va, *n_va;
LIST_HEAD(local_list);
+ if (IS_ENABLED(CONFIG_KASAN_VMALLOC))
+ kasan_release_vmalloc_node(vn);
+
vn->nr_purged = 0;
list_for_each_entry_safe(va, n_va, &vn->purge_list, list) {
unsigned long nr = va_size(va) >> PAGE_SHIFT;
- unsigned long orig_start = va->va_start;
- unsigned long orig_end = va->va_end;
unsigned int vn_id = decode_vn_id(va->flags);
list_del_init(&va->list);
- if (is_vmalloc_or_module_addr((void *)orig_start))
- kasan_release_vmalloc(orig_start, orig_end,
- va->va_start, va->va_end);
-
nr_purged_pages += nr;
vn->nr_purged++;
@@ -3007,6 +3023,11 @@ static inline unsigned int vm_area_page_order(struct vm_struct *vm)
#endif
}
+unsigned int get_vm_area_page_order(struct vm_struct *vm)
+{
+ return vm_area_page_order(vm);
+}
+
static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order)
{
#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
@@ -3085,7 +3106,7 @@ static void clear_vm_uninitialized_flag(struct vm_struct *vm)
vm->flags &= ~VM_UNINITIALIZED;
}
-static struct vm_struct *__get_vm_area_node(unsigned long size,
+struct vm_struct *__get_vm_area_node(unsigned long size,
unsigned long align, unsigned long shift, unsigned long flags,
unsigned long start, unsigned long end, int node,
gfp_t gfp_mask, const void *caller)
@@ -3763,8 +3784,6 @@ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
}
if (vmap_allow_huge && (vm_flags & VM_ALLOW_HUGE_VMAP)) {
- unsigned long size_per_node;
-
/*
* Try huge pages. Only try for PAGE_KERNEL allocations,
* others like modules don't yet expect huge pages in
@@ -3772,13 +3791,10 @@ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
* supporting them.
*/
- size_per_node = size;
- if (node == NUMA_NO_NODE)
- size_per_node /= num_online_nodes();
- if (arch_vmap_pmd_supported(prot) && size_per_node >= PMD_SIZE)
+ if (arch_vmap_pmd_supported(prot) && size >= PMD_SIZE)
shift = PMD_SHIFT;
else
- shift = arch_vmap_pte_supported_shift(size_per_node);
+ shift = arch_vmap_pte_supported_shift(size);
align = max(real_align, 1UL << shift);
size = ALIGN(real_size, 1UL << shift);
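A quick before/after example of the sizing change above, assuming PMD_SIZE is 2 MiB (x86-64 with 4 KiB base pages): a 4 MiB VM_ALLOW_HUGE_VMAP allocation with node == NUMA_NO_NODE on a 4-node machine previously computed size_per_node = 1 MiB, which is below PMD_SIZE, so it fell back to arch_vmap_pte_supported_shift(1 MiB); with the per-node division removed, the full 4 MiB is compared against PMD_SIZE and PMD_SHIFT is chosen.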
@@ -4784,7 +4800,8 @@ recovery:
&free_vmap_area_list);
if (va)
kasan_release_vmalloc(orig_start, orig_end,
- va->va_start, va->va_end);
+ va->va_start, va->va_end,
+ KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH);
vas[area] = NULL;
}
@@ -4834,7 +4851,8 @@ err_free_shadow:
&free_vmap_area_list);
if (va)
kasan_release_vmalloc(orig_start, orig_end,
- va->va_start, va->va_end);
+ va->va_start, va->va_end,
+ KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH);
vas[area] = NULL;
kfree(vms[area]);
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 28ba2b06fc7d..76378bc257e3 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1258,8 +1258,8 @@ retry:
THP_SWPOUT_FALLBACK, 1);
count_vm_event(THP_SWPOUT_FALLBACK);
}
- count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK);
#endif
+ count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK);
if (!add_to_swap(folio))
goto activate_locked_split;
}
@@ -2129,7 +2129,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
static unsigned int reclaim_folio_list(struct list_head *folio_list,
struct pglist_data *pgdat)
{
- struct reclaim_stat dummy_stat;
+ struct reclaim_stat stat;
unsigned int nr_reclaimed;
struct folio *folio;
struct scan_control sc = {
@@ -2140,12 +2140,13 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list,
.no_demotion = 1,
};
- nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, true);
+ nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &stat, true);
while (!list_empty(folio_list)) {
folio = lru_to_folio(folio_list);
list_del(&folio->lru);
folio_putback_lru(folio);
}
+ trace_mm_vmscan_reclaim_pages(pgdat->node_id, sc.nr_scanned, nr_reclaimed, &stat);
return nr_reclaimed;
}
@@ -2602,8 +2603,6 @@ static bool should_clear_pmd_young(void)
* shorthand helpers
******************************************************************************/
-#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset))
-
#define DEFINE_MAX_SEQ(lruvec) \
unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq)
@@ -3138,7 +3137,6 @@ static int folio_update_gen(struct folio *folio, int gen)
unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
- VM_WARN_ON_ONCE(!rcu_read_lock_held());
do {
/* lru_gen_del_folio() has isolated this page? */
@@ -3354,7 +3352,7 @@ static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
if (folio_nid(folio) != pgdat->node_id)
return NULL;
- if (folio_memcg_rcu(folio) != memcg)
+ if (folio_memcg(folio) != memcg)
return NULL;
/* file VMAs can contain anon pages from COW */
@@ -3386,8 +3384,10 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
DEFINE_MAX_SEQ(walk->lruvec);
int old_gen, new_gen = lru_gen_from_seq(max_seq);
+ pmd_t pmdval;
- pte = pte_offset_map_nolock(args->mm, pmd, start & PMD_MASK, &ptl);
+ pte = pte_offset_map_rw_nolock(args->mm, pmd, start & PMD_MASK, &pmdval,
+ &ptl);
if (!pte)
return false;
if (!spin_trylock(ptl)) {
@@ -3395,6 +3395,11 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
return false;
}
+ if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
+ pte_unmap_unlock(pte, ptl);
+ return false;
+ }
+
arch_enter_lazy_mmu_mode();
restart:
for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) {
@@ -3643,10 +3648,8 @@ static void walk_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
.p4d_entry = walk_pud_range,
.walk_lock = PGWALK_RDLOCK,
};
-
int err;
struct lruvec *lruvec = walk->lruvec;
- struct mem_cgroup *memcg = lruvec_memcg(lruvec);
walk->next_addr = FIRST_USER_ADDRESS;
@@ -3659,10 +3662,6 @@ static void walk_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
if (walk->seq != max_seq)
break;
- /* folio_update_gen() requires stable folio_memcg() */
- if (!mem_cgroup_trylock_pages(memcg))
- break;
-
/* the caller might be holding the lock for write */
if (mmap_read_trylock(mm)) {
err = walk_page_range(mm, walk->next_addr, ULONG_MAX, &mm_walk_ops, walk);
@@ -3670,8 +3669,6 @@ static void walk_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
mmap_read_unlock(mm);
}
- mem_cgroup_unlock_pages();
-
if (walk->batched) {
spin_lock_irq(&lruvec->lru_lock);
reset_batch_size(walk);
@@ -4093,10 +4090,6 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
}
}
- /* folio_update_gen() requires stable folio_memcg() */
- if (!mem_cgroup_trylock_pages(memcg))
- return true;
-
arch_enter_lazy_mmu_mode();
pte -= (addr - start) / PAGE_SIZE;
@@ -4134,12 +4127,13 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
old_gen = folio_lru_gen(folio);
if (old_gen < 0)
folio_set_referenced(folio);
- else if (old_gen != new_gen)
+ else if (old_gen != new_gen) {
+ folio_clear_lru_refs(folio);
folio_activate(folio);
+ }
}
arch_leave_lazy_mmu_mode();
- mem_cgroup_unlock_pages();
/* feedback from rmap walkers to page table walkers */
if (mm_state && suitable_to_scan(i, young))
@@ -4290,6 +4284,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
int tier_idx)
{
bool success;
+ bool dirty, writeback;
int gen = folio_lru_gen(folio);
int type = folio_is_file_lru(folio);
int zone = folio_zonenum(folio);
@@ -4335,9 +4330,17 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
return true;
}
+ dirty = folio_test_dirty(folio);
+ writeback = folio_test_writeback(folio);
+ if (type == LRU_GEN_FILE && dirty) {
+ sc->nr.file_taken += delta;
+ if (!writeback)
+ sc->nr.unqueued_dirty += delta;
+ }
+
/* waiting for writeback */
- if (folio_test_locked(folio) || folio_test_writeback(folio) ||
- (type == LRU_GEN_FILE && folio_test_dirty(folio))) {
+ if (folio_test_locked(folio) || writeback ||
+ (type == LRU_GEN_FILE && dirty)) {
gen = folio_inc_gen(lruvec, folio, true);
list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
return true;
@@ -4368,7 +4371,7 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca
/* see the comment on MAX_NR_TIERS */
if (!folio_test_referenced(folio))
- set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0);
+ folio_clear_lru_refs(folio);
/* for shrink_folio_list() */
folio_clear_reclaim(folio);
@@ -4453,7 +4456,8 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, MAX_LRU_BATCH,
scanned, skipped, isolated,
type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON);
-
+ if (type == LRU_GEN_FILE)
+ sc->nr.file_taken += isolated;
/*
* There might not be eligible folios due to reclaim_idx. Check the
* remaining to prevent livelock if it's not making progress.
@@ -4587,6 +4591,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
return scanned;
retry:
reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);
+ sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
sc->nr_reclaimed += reclaimed;
trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
scanned, reclaimed, &stat, sc->priority,
@@ -4795,6 +4800,13 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
cond_resched();
}
+ /*
+	 * If too much of the file cache in the coldest generation cannot be
+	 * evicted because it is dirty, wake up the flusher.
+ */
+ if (sc->nr.unqueued_dirty && sc->nr.unqueued_dirty == sc->nr.file_taken)
+ wakeup_flusher_threads(WB_REASON_VMSCAN);
+
/* whether this lruvec should be rotated */
return nr_to_scan < 0;
}
@@ -5940,6 +5952,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
bool reclaimable = false;
if (lru_gen_enabled() && root_reclaim(sc)) {
+ memset(&sc->nr, 0, sizeof(sc->nr));
lru_gen_shrink_node(pgdat, sc);
return;
}
@@ -5990,7 +6003,8 @@ again:
set_bit(PGDAT_WRITEBACK, &pgdat->flags);
/* Allow kswapd to start writing pages during reclaim.*/
- if (sc->nr.unqueued_dirty == sc->nr.file_taken)
+ if (sc->nr.unqueued_dirty &&
+ sc->nr.unqueued_dirty == sc->nr.file_taken)
set_bit(PGDAT_DIRTY, &pgdat->flags);
/*
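The walk_pte_range() hunk above replaces pte_offset_map_nolock() with pte_offset_map_rw_nolock() and re-checks the PMD once the PTE lock is actually held. A minimal sketch of that lock-then-revalidate pattern, assuming the helpers behave as used in the hunk; the walker name and the elided scan body are hypothetical, not the kernel's exact code:

/* Sketch only: map a PTE table, take its lock, then confirm the PMD is unchanged. */
static bool example_walk_ptes(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
{
	spinlock_t *ptl;
	pmd_t pmdval;
	pte_t *pte;

	/* Map the PTE table without locking it; snapshot the covering PMD. */
	pte = pte_offset_map_rw_nolock(mm, pmd, addr & PMD_MASK, &pmdval, &ptl);
	if (!pte)
		return false;

	if (!spin_trylock(ptl)) {
		pte_unmap(pte);
		return false;
	}

	/* The table may have been freed or replaced before the lock was taken. */
	if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
		pte_unmap_unlock(pte, ptl);
		return false;
	}

	/* ... scan the PTEs under ptl, as walk_pte_range() does ... */

	pte_unmap_unlock(pte, ptl);
	return true;
}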
diff --git a/mm/vmstat.c b/mm/vmstat.c
index ac6a5aa34eab..4d016314a56c 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1273,6 +1273,9 @@ const char * const vmstat_text[] = {
"pgdemote_kswapd",
"pgdemote_direct",
"pgdemote_khugepaged",
+#ifdef CONFIG_HUGETLB_PAGE
+ "nr_hugetlb",
+#endif
/* system-wide enum vm_stat_item counters */
"nr_dirty_threshold",
"nr_dirty_background_threshold",
@@ -1780,6 +1783,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
zone_page_state(zone, i));
#ifdef CONFIG_NUMA
+ fold_vm_zone_numa_events(zone);
for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
seq_printf(m, "\n %-12s %lu", numa_stat_name(i),
zone_numa_event_state(zone, i));
@@ -1793,13 +1797,17 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
pcp = per_cpu_ptr(zone->per_cpu_pageset, i);
seq_printf(m,
"\n cpu: %i"
- "\n count: %i"
- "\n high: %i"
- "\n batch: %i",
+ "\n count: %i"
+ "\n high: %i"
+ "\n batch: %i"
+ "\n high_min: %i"
+ "\n high_max: %i",
i,
pcp->count,
pcp->high,
- pcp->batch);
+ pcp->batch,
+ pcp->high_min,
+ pcp->high_max);
#ifdef CONFIG_SMP
pzstats = per_cpu_ptr(zone->per_cpu_zonestats, i);
seq_printf(m, "\n vm stats threshold: %d",
@@ -1931,6 +1939,7 @@ static const struct seq_operations vmstat_op = {
#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
int sysctl_stat_interval __read_mostly = HZ;
+static int vmstat_late_init_done;
#ifdef CONFIG_PROC_FS
static void refresh_vm_stats(struct work_struct *work)
@@ -2133,7 +2142,8 @@ static void __init init_cpu_node_state(void)
static int vmstat_cpu_online(unsigned int cpu)
{
- refresh_zone_stat_thresholds();
+ if (vmstat_late_init_done)
+ refresh_zone_stat_thresholds();
if (!node_state(cpu_to_node(cpu), N_CPU)) {
node_set_state(cpu_to_node(cpu), N_CPU);
@@ -2165,6 +2175,14 @@ static int vmstat_cpu_dead(unsigned int cpu)
return 0;
}
+static int __init vmstat_late_init(void)
+{
+ refresh_zone_stat_thresholds();
+ vmstat_late_init_done = 1;
+
+ return 0;
+}
+late_initcall(vmstat_late_init);
#endif
struct workqueue_struct *mm_percpu_wq;
diff --git a/mm/workingset.c b/mm/workingset.c
index a2b28e356e68..a4705e196545 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -591,22 +591,12 @@ void workingset_refault(struct folio *folio, void *shadow)
*/
void workingset_activation(struct folio *folio)
{
- struct mem_cgroup *memcg;
-
- rcu_read_lock();
/*
* Filter non-memcg pages here, e.g. unmap can call
* mark_page_accessed() on VDSO pages.
- *
- * XXX: See workingset_refault() - this should return
- * root_mem_cgroup even for !CONFIG_MEMCG.
*/
- memcg = folio_memcg_rcu(folio);
- if (!mem_cgroup_disabled() && !memcg)
- goto out;
- workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio));
-out:
- rcu_read_unlock();
+ if (mem_cgroup_disabled() || folio_memcg_charged(folio))
+ workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio));
}
/*
@@ -712,8 +702,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
static enum lru_status shadow_lru_isolate(struct list_head *item,
struct list_lru_one *lru,
- spinlock_t *lru_lock,
- void *arg) __must_hold(lru_lock)
+ void *arg) __must_hold(lru->lock)
{
struct xa_node *node = container_of(item, struct xa_node, private_list);
struct address_space *mapping;
@@ -722,20 +711,20 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
/*
* Page cache insertions and deletions synchronously maintain
* the shadow node LRU under the i_pages lock and the
- * lru_lock. Because the page cache tree is emptied before
- * the inode can be destroyed, holding the lru_lock pins any
+ * &lru->lock. Because the page cache tree is emptied before
+ * the inode can be destroyed, holding the &lru->lock pins any
* address_space that has nodes on the LRU.
*
* We can then safely transition to the i_pages lock to
* pin only the address_space of the particular node we want
- * to reclaim, take the node off-LRU, and drop the lru_lock.
+ * to reclaim, take the node off-LRU, and drop the &lru->lock.
*/
mapping = container_of(node->array, struct address_space, i_pages);
/* Coming from the list, invert the lock order */
if (!xa_trylock(&mapping->i_pages)) {
- spin_unlock_irq(lru_lock);
+ spin_unlock_irq(&lru->lock);
ret = LRU_RETRY;
goto out;
}
@@ -744,7 +733,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
if (mapping->host != NULL) {
if (!spin_trylock(&mapping->host->i_lock)) {
xa_unlock(&mapping->i_pages);
- spin_unlock_irq(lru_lock);
+ spin_unlock_irq(&lru->lock);
ret = LRU_RETRY;
goto out;
}
@@ -753,7 +742,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
list_lru_isolate(lru, item);
__dec_node_page_state(virt_to_page(node), WORKINGSET_NODES);
- spin_unlock(lru_lock);
+ spin_unlock(&lru->lock);
/*
* The nodes should only contain one or more shadow entries,
@@ -777,7 +766,6 @@ out_invalid:
ret = LRU_REMOVED_RETRY;
out:
cond_resched();
- spin_lock_irq(lru_lock);
return ret;
}
@@ -823,8 +811,8 @@ static int __init workingset_init(void)
if (!workingset_shadow_shrinker)
goto err;
- ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key,
- workingset_shadow_shrinker);
+ ret = list_lru_init_memcg_key(&shadow_nodes, workingset_shadow_shrinker,
+ &shadow_nodes_key);
if (ret)
goto err_list_lru;
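The shadow_lru_isolate() change above reflects the new list_lru walk convention: the callback receives the struct list_lru_one, is entered with lru->lock held, and if it drops that lock it returns LRU_REMOVED_RETRY (or LRU_RETRY) without re-taking it. A hedged sketch of a callback written against that convention; the example_obj type is hypothetical:

struct example_obj {
	struct list_head lru;
	/* ... object state ... */
};

/* Entered with lru->lock held by the list_lru walker. */
static enum lru_status example_isolate(struct list_head *item,
				       struct list_lru_one *lru,
				       void *arg) __must_hold(lru->lock)
{
	struct example_obj *obj = container_of(item, struct example_obj, lru);

	list_lru_isolate(lru, item);
	/* Once the item is off the list, the per-node lock may be dropped... */
	spin_unlock(&lru->lock);
	kfree(obj);
	/* ...and, as in the hunk above, it is not re-acquired before returning. */
	return LRU_REMOVED_RETRY;
}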
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 16a07def09c9..64b66a4d3e6e 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1,3 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
/*
* zsmalloc memory allocator
*
@@ -261,7 +263,7 @@ struct zspage {
struct mapping_area {
local_lock_t lock;
char *vm_buf; /* copy buffer for objects that span pages */
- char *vm_addr; /* address of kmap_atomic()'ed pages */
+ char *vm_addr; /* address of kmap_local_page()'ed pages */
enum zs_mapmode vm_mm; /* mapping mode */
};
@@ -898,7 +900,7 @@ static void init_zspage(struct size_class *class, struct zspage *zspage)
set_first_obj_offset(page, off);
- vaddr = kmap_atomic(page);
+ vaddr = kmap_local_page(page);
link = (struct link_free *)vaddr + off / sizeof(*link);
while ((off += class->size) < PAGE_SIZE) {
@@ -921,7 +923,7 @@ static void init_zspage(struct size_class *class, struct zspage *zspage)
*/
link->next = -1UL << OBJ_TAG_BITS;
}
- kunmap_atomic(vaddr);
+ kunmap_local(vaddr);
page = next_page;
off %= PAGE_SIZE;
}
@@ -1044,11 +1046,10 @@ static inline void __zs_cpu_down(struct mapping_area *area)
static void *__zs_map_object(struct mapping_area *area,
struct page *pages[2], int off, int size)
{
- int sizes[2];
- void *addr;
+ size_t sizes[2];
char *buf = area->vm_buf;
- /* disable page faults to match kmap_atomic() return conditions */
+ /* disable page faults to match kmap_local_page() return conditions */
pagefault_disable();
/* no read fastpath */
@@ -1059,12 +1060,8 @@ static void *__zs_map_object(struct mapping_area *area,
sizes[1] = size - sizes[0];
/* copy object to per-cpu buffer */
- addr = kmap_atomic(pages[0]);
- memcpy(buf, addr + off, sizes[0]);
- kunmap_atomic(addr);
- addr = kmap_atomic(pages[1]);
- memcpy(buf + sizes[0], addr, sizes[1]);
- kunmap_atomic(addr);
+ memcpy_from_page(buf, pages[0], off, sizes[0]);
+ memcpy_from_page(buf + sizes[0], pages[1], 0, sizes[1]);
out:
return area->vm_buf;
}
@@ -1072,8 +1069,7 @@ out:
static void __zs_unmap_object(struct mapping_area *area,
struct page *pages[2], int off, int size)
{
- int sizes[2];
- void *addr;
+ size_t sizes[2];
char *buf;
/* no write fastpath */
@@ -1089,15 +1085,11 @@ static void __zs_unmap_object(struct mapping_area *area,
sizes[1] = size - sizes[0];
/* copy per-cpu buffer to object */
- addr = kmap_atomic(pages[0]);
- memcpy(addr + off, buf, sizes[0]);
- kunmap_atomic(addr);
- addr = kmap_atomic(pages[1]);
- memcpy(addr, buf + sizes[0], sizes[1]);
- kunmap_atomic(addr);
+ memcpy_to_page(pages[0], off, buf, sizes[0]);
+ memcpy_to_page(pages[1], 0, buf + sizes[0], sizes[1]);
out:
- /* enable page faults to match kunmap_atomic() return conditions */
+ /* enable page faults to match kunmap_local() return conditions */
pagefault_enable();
}
@@ -1223,7 +1215,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
area->vm_mm = mm;
if (off + class->size <= PAGE_SIZE) {
/* this object is contained entirely within a page */
- area->vm_addr = kmap_atomic(page);
+ area->vm_addr = kmap_local_page(page);
ret = area->vm_addr + off;
goto out;
}
@@ -1260,7 +1252,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
area = this_cpu_ptr(&zs_map_area);
if (off + class->size <= PAGE_SIZE)
- kunmap_atomic(area->vm_addr);
+ kunmap_local(area->vm_addr);
else {
struct page *pages[2];
@@ -1318,7 +1310,7 @@ static unsigned long obj_malloc(struct zs_pool *pool,
for (i = 0; i < nr_page; i++)
m_page = get_next_page(m_page);
- vaddr = kmap_atomic(m_page);
+ vaddr = kmap_local_page(m_page);
link = (struct link_free *)vaddr + m_offset / sizeof(*link);
set_freeobj(zspage, link->next >> OBJ_TAG_BITS);
if (likely(!ZsHugePage(zspage)))
@@ -1328,7 +1320,7 @@ static unsigned long obj_malloc(struct zs_pool *pool,
/* record handle to page->index */
zspage->first_page->index = handle | OBJ_ALLOCATED_TAG;
- kunmap_atomic(vaddr);
+ kunmap_local(vaddr);
mod_zspage_inuse(zspage, 1);
obj = location_to_obj(m_page, obj);
@@ -1419,7 +1411,7 @@ static void obj_free(int class_size, unsigned long obj)
f_offset = offset_in_page(class_size * f_objidx);
zspage = get_zspage(f_page);
- vaddr = kmap_atomic(f_page);
+ vaddr = kmap_local_page(f_page);
link = (struct link_free *)(vaddr + f_offset);
/* Insert this object in containing zspage's freelist */
@@ -1429,7 +1421,7 @@ static void obj_free(int class_size, unsigned long obj)
f_page->index = 0;
set_freeobj(zspage, f_objidx);
- kunmap_atomic(vaddr);
+ kunmap_local(vaddr);
mod_zspage_inuse(zspage, -1);
}
@@ -1492,8 +1484,8 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
if (d_off + class->size > PAGE_SIZE)
d_size = PAGE_SIZE - d_off;
- s_addr = kmap_atomic(s_page);
- d_addr = kmap_atomic(d_page);
+ s_addr = kmap_local_page(s_page);
+ d_addr = kmap_local_page(d_page);
while (1) {
size = min(s_size, d_size);
@@ -1509,33 +1501,33 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
d_size -= size;
/*
- * Calling kunmap_atomic(d_addr) is necessary. kunmap_atomic()
- * calls must occurs in reverse order of calls to kmap_atomic().
- * So, to call kunmap_atomic(s_addr) we should first call
- * kunmap_atomic(d_addr). For more details see
+ * Calling kunmap_local(d_addr) is necessary. kunmap_local()
+	 * calls must occur in reverse order of calls to kmap_local_page().
+ * So, to call kunmap_local(s_addr) we should first call
+ * kunmap_local(d_addr). For more details see
* Documentation/mm/highmem.rst.
*/
if (s_off >= PAGE_SIZE) {
- kunmap_atomic(d_addr);
- kunmap_atomic(s_addr);
+ kunmap_local(d_addr);
+ kunmap_local(s_addr);
s_page = get_next_page(s_page);
- s_addr = kmap_atomic(s_page);
- d_addr = kmap_atomic(d_page);
+ s_addr = kmap_local_page(s_page);
+ d_addr = kmap_local_page(d_page);
s_size = class->size - written;
s_off = 0;
}
if (d_off >= PAGE_SIZE) {
- kunmap_atomic(d_addr);
+ kunmap_local(d_addr);
d_page = get_next_page(d_page);
- d_addr = kmap_atomic(d_page);
+ d_addr = kmap_local_page(d_page);
d_size = class->size - written;
d_off = 0;
}
}
- kunmap_atomic(d_addr);
- kunmap_atomic(s_addr);
+ kunmap_local(d_addr);
+ kunmap_local(s_addr);
}
/*
@@ -1548,7 +1540,7 @@ static unsigned long find_alloced_obj(struct size_class *class,
unsigned int offset;
int index = *obj_idx;
unsigned long handle = 0;
- void *addr = kmap_atomic(page);
+ void *addr = kmap_local_page(page);
offset = get_first_obj_offset(page);
offset += class->size * index;
@@ -1561,7 +1553,7 @@ static unsigned long find_alloced_obj(struct size_class *class,
index++;
}
- kunmap_atomic(addr);
+ kunmap_local(addr);
*obj_idx = index;
@@ -1798,14 +1790,14 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
migrate_write_lock(zspage);
offset = get_first_obj_offset(page);
- s_addr = kmap_atomic(page);
+ s_addr = kmap_local_page(page);
/*
* Here, any user cannot access all objects in the zspage so let's move.
*/
- d_addr = kmap_atomic(newpage);
+ d_addr = kmap_local_page(newpage);
copy_page(d_addr, s_addr);
- kunmap_atomic(d_addr);
+ kunmap_local(d_addr);
for (addr = s_addr + offset; addr < s_addr + PAGE_SIZE;
addr += class->size) {
@@ -1818,7 +1810,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
record_obj(handle, new_obj);
}
}
- kunmap_atomic(s_addr);
+ kunmap_local(s_addr);
replace_sub_page(class, zspage, newpage, page);
/*
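The copy paths above swap open-coded kmap_atomic() pairs for memcpy_from_page()/memcpy_to_page(). Roughly, the helper used here reduces to a kmap_local-scoped copy, along the lines of the sketch below; the real helper in include/linux/highmem.h adds sanity checks, and the to-page variant also flushes the dcache:

/* Approximate shape of memcpy_from_page(); for illustration only. */
static inline void example_copy_from_page(char *to, struct page *page,
					  size_t offset, size_t len)
{
	char *from = kmap_local_page(page);

	memcpy(to, from + offset, len);
	kunmap_local(from);
}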
diff --git a/mm/zswap.c b/mm/zswap.c
index 0030ce8fecfc..f6316b66fb23 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -43,7 +43,7 @@
* statistics
**********************************/
/* The number of compressed pages currently stored in zswap */
-atomic_t zswap_stored_pages = ATOMIC_INIT(0);
+atomic_long_t zswap_stored_pages = ATOMIC_INIT(0);
/*
* The statistics below are not protected from concurrent access for
@@ -402,7 +402,7 @@ static void __zswap_pool_empty(struct percpu_ref *ref)
spin_unlock_bh(&zswap_pools_lock);
}
-static int __must_check zswap_pool_get(struct zswap_pool *pool)
+static int __must_check zswap_pool_tryget(struct zswap_pool *pool)
{
if (!pool)
return 0;
@@ -410,6 +410,12 @@ static int __must_check zswap_pool_get(struct zswap_pool *pool)
return percpu_ref_tryget(&pool->ref);
}
+/* The caller must already have a reference. */
+static void zswap_pool_get(struct zswap_pool *pool)
+{
+ percpu_ref_get(&pool->ref);
+}
+
static void zswap_pool_put(struct zswap_pool *pool)
{
percpu_ref_put(&pool->ref);
@@ -440,7 +446,7 @@ static struct zswap_pool *zswap_pool_current_get(void)
rcu_read_lock();
pool = __zswap_pool_current();
- if (!zswap_pool_get(pool))
+ if (!zswap_pool_tryget(pool))
pool = NULL;
rcu_read_unlock();
@@ -461,7 +467,7 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
if (strcmp(zpool_get_type(pool->zpool), type))
continue;
/* if we can't get it, it's about to be destroyed */
- if (!zswap_pool_get(pool))
+ if (!zswap_pool_tryget(pool))
continue;
return pool;
}
@@ -703,12 +709,11 @@ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
/*
* Note that it is safe to use rcu_read_lock() here, even in the face of
- * concurrent memcg offlining. Thanks to the memcg->kmemcg_id indirection
- * used in list_lru lookup, only two scenarios are possible:
+ * concurrent memcg offlining:
*
- * 1. list_lru_add() is called before memcg->kmemcg_id is updated. The
+ * 1. list_lru_add() is called before list_lru_one is dead. The
* new entry will be reparented to memcg's parent's list_lru.
- * 2. list_lru_add() is called after memcg->kmemcg_id is updated. The
+ * 2. list_lru_add() is called after list_lru_one is dead. The
* new entry will be added directly to memcg's parent's list_lru.
*
* Similar reasoning holds for list_lru_del().
@@ -802,7 +807,7 @@ static void zswap_entry_free(struct zswap_entry *entry)
obj_cgroup_put(entry->objcg);
}
zswap_entry_cache_free(entry);
- atomic_dec(&zswap_stored_pages);
+ atomic_long_dec(&zswap_stored_pages);
}
/*********************************
@@ -875,7 +880,8 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
return 0;
}
-static bool zswap_compress(struct folio *folio, struct zswap_entry *entry)
+static bool zswap_compress(struct page *page, struct zswap_entry *entry,
+ struct zswap_pool *pool)
{
struct crypto_acomp_ctx *acomp_ctx;
struct scatterlist input, output;
@@ -887,13 +893,13 @@ static bool zswap_compress(struct folio *folio, struct zswap_entry *entry)
gfp_t gfp;
u8 *dst;
- acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
+ acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
mutex_lock(&acomp_ctx->mutex);
dst = acomp_ctx->buffer;
sg_init_table(&input, 1);
- sg_set_folio(&input, folio, PAGE_SIZE, 0);
+ sg_set_page(&input, page, PAGE_SIZE, 0);
/*
* We need PAGE_SIZE * 2 here since there maybe over-compression case,
@@ -920,7 +926,7 @@ static bool zswap_compress(struct folio *folio, struct zswap_entry *entry)
if (comp_ret)
goto unlock;
- zpool = entry->pool->zpool;
+ zpool = pool->zpool;
gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
if (zpool_malloc_support_movable(zpool))
gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
@@ -1096,7 +1102,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
* for reclaim by this ratio.
*/
static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l,
- spinlock_t *lock, void *arg)
+ void *arg)
{
struct zswap_entry *entry = container_of(item, struct zswap_entry, lru);
bool *encountered_page_in_swapcache = (bool *)arg;
@@ -1152,7 +1158,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
* It's safe to drop the lock here because we return either
* LRU_REMOVED_RETRY or LRU_RETRY.
*/
- spin_unlock(lock);
+ spin_unlock(&l->lock);
writeback_result = zswap_writeback_entry(entry, swpentry);
@@ -1173,7 +1179,6 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
zswap_written_back_pages++;
}
- spin_lock(lock);
return ret;
}
@@ -1233,7 +1238,7 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED);
} else {
nr_backing = zswap_total_pages();
- nr_stored = atomic_read(&zswap_stored_pages);
+ nr_stored = atomic_long_read(&zswap_stored_pages);
}
if (!nr_stored)
@@ -1403,68 +1408,27 @@ resched:
/*********************************
* main API
**********************************/
-bool zswap_store(struct folio *folio)
+
+static ssize_t zswap_store_page(struct page *page,
+ struct obj_cgroup *objcg,
+ struct zswap_pool *pool)
{
- swp_entry_t swp = folio->swap;
- pgoff_t offset = swp_offset(swp);
- struct xarray *tree = swap_zswap_tree(swp);
+ swp_entry_t page_swpentry = page_swap_entry(page);
struct zswap_entry *entry, *old;
- struct obj_cgroup *objcg = NULL;
- struct mem_cgroup *memcg = NULL;
-
- VM_WARN_ON_ONCE(!folio_test_locked(folio));
- VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
-
- /* Large folios aren't supported */
- if (folio_test_large(folio))
- return false;
-
- if (!zswap_enabled)
- goto check_old;
-
- /* Check cgroup limits */
- objcg = get_obj_cgroup_from_folio(folio);
- if (objcg && !obj_cgroup_may_zswap(objcg)) {
- memcg = get_mem_cgroup_from_objcg(objcg);
- if (shrink_memcg(memcg)) {
- mem_cgroup_put(memcg);
- goto reject;
- }
- mem_cgroup_put(memcg);
- }
-
- if (zswap_check_limits())
- goto reject;
/* allocate entry */
- entry = zswap_entry_cache_alloc(GFP_KERNEL, folio_nid(folio));
+ entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page));
if (!entry) {
zswap_reject_kmemcache_fail++;
- goto reject;
- }
-
- /* if entry is successfully added, it keeps the reference */
- entry->pool = zswap_pool_current_get();
- if (!entry->pool)
- goto freepage;
-
- if (objcg) {
- memcg = get_mem_cgroup_from_objcg(objcg);
- if (memcg_list_lru_alloc(memcg, &zswap_list_lru, GFP_KERNEL)) {
- mem_cgroup_put(memcg);
- goto put_pool;
- }
- mem_cgroup_put(memcg);
+ return -EINVAL;
}
- if (!zswap_compress(folio, entry))
- goto put_pool;
-
- entry->swpentry = swp;
- entry->objcg = objcg;
- entry->referenced = true;
+ if (!zswap_compress(page, entry, pool))
+ goto compress_failed;
- old = xa_store(tree, offset, entry, GFP_KERNEL);
+ old = xa_store(swap_zswap_tree(page_swpentry),
+ swp_offset(page_swpentry),
+ entry, GFP_KERNEL);
if (xa_is_err(old)) {
int err = xa_err(old);
@@ -1481,10 +1445,15 @@ bool zswap_store(struct folio *folio)
if (old)
zswap_entry_free(old);
- if (objcg) {
- obj_cgroup_charge_zswap(objcg, entry->length);
- count_objcg_events(objcg, ZSWPOUT, 1);
- }
+ /*
+	 * The entry is successfully compressed and stored in the tree; there is
+ * no further possibility of failure. Grab refs to the pool and objcg.
+ * These refs will be dropped by zswap_entry_free() when the entry is
+ * removed from the tree.
+ */
+ zswap_pool_get(pool);
+ if (objcg)
+ obj_cgroup_get(objcg);
/*
* We finish initializing the entry while it's already in xarray.
@@ -1496,37 +1465,115 @@ bool zswap_store(struct folio *folio)
* The publishing order matters to prevent writeback from seeing
* an incoherent entry.
*/
+ entry->pool = pool;
+ entry->swpentry = page_swpentry;
+ entry->objcg = objcg;
+ entry->referenced = true;
if (entry->length) {
INIT_LIST_HEAD(&entry->lru);
zswap_lru_add(&zswap_list_lru, entry);
}
- /* update stats */
- atomic_inc(&zswap_stored_pages);
- count_vm_event(ZSWPOUT);
-
- return true;
+ return entry->length;
store_failed:
- zpool_free(entry->pool->zpool, entry->handle);
-put_pool:
- zswap_pool_put(entry->pool);
-freepage:
+ zpool_free(pool->zpool, entry->handle);
+compress_failed:
zswap_entry_cache_free(entry);
-reject:
+ return -EINVAL;
+}
+
+bool zswap_store(struct folio *folio)
+{
+ long nr_pages = folio_nr_pages(folio);
+ swp_entry_t swp = folio->swap;
+ struct obj_cgroup *objcg = NULL;
+ struct mem_cgroup *memcg = NULL;
+ struct zswap_pool *pool;
+ size_t compressed_bytes = 0;
+ bool ret = false;
+ long index;
+
+ VM_WARN_ON_ONCE(!folio_test_locked(folio));
+ VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
+
+ if (!zswap_enabled)
+ goto check_old;
+
+ objcg = get_obj_cgroup_from_folio(folio);
+ if (objcg && !obj_cgroup_may_zswap(objcg)) {
+ memcg = get_mem_cgroup_from_objcg(objcg);
+ if (shrink_memcg(memcg)) {
+ mem_cgroup_put(memcg);
+ goto put_objcg;
+ }
+ mem_cgroup_put(memcg);
+ }
+
+ if (zswap_check_limits())
+ goto put_objcg;
+
+ pool = zswap_pool_current_get();
+ if (!pool)
+ goto put_objcg;
+
+ if (objcg) {
+ memcg = get_mem_cgroup_from_objcg(objcg);
+ if (memcg_list_lru_alloc(memcg, &zswap_list_lru, GFP_KERNEL)) {
+ mem_cgroup_put(memcg);
+ goto put_pool;
+ }
+ mem_cgroup_put(memcg);
+ }
+
+ for (index = 0; index < nr_pages; ++index) {
+ struct page *page = folio_page(folio, index);
+ ssize_t bytes;
+
+ bytes = zswap_store_page(page, objcg, pool);
+ if (bytes < 0)
+ goto put_pool;
+ compressed_bytes += bytes;
+ }
+
+ if (objcg) {
+ obj_cgroup_charge_zswap(objcg, compressed_bytes);
+ count_objcg_events(objcg, ZSWPOUT, nr_pages);
+ }
+
+ atomic_long_add(nr_pages, &zswap_stored_pages);
+ count_vm_events(ZSWPOUT, nr_pages);
+
+ ret = true;
+
+put_pool:
+ zswap_pool_put(pool);
+put_objcg:
obj_cgroup_put(objcg);
- if (zswap_pool_reached_full)
+ if (!ret && zswap_pool_reached_full)
queue_work(shrink_wq, &zswap_shrink_work);
check_old:
/*
- * If the zswap store fails or zswap is disabled, we must invalidate the
- * possibly stale entry which was previously stored at this offset.
- * Otherwise, writeback could overwrite the new data in the swapfile.
+ * If the zswap store fails or zswap is disabled, we must invalidate
+ * the possibly stale entries which were previously stored at the
+ * offsets corresponding to each page of the folio. Otherwise,
+ * writeback could overwrite the new data in the swapfile.
*/
- entry = xa_erase(tree, offset);
- if (entry)
- zswap_entry_free(entry);
- return false;
+ if (!ret) {
+ unsigned type = swp_type(swp);
+ pgoff_t offset = swp_offset(swp);
+ struct zswap_entry *entry;
+ struct xarray *tree;
+
+ for (index = 0; index < nr_pages; ++index) {
+ tree = swap_zswap_tree(swp_entry(type, offset + index));
+ entry = xa_erase(tree, offset + index);
+ if (entry)
+ zswap_entry_free(entry);
+ }
+ }
+
+ return ret;
}
bool zswap_load(struct folio *folio)
@@ -1594,6 +1641,9 @@ void zswap_invalidate(swp_entry_t swp)
struct xarray *tree = swap_zswap_tree(swp);
struct zswap_entry *entry;
+ if (xa_empty(tree))
+ return;
+
entry = xa_erase(tree, offset);
if (entry)
zswap_entry_free(entry);
@@ -1651,6 +1701,13 @@ static int debugfs_get_total_size(void *data, u64 *val)
}
DEFINE_DEBUGFS_ATTRIBUTE(total_size_fops, debugfs_get_total_size, NULL, "%llu\n");
+static int debugfs_get_stored_pages(void *data, u64 *val)
+{
+ *val = atomic_long_read(&zswap_stored_pages);
+ return 0;
+}
+DEFINE_DEBUGFS_ATTRIBUTE(stored_pages_fops, debugfs_get_stored_pages, NULL, "%llu\n");
+
static int zswap_debugfs_init(void)
{
if (!debugfs_initialized())
@@ -1674,8 +1731,8 @@ static int zswap_debugfs_init(void)
zswap_debugfs_root, &zswap_written_back_pages);
debugfs_create_file("pool_total_size", 0444,
zswap_debugfs_root, NULL, &total_size_fops);
- debugfs_create_atomic_t("stored_pages", 0444,
- zswap_debugfs_root, &zswap_stored_pages);
+ debugfs_create_file("stored_pages", 0444,
+ zswap_debugfs_root, NULL, &stored_pages_fops);
return 0;
}
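Because debugfs_create_atomic_t() only understands atomic_t, the stored_pages counter is now exported through a DEFINE_DEBUGFS_ATTRIBUTE() getter, as above. The same pattern for a hypothetical atomic_long_t counter (parent_dentry is assumed to exist in the init path):

static atomic_long_t example_pages = ATOMIC_LONG_INIT(0);

static int example_pages_get(void *data, u64 *val)
{
	*val = atomic_long_read(&example_pages);
	return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(example_pages_fops, example_pages_get, NULL, "%llu\n");

	/* In the init path: */
	debugfs_create_file("example_pages", 0444, parent_dentry, NULL,
			    &example_pages_fops);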
diff --git a/scripts/module.lds.S b/scripts/module.lds.S
index 3f43edef813c..711c6e029936 100644
--- a/scripts/module.lds.S
+++ b/scripts/module.lds.S
@@ -50,7 +50,7 @@ SECTIONS {
.data : {
*(.data .data.[0-9a-zA-Z_]*)
*(.data..L*)
- CODETAG_SECTIONS()
+ MOD_CODETAG_SECTIONS()
}
.rodata : {
@@ -59,9 +59,10 @@ SECTIONS {
}
#else
.data : {
- CODETAG_SECTIONS()
+ MOD_CODETAG_SECTIONS()
}
#endif
+ MOD_SEPARATE_CODETAG_SECTIONS()
}
/* bring in arch-specific sections */
diff --git a/tools/include/uapi/asm-generic/mman-common.h b/tools/include/uapi/asm-generic/mman-common.h
index 6ce1f1ceb432..1ea2c4c33b86 100644
--- a/tools/include/uapi/asm-generic/mman-common.h
+++ b/tools/include/uapi/asm-generic/mman-common.h
@@ -79,6 +79,9 @@
#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */
+#define MADV_GUARD_INSTALL 102 /* fatal signal on access to range */
+#define MADV_GUARD_REMOVE 103 /* unguard range */
+
/* compatibility flags */
#define MAP_FILE 0
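These values back the MADV_GUARD_INSTALL/MADV_GUARD_REMOVE operations exercised by the new guard-pages selftest further below. A minimal userspace sketch, assuming a kernel and libc headers that already define the two advice values:

#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	char *buf = mmap(NULL, 10 * psz, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;

	/* Any access to the first page should now raise SIGSEGV. */
	if (madvise(buf, psz, MADV_GUARD_INSTALL))
		return 1;

	/* ... use buf[psz] onwards normally ... */

	/* Remove the guard marker again. */
	if (madvise(buf, psz, MADV_GUARD_REMOVE))
		return 1;

	return munmap(buf, 10 * psz);
}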
diff --git a/tools/mm/page_owner_sort.c b/tools/mm/page_owner_sort.c
index e1f264444342..880e36df0c11 100644
--- a/tools/mm/page_owner_sort.c
+++ b/tools/mm/page_owner_sort.c
@@ -377,6 +377,7 @@ static char *get_comm(char *buf)
if (errno != 0) {
if (debug_on)
fprintf(stderr, "wrong comm in follow buf:\n%s\n", buf);
+ free(comm_str);
return NULL;
}
diff --git a/tools/mm/slabinfo.c b/tools/mm/slabinfo.c
index 04e9e6ba86ea..1433eff99feb 100644
--- a/tools/mm/slabinfo.c
+++ b/tools/mm/slabinfo.c
@@ -21,7 +21,7 @@
#include <regex.h>
#include <errno.h>
-#define MAX_SLABS 500
+#define MAX_SLABS 2000
#define MAX_ALIASES 500
#define MAX_NODES 1024
@@ -1228,6 +1228,8 @@ static void read_slab_dir(void)
continue;
switch (de->d_type) {
case DT_LNK:
+ if (alias - aliasinfo == MAX_ALIASES)
+ fatal("Too many aliases\n");
alias->name = strdup(de->d_name);
count = readlink(de->d_name, buffer, sizeof(buffer)-1);
@@ -1242,6 +1244,8 @@ static void read_slab_dir(void)
alias++;
break;
case DT_DIR:
+ if (slab - slabinfo == MAX_SLABS)
+ fatal("Too many slabs\n");
if (chdir(de->d_name))
fatal("Unable to access slab %s\n", slab->name);
slab->name = strdup(de->d_name);
@@ -1312,10 +1316,6 @@ static void read_slab_dir(void)
slabs = slab - slabinfo;
actual_slabs = slabs;
aliases = alias - aliasinfo;
- if (slabs > MAX_SLABS)
- fatal("Too many slabs\n");
- if (aliases > MAX_ALIASES)
- fatal("Too many aliases\n");
}
static void output_slabs(void)
diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c
index 551ae6898c1d..bc30050227fd 100644
--- a/tools/testing/radix-tree/maple.c
+++ b/tools/testing/radix-tree/maple.c
@@ -462,6 +462,28 @@ static noinline void __init check_new_node(struct maple_tree *mt)
MT_BUG_ON(mt, mas_allocated(&mas) != 10 + MAPLE_ALLOC_SLOTS - 1);
mas_destroy(&mas);
+ mas.node = MA_ERROR(-ENOMEM);
+ mas_node_count(&mas, MAPLE_ALLOC_SLOTS + 1); /* Request */
+ mas_nomem(&mas, GFP_KERNEL); /* Fill request */
+ MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1);
+ mas.node = MA_ERROR(-ENOMEM);
+ mas_node_count(&mas, MAPLE_ALLOC_SLOTS * 2 + 2); /* Request */
+ mas_nomem(&mas, GFP_KERNEL); /* Fill request */
+ mas.status = ma_start;
+ MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS * 2 + 2);
+ mas_destroy(&mas);
+
+ mas.node = MA_ERROR(-ENOMEM);
+ mas_node_count(&mas, MAPLE_ALLOC_SLOTS * 2 + 1); /* Request */
+ mas_nomem(&mas, GFP_KERNEL); /* Fill request */
+ MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS * 2 + 1);
+ mas.node = MA_ERROR(-ENOMEM);
+ mas_node_count(&mas, MAPLE_ALLOC_SLOTS * 3 + 2); /* Request */
+ mas_nomem(&mas, GFP_KERNEL); /* Fill request */
+ mas.status = ma_start;
+ MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS * 3 + 2);
+ mas_destroy(&mas);
+
mtree_unlock(mt);
}
diff --git a/tools/testing/selftests/damon/_debugfs_common.sh b/tools/testing/selftests/damon/_debugfs_common.sh
index aa995516870b..54d45791b0d9 100644
--- a/tools/testing/selftests/damon/_debugfs_common.sh
+++ b/tools/testing/selftests/damon/_debugfs_common.sh
@@ -8,7 +8,12 @@ test_write_result() {
expect_reason=$4
expected=$5
- echo "$content" > "$file"
+ if [ "$expected" = "0" ]
+ then
+ echo "$content" > "$file"
+ else
+ echo "$content" > "$file" 2> /dev/null
+ fi
if [ $? -ne "$expected" ]
then
echo "writing $content to $file doesn't return $expected"
diff --git a/tools/testing/selftests/damon/access_memory_even.c b/tools/testing/selftests/damon/access_memory_even.c
index 3be121487432..a9f4e9aaf3a9 100644
--- a/tools/testing/selftests/damon/access_memory_even.c
+++ b/tools/testing/selftests/damon/access_memory_even.c
@@ -14,10 +14,8 @@
int main(int argc, char *argv[])
{
char **regions;
- clock_t start_clock;
int nr_regions;
int sz_region;
- int access_time_ms;
int i;
if (argc != 3) {
diff --git a/tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh b/tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh
index 4a76e37ef16b..bd6c22d96ead 100755
--- a/tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh
+++ b/tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh
@@ -12,7 +12,7 @@ then
exit 1
fi
-if echo foo > "$DBGFS/mk_contexts"
+if echo foo > "$DBGFS/mk_contexts" 2> /dev/null
then
echo "duplicate context creation success"
exit 1
diff --git a/tools/testing/selftests/damon/huge_count_read_write.c b/tools/testing/selftests/damon/huge_count_read_write.c
index a6fe0689f88d..53e69a669668 100644
--- a/tools/testing/selftests/damon/huge_count_read_write.c
+++ b/tools/testing/selftests/damon/huge_count_read_write.c
@@ -18,7 +18,7 @@
void write_read_with_huge_count(char *file)
{
int filedesc = open(file, O_RDWR);
- char buf[25];
+ char buf[256];
int ret;
printf("%s %s\n", __func__, file);
@@ -28,9 +28,7 @@ void write_read_with_huge_count(char *file)
}
write(filedesc, "", 0xfffffffful);
- perror("after write: ");
ret = read(filedesc, buf, 0xfffffffful);
- perror("after read: ");
close(filedesc);
}
diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore
index da030b43e43b..8f01f4da1c0d 100644
--- a/tools/testing/selftests/mm/.gitignore
+++ b/tools/testing/selftests/mm/.gitignore
@@ -51,3 +51,7 @@ hugetlb_madv_vs_map
mseal_test
seal_elf
droppable
+hugetlb_dio
+pkey_sighandler_tests_32
+pkey_sighandler_tests_64
+guard-pages
diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index 321e63955272..3de23ea4663f 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -90,6 +90,7 @@ TEST_GEN_FILES += hugetlb_fault_after_madv
TEST_GEN_FILES += hugetlb_madv_vs_map
TEST_GEN_FILES += hugetlb_dio
TEST_GEN_FILES += droppable
+TEST_GEN_FILES += guard-pages
ifneq ($(ARCH),arm64)
TEST_GEN_FILES += soft-dirty
@@ -126,7 +127,9 @@ endif
ifneq (,$(filter $(ARCH),arm64 mips64 parisc64 powerpc riscv64 s390x sparc64 x86_64 s390))
TEST_GEN_FILES += va_high_addr_switch
+ifneq ($(ARCH),riscv64)
TEST_GEN_FILES += virtual_address_range
+endif
TEST_GEN_FILES += write_to_hugetlbfs
endif
diff --git a/tools/testing/selftests/mm/guard-pages.c b/tools/testing/selftests/mm/guard-pages.c
new file mode 100644
index 000000000000..7cdf815d0d63
--- /dev/null
+++ b/tools/testing/selftests/mm/guard-pages.c
@@ -0,0 +1,1243 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#define _GNU_SOURCE
+#include "../kselftest_harness.h"
+#include <asm-generic/mman.h> /* Force the import of the tools version. */
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/userfaultfd.h>
+#include <setjmp.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/uio.h>
+#include <unistd.h>
+
+/*
+ * Ignore the checkpatch warning, as per the C99 standard, section 7.14.1.1:
+ *
+ * "If the signal occurs other than as the result of calling the abort or raise
+ * function, the behavior is undefined if the signal handler refers to any
+ * object with static storage duration other than by assigning a value to an
+ * object declared as volatile sig_atomic_t"
+ */
+static volatile sig_atomic_t signal_jump_set;
+static sigjmp_buf signal_jmp_buf;
+
+/*
+ * Ignore the checkpatch warning, we must read from x but don't want to do
+ * anything with it in order to trigger a read page fault. We therefore must use
+ * volatile to stop the compiler from optimising this away.
+ */
+#define FORCE_READ(x) (*(volatile typeof(x) *)x)
+
+static int userfaultfd(int flags)
+{
+ return syscall(SYS_userfaultfd, flags);
+}
+
+static void handle_fatal(int c)
+{
+ if (!signal_jump_set)
+ return;
+
+ siglongjmp(signal_jmp_buf, c);
+}
+
+static int pidfd_open(pid_t pid, unsigned int flags)
+{
+ return syscall(SYS_pidfd_open, pid, flags);
+}
+
+/*
+ * Enable our signal catcher and try to read/write the specified buffer. The
+ * return value indicates whether the read/write succeeds without a fatal
+ * signal.
+ */
+static bool try_access_buf(char *ptr, bool write)
+{
+ bool failed;
+
+ /* Tell signal handler to jump back here on fatal signal. */
+ signal_jump_set = true;
+ /* If a fatal signal arose, we will jump back here and failed is set. */
+ failed = sigsetjmp(signal_jmp_buf, 0) != 0;
+
+ if (!failed) {
+ if (write)
+ *ptr = 'x';
+ else
+ FORCE_READ(ptr);
+ }
+
+ signal_jump_set = false;
+ return !failed;
+}
+
+/* Try and read from a buffer, return true if no fatal signal. */
+static bool try_read_buf(char *ptr)
+{
+ return try_access_buf(ptr, false);
+}
+
+/* Try and write to a buffer, return true if no fatal signal. */
+static bool try_write_buf(char *ptr)
+{
+ return try_access_buf(ptr, true);
+}
+
+/*
+ * Try and BOTH read from AND write to a buffer, return true if BOTH operations
+ * succeed.
+ */
+static bool try_read_write_buf(char *ptr)
+{
+ return try_read_buf(ptr) && try_write_buf(ptr);
+}
+
+FIXTURE(guard_pages)
+{
+ unsigned long page_size;
+};
+
+FIXTURE_SETUP(guard_pages)
+{
+ struct sigaction act = {
+ .sa_handler = &handle_fatal,
+ .sa_flags = SA_NODEFER,
+ };
+
+ sigemptyset(&act.sa_mask);
+ if (sigaction(SIGSEGV, &act, NULL))
+ ksft_exit_fail_perror("sigaction");
+
+ self->page_size = (unsigned long)sysconf(_SC_PAGESIZE);
+};
+
+FIXTURE_TEARDOWN(guard_pages)
+{
+ struct sigaction act = {
+ .sa_handler = SIG_DFL,
+ .sa_flags = SA_NODEFER,
+ };
+
+ sigemptyset(&act.sa_mask);
+ sigaction(SIGSEGV, &act, NULL);
+}
+
+TEST_F(guard_pages, basic)
+{
+ const unsigned long NUM_PAGES = 10;
+ const unsigned long page_size = self->page_size;
+ char *ptr;
+ int i;
+
+ ptr = mmap(NULL, NUM_PAGES * page_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANON, -1, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /* Trivially assert we can touch the first page. */
+ ASSERT_TRUE(try_read_write_buf(ptr));
+
+ ASSERT_EQ(madvise(ptr, page_size, MADV_GUARD_INSTALL), 0);
+
+ /* Establish that 1st page SIGSEGV's. */
+ ASSERT_FALSE(try_read_write_buf(ptr));
+
+ /* Ensure we can touch everything else.*/
+ for (i = 1; i < NUM_PAGES; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_TRUE(try_read_write_buf(curr));
+ }
+
+ /* Establish a guard page at the end of the mapping. */
+ ASSERT_EQ(madvise(&ptr[(NUM_PAGES - 1) * page_size], page_size,
+ MADV_GUARD_INSTALL), 0);
+
+ /* Check that both guard pages result in SIGSEGV. */
+ ASSERT_FALSE(try_read_write_buf(ptr));
+ ASSERT_FALSE(try_read_write_buf(&ptr[(NUM_PAGES - 1) * page_size]));
+
+ /* Remove the first guard page. */
+ ASSERT_FALSE(madvise(ptr, page_size, MADV_GUARD_REMOVE));
+
+ /* Make sure we can touch it. */
+ ASSERT_TRUE(try_read_write_buf(ptr));
+
+ /* Remove the last guard page. */
+ ASSERT_FALSE(madvise(&ptr[(NUM_PAGES - 1) * page_size], page_size,
+ MADV_GUARD_REMOVE));
+
+ /* Make sure we can touch it. */
+ ASSERT_TRUE(try_read_write_buf(&ptr[(NUM_PAGES - 1) * page_size]));
+
+ /*
+ * Test setting a _range_ of pages, namely the first 3. The first of
+	 * these will be faulted in, so this also tests that we can install guard
+ * pages over backed pages.
+ */
+ ASSERT_EQ(madvise(ptr, 3 * page_size, MADV_GUARD_INSTALL), 0);
+
+ /* Make sure they are all guard pages. */
+ for (i = 0; i < 3; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_FALSE(try_read_write_buf(curr));
+ }
+
+ /* Make sure the rest are not. */
+ for (i = 3; i < NUM_PAGES; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_TRUE(try_read_write_buf(curr));
+ }
+
+ /* Remove guard pages. */
+ ASSERT_EQ(madvise(ptr, NUM_PAGES * page_size, MADV_GUARD_REMOVE), 0);
+
+ /* Now make sure we can touch everything. */
+ for (i = 0; i < NUM_PAGES; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_TRUE(try_read_write_buf(curr));
+ }
+
+ /*
+ * Now remove all guard pages, make sure we don't remove existing
+ * entries.
+ */
+ ASSERT_EQ(madvise(ptr, NUM_PAGES * page_size, MADV_GUARD_REMOVE), 0);
+
+ for (i = 0; i < NUM_PAGES * page_size; i += page_size) {
+ char chr = ptr[i];
+
+ ASSERT_EQ(chr, 'x');
+ }
+
+ ASSERT_EQ(munmap(ptr, NUM_PAGES * page_size), 0);
+}
+
+/* Assert that operations applied across multiple VMAs work as expected. */
+TEST_F(guard_pages, multi_vma)
+{
+ const unsigned long page_size = self->page_size;
+ char *ptr_region, *ptr, *ptr1, *ptr2, *ptr3;
+ int i;
+
+ /* Reserve a 100 page region over which we can install VMAs. */
+ ptr_region = mmap(NULL, 100 * page_size, PROT_NONE,
+ MAP_ANON | MAP_PRIVATE, -1, 0);
+ ASSERT_NE(ptr_region, MAP_FAILED);
+
+ /* Place a VMA of 10 pages size at the start of the region. */
+ ptr1 = mmap(ptr_region, 10 * page_size, PROT_READ | PROT_WRITE,
+ MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0);
+ ASSERT_NE(ptr1, MAP_FAILED);
+
+ /* Place a VMA of 5 pages size 50 pages into the region. */
+ ptr2 = mmap(&ptr_region[50 * page_size], 5 * page_size,
+ PROT_READ | PROT_WRITE,
+ MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0);
+ ASSERT_NE(ptr2, MAP_FAILED);
+
+ /* Place a VMA of 20 pages size at the end of the region. */
+ ptr3 = mmap(&ptr_region[80 * page_size], 20 * page_size,
+ PROT_READ | PROT_WRITE,
+ MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0);
+ ASSERT_NE(ptr3, MAP_FAILED);
+
+ /* Unmap gaps. */
+ ASSERT_EQ(munmap(&ptr_region[10 * page_size], 40 * page_size), 0);
+ ASSERT_EQ(munmap(&ptr_region[55 * page_size], 25 * page_size), 0);
+
+ /*
+ * We end up with VMAs like this:
+ *
+ * 0 10 .. 50 55 .. 80 100
+ * [---] [---] [---]
+ */
+
+ /*
+ * Now mark the whole range as guard pages and make sure all VMAs are as
+ * such.
+ */
+
+ /*
+	 * madvise() is certifiable and lets you perform operations over gaps;
+	 * everything works, but it indicates an error and errno is set to
+ * -ENOMEM. Also if anything runs out of memory it is set to
+ * -ENOMEM. You are meant to guess which is which.
+ */
+ ASSERT_EQ(madvise(ptr_region, 100 * page_size, MADV_GUARD_INSTALL), -1);
+ ASSERT_EQ(errno, ENOMEM);
+
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr1[i * page_size];
+
+ ASSERT_FALSE(try_read_write_buf(curr));
+ }
+
+ for (i = 0; i < 5; i++) {
+ char *curr = &ptr2[i * page_size];
+
+ ASSERT_FALSE(try_read_write_buf(curr));
+ }
+
+ for (i = 0; i < 20; i++) {
+ char *curr = &ptr3[i * page_size];
+
+ ASSERT_FALSE(try_read_write_buf(curr));
+ }
+
+	/* Now remove guard pages over the range and assert the opposite. */
+
+ ASSERT_EQ(madvise(ptr_region, 100 * page_size, MADV_GUARD_REMOVE), -1);
+ ASSERT_EQ(errno, ENOMEM);
+
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr1[i * page_size];
+
+ ASSERT_TRUE(try_read_write_buf(curr));
+ }
+
+ for (i = 0; i < 5; i++) {
+ char *curr = &ptr2[i * page_size];
+
+ ASSERT_TRUE(try_read_write_buf(curr));
+ }
+
+ for (i = 0; i < 20; i++) {
+ char *curr = &ptr3[i * page_size];
+
+ ASSERT_TRUE(try_read_write_buf(curr));
+ }
+
+ /* Now map incompatible VMAs in the gaps. */
+ ptr = mmap(&ptr_region[10 * page_size], 40 * page_size,
+ PROT_READ | PROT_WRITE | PROT_EXEC,
+ MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+ ptr = mmap(&ptr_region[55 * page_size], 25 * page_size,
+ PROT_READ | PROT_WRITE | PROT_EXEC,
+ MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /*
+ * We end up with VMAs like this:
+ *
+ * 0 10 .. 50 55 .. 80 100
+ * [---][xxxx][---][xxxx][---]
+ *
+ * Where 'x' signifies VMAs that cannot be merged with those adjacent to
+ * them.
+ */
+
+ /* Multiple VMAs adjacent to one another should result in no error. */
+ ASSERT_EQ(madvise(ptr_region, 100 * page_size, MADV_GUARD_INSTALL), 0);
+ for (i = 0; i < 100; i++) {
+ char *curr = &ptr_region[i * page_size];
+
+ ASSERT_FALSE(try_read_write_buf(curr));
+ }
+ ASSERT_EQ(madvise(ptr_region, 100 * page_size, MADV_GUARD_REMOVE), 0);
+ for (i = 0; i < 100; i++) {
+ char *curr = &ptr_region[i * page_size];
+
+ ASSERT_TRUE(try_read_write_buf(curr));
+ }
+
+ /* Cleanup. */
+ ASSERT_EQ(munmap(ptr_region, 100 * page_size), 0);
+}
+
+/*
+ * Assert that batched operations performed using process_madvise() work as
+ * expected.
+ */
+TEST_F(guard_pages, process_madvise)
+{
+ const unsigned long page_size = self->page_size;
+ pid_t pid = getpid();
+ int pidfd = pidfd_open(pid, 0);
+ char *ptr_region, *ptr1, *ptr2, *ptr3;
+ ssize_t count;
+ struct iovec vec[6];
+
+ ASSERT_NE(pidfd, -1);
+
+ /* Reserve region to map over. */
+ ptr_region = mmap(NULL, 100 * page_size, PROT_NONE,
+ MAP_ANON | MAP_PRIVATE, -1, 0);
+ ASSERT_NE(ptr_region, MAP_FAILED);
+
+ /*
+	 * 10 pages offset 1 page into reserve region. We MAP_POPULATE so the
+	 * pages are faulted in up front, which tests installing guard markers
+	 * over existing page table entries.
+ */
+ ptr1 = mmap(&ptr_region[page_size], 10 * page_size,
+ PROT_READ | PROT_WRITE,
+ MAP_FIXED | MAP_ANON | MAP_PRIVATE | MAP_POPULATE, -1, 0);
+ ASSERT_NE(ptr1, MAP_FAILED);
+ /* We want guard markers at start/end of each VMA. */
+ vec[0].iov_base = ptr1;
+ vec[0].iov_len = page_size;
+ vec[1].iov_base = &ptr1[9 * page_size];
+ vec[1].iov_len = page_size;
+
+ /* 5 pages offset 50 pages into reserve region. */
+ ptr2 = mmap(&ptr_region[50 * page_size], 5 * page_size,
+ PROT_READ | PROT_WRITE,
+ MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0);
+ ASSERT_NE(ptr2, MAP_FAILED);
+ vec[2].iov_base = ptr2;
+ vec[2].iov_len = page_size;
+ vec[3].iov_base = &ptr2[4 * page_size];
+ vec[3].iov_len = page_size;
+
+ /* 20 pages offset 79 pages into reserve region. */
+ ptr3 = mmap(&ptr_region[79 * page_size], 20 * page_size,
+ PROT_READ | PROT_WRITE,
+ MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0);
+ ASSERT_NE(ptr3, MAP_FAILED);
+ vec[4].iov_base = ptr3;
+ vec[4].iov_len = page_size;
+ vec[5].iov_base = &ptr3[19 * page_size];
+ vec[5].iov_len = page_size;
+
+ /* Free surrounding VMAs. */
+ ASSERT_EQ(munmap(ptr_region, page_size), 0);
+ ASSERT_EQ(munmap(&ptr_region[11 * page_size], 39 * page_size), 0);
+ ASSERT_EQ(munmap(&ptr_region[55 * page_size], 24 * page_size), 0);
+ ASSERT_EQ(munmap(&ptr_region[99 * page_size], page_size), 0);
+
+ /* Now guard in one step. */
+ count = process_madvise(pidfd, vec, 6, MADV_GUARD_INSTALL, 0);
+
+ /* OK we don't have permission to do this, skip. */
+ if (count == -1 && errno == EPERM)
+ ksft_exit_skip("No process_madvise() permissions, try running as root.\n");
+
+ /* Returns the number of bytes advised. */
+ ASSERT_EQ(count, 6 * page_size);
+
+ /* Now make sure the guarding was applied. */
+
+ ASSERT_FALSE(try_read_write_buf(ptr1));
+ ASSERT_FALSE(try_read_write_buf(&ptr1[9 * page_size]));
+
+ ASSERT_FALSE(try_read_write_buf(ptr2));
+ ASSERT_FALSE(try_read_write_buf(&ptr2[4 * page_size]));
+
+ ASSERT_FALSE(try_read_write_buf(ptr3));
+ ASSERT_FALSE(try_read_write_buf(&ptr3[19 * page_size]));
+
+ /* Now do the same with unguard... */
+ count = process_madvise(pidfd, vec, 6, MADV_GUARD_REMOVE, 0);
+
+ /* ...and everything should now succeed. */
+
+ ASSERT_TRUE(try_read_write_buf(ptr1));
+ ASSERT_TRUE(try_read_write_buf(&ptr1[9 * page_size]));
+
+ ASSERT_TRUE(try_read_write_buf(ptr2));
+ ASSERT_TRUE(try_read_write_buf(&ptr2[4 * page_size]));
+
+ ASSERT_TRUE(try_read_write_buf(ptr3));
+ ASSERT_TRUE(try_read_write_buf(&ptr3[19 * page_size]));
+
+ /* Cleanup. */
+ ASSERT_EQ(munmap(ptr1, 10 * page_size), 0);
+ ASSERT_EQ(munmap(ptr2, 5 * page_size), 0);
+ ASSERT_EQ(munmap(ptr3, 20 * page_size), 0);
+ close(pidfd);
+}
+
+/* Assert that unmapping ranges does not leave guard markers behind. */
+TEST_F(guard_pages, munmap)
+{
+ const unsigned long page_size = self->page_size;
+ char *ptr, *ptr_new1, *ptr_new2;
+
+ ptr = mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANON | MAP_PRIVATE, -1, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /* Guard first and last pages. */
+ ASSERT_EQ(madvise(ptr, page_size, MADV_GUARD_INSTALL), 0);
+ ASSERT_EQ(madvise(&ptr[9 * page_size], page_size, MADV_GUARD_INSTALL), 0);
+
+ /* Assert that they are guarded. */
+ ASSERT_FALSE(try_read_write_buf(ptr));
+ ASSERT_FALSE(try_read_write_buf(&ptr[9 * page_size]));
+
+ /* Unmap them. */
+ ASSERT_EQ(munmap(ptr, page_size), 0);
+ ASSERT_EQ(munmap(&ptr[9 * page_size], page_size), 0);
+
+ /* Map over them.*/
+ ptr_new1 = mmap(ptr, page_size, PROT_READ | PROT_WRITE,
+ MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0);
+ ASSERT_NE(ptr_new1, MAP_FAILED);
+ ptr_new2 = mmap(&ptr[9 * page_size], page_size, PROT_READ | PROT_WRITE,
+ MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0);
+ ASSERT_NE(ptr_new2, MAP_FAILED);
+
+ /* Assert that they are now not guarded. */
+ ASSERT_TRUE(try_read_write_buf(ptr_new1));
+ ASSERT_TRUE(try_read_write_buf(ptr_new2));
+
+ /* Cleanup. */
+ ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+/* Assert that mprotect() operations have no bearing on guard markers. */
+TEST_F(guard_pages, mprotect)
+{
+ const unsigned long page_size = self->page_size;
+ char *ptr;
+ int i;
+
+ ptr = mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANON | MAP_PRIVATE, -1, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /* Guard the middle of the range. */
+ ASSERT_EQ(madvise(&ptr[5 * page_size], 2 * page_size,
+ MADV_GUARD_INSTALL), 0);
+
+ /* Assert that it is indeed guarded. */
+ ASSERT_FALSE(try_read_write_buf(&ptr[5 * page_size]));
+ ASSERT_FALSE(try_read_write_buf(&ptr[6 * page_size]));
+
+ /* Now make these pages read-only. */
+ ASSERT_EQ(mprotect(&ptr[5 * page_size], 2 * page_size, PROT_READ), 0);
+
+ /* Make sure the range is still guarded. */
+ ASSERT_FALSE(try_read_buf(&ptr[5 * page_size]));
+ ASSERT_FALSE(try_read_buf(&ptr[6 * page_size]));
+
+ /* Make sure we can guard again without issue.*/
+ ASSERT_EQ(madvise(&ptr[5 * page_size], 2 * page_size,
+ MADV_GUARD_INSTALL), 0);
+
+ /* Make sure the range is, yet again, still guarded. */
+ ASSERT_FALSE(try_read_buf(&ptr[5 * page_size]));
+ ASSERT_FALSE(try_read_buf(&ptr[6 * page_size]));
+
+ /* Now unguard the whole range. */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_REMOVE), 0);
+
+ /* Make sure the whole range is readable. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_TRUE(try_read_buf(curr));
+ }
+
+ /* Cleanup. */
+ ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+/* Split and merge VMAs and make sure guard pages still behave. */
+TEST_F(guard_pages, split_merge)
+{
+ const unsigned long page_size = self->page_size;
+ char *ptr, *ptr_new;
+ int i;
+
+ ptr = mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANON | MAP_PRIVATE, -1, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /* Guard the whole range. */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_INSTALL), 0);
+
+ /* Make sure the whole range is guarded. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_FALSE(try_read_write_buf(curr));
+ }
+
+ /* Now unmap some pages in the range so we split. */
+ ASSERT_EQ(munmap(&ptr[2 * page_size], page_size), 0);
+ ASSERT_EQ(munmap(&ptr[5 * page_size], page_size), 0);
+ ASSERT_EQ(munmap(&ptr[8 * page_size], page_size), 0);
+
+ /* Make sure the remaining ranges are guarded post-split. */
+ for (i = 0; i < 2; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_FALSE(try_read_write_buf(curr));
+ }
+ for (i = 2; i < 5; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_FALSE(try_read_write_buf(curr));
+ }
+ for (i = 6; i < 8; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_FALSE(try_read_write_buf(curr));
+ }
+ for (i = 9; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_FALSE(try_read_write_buf(curr));
+ }
+
+ /* Now map them again - the unmap will have cleared the guards. */
+ ptr_new = mmap(&ptr[2 * page_size], page_size, PROT_READ | PROT_WRITE,
+ MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0);
+ ASSERT_NE(ptr_new, MAP_FAILED);
+ ptr_new = mmap(&ptr[5 * page_size], page_size, PROT_READ | PROT_WRITE,
+ MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0);
+ ASSERT_NE(ptr_new, MAP_FAILED);
+ ptr_new = mmap(&ptr[8 * page_size], page_size, PROT_READ | PROT_WRITE,
+ MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0);
+ ASSERT_NE(ptr_new, MAP_FAILED);
+
+ /* Now make sure guard pages are established. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+ bool result = try_read_write_buf(curr);
+ bool expect_true = i == 2 || i == 5 || i == 8;
+
+ ASSERT_TRUE(expect_true ? result : !result);
+ }
+
+ /* Now guard everything again. */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_INSTALL), 0);
+
+ /* Make sure the whole range is guarded. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_FALSE(try_read_write_buf(curr));
+ }
+
+ /* Now split the range into three. */
+ ASSERT_EQ(mprotect(ptr, 3 * page_size, PROT_READ), 0);
+ ASSERT_EQ(mprotect(&ptr[7 * page_size], 3 * page_size, PROT_READ), 0);
+
+ /* Make sure the whole range is guarded for read. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_FALSE(try_read_buf(curr));
+ }
+
+ /* Now reset protection bits so we merge the whole thing. */
+ ASSERT_EQ(mprotect(ptr, 3 * page_size, PROT_READ | PROT_WRITE), 0);
+ ASSERT_EQ(mprotect(&ptr[7 * page_size], 3 * page_size,
+ PROT_READ | PROT_WRITE), 0);
+
+ /* Make sure the whole range is still guarded. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_FALSE(try_read_write_buf(curr));
+ }
+
+ /* Split range into 3 again... */
+ ASSERT_EQ(mprotect(ptr, 3 * page_size, PROT_READ), 0);
+ ASSERT_EQ(mprotect(&ptr[7 * page_size], 3 * page_size, PROT_READ), 0);
+
+ /* ...and unguard the whole range. */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_REMOVE), 0);
+
+ /* Make sure the whole range is remedied for read. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_TRUE(try_read_buf(curr));
+ }
+
+ /* Merge them again. */
+ ASSERT_EQ(mprotect(ptr, 3 * page_size, PROT_READ | PROT_WRITE), 0);
+ ASSERT_EQ(mprotect(&ptr[7 * page_size], 3 * page_size,
+ PROT_READ | PROT_WRITE), 0);
+
+ /* Now ensure the merged range is remedied for read/write. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_TRUE(try_read_write_buf(curr));
+ }
+
+ /* Cleanup. */
+ ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+/* Assert that MADV_DONTNEED does not remove guard markers. */
+TEST_F(guard_pages, dontneed)
+{
+ const unsigned long page_size = self->page_size;
+ char *ptr;
+ int i;
+
+ ptr = mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANON | MAP_PRIVATE, -1, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /* Back the whole range. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+
+ *curr = 'y';
+ }
+
+ /* Guard every other page. */
+ for (i = 0; i < 10; i += 2) {
+ char *curr = &ptr[i * page_size];
+ int res = madvise(curr, page_size, MADV_GUARD_INSTALL);
+
+ ASSERT_EQ(res, 0);
+ }
+
+ /* Indicate that we don't need any of the range. */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_DONTNEED), 0);
+
+ /* Check to ensure guard markers are still in place. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+ bool result = try_read_buf(curr);
+
+ if (i % 2 == 0) {
+ ASSERT_FALSE(result);
+ } else {
+ ASSERT_TRUE(result);
+ /* Make sure we really did get reset to zero page. */
+ ASSERT_EQ(*curr, '\0');
+ }
+
+ /* Now write... */
+ result = try_write_buf(&ptr[i * page_size]);
+
+ /* ...and make sure same result. */
+ ASSERT_TRUE(i % 2 != 0 ? result : !result);
+ }
+
+ /* Cleanup. */
+ ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+/* Assert that mlock()'ed pages work correctly with guard markers. */
+TEST_F(guard_pages, mlock)
+{
+ const unsigned long page_size = self->page_size;
+ char *ptr;
+ int i;
+
+ ptr = mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANON | MAP_PRIVATE, -1, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /* Populate. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+
+ *curr = 'y';
+ }
+
+ /* Lock. */
+ ASSERT_EQ(mlock(ptr, 10 * page_size), 0);
+
+ /* Now try to guard, should fail with EINVAL. */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_INSTALL), -1);
+ ASSERT_EQ(errno, EINVAL);
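+
+	/*
+	 * Installing guard markers discards any pages already present in the
+	 * range, which is destructive, so the kernel refuses to do so on
+	 * mlock()'d memory.
+	 */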
+
+ /* OK unlock. */
+ ASSERT_EQ(munlock(ptr, 10 * page_size), 0);
+
+ /* Guard first half of range, should now succeed. */
+ ASSERT_EQ(madvise(ptr, 5 * page_size, MADV_GUARD_INSTALL), 0);
+
+ /* Make sure guard works. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+ bool result = try_read_write_buf(curr);
+
+ if (i < 5) {
+ ASSERT_FALSE(result);
+ } else {
+ ASSERT_TRUE(result);
+ ASSERT_EQ(*curr, 'x');
+ }
+ }
+
+ /*
+ * Now lock the latter part of the range. We can't lock the guard pages,
+ * as this would result in the pages being populated and the guarding
+ * would cause this to error out.
+ */
+ ASSERT_EQ(mlock(&ptr[5 * page_size], 5 * page_size), 0);
+
+ /*
+ * Now remove guard pages, we permit mlock()'d ranges to have guard
+ * pages removed as it is a non-destructive operation.
+ */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_REMOVE), 0);
+
+ /* Now check that no guard pages remain. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_TRUE(try_read_write_buf(curr));
+ }
+
+ /* Cleanup. */
+ ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+/*
+ * Assert that moving, extending and shrinking memory via mremap() retains
+ * guard markers where possible.
+ *
+ * - Moving a mapping alone should retain markers as they are.
+ */
+TEST_F(guard_pages, mremap_move)
+{
+ const unsigned long page_size = self->page_size;
+ char *ptr, *ptr_new;
+
+ /* Map 5 pages. */
+ ptr = mmap(NULL, 5 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANON | MAP_PRIVATE, -1, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /* Place guard markers at both ends of the 5 page span. */
+ ASSERT_EQ(madvise(ptr, page_size, MADV_GUARD_INSTALL), 0);
+ ASSERT_EQ(madvise(&ptr[4 * page_size], page_size, MADV_GUARD_INSTALL), 0);
+
+ /* Make sure the guard pages are in effect. */
+ ASSERT_FALSE(try_read_write_buf(ptr));
+ ASSERT_FALSE(try_read_write_buf(&ptr[4 * page_size]));
+
+	/*
+	 * Map a new region we will move this range into. Doing this ensures
+	 * that we have reserved a range to map into.
+	 */
+ ptr_new = mmap(NULL, 5 * page_size, PROT_NONE, MAP_ANON | MAP_PRIVATE,
+ -1, 0);
+ ASSERT_NE(ptr_new, MAP_FAILED);
+
+ ASSERT_EQ(mremap(ptr, 5 * page_size, 5 * page_size,
+ MREMAP_MAYMOVE | MREMAP_FIXED, ptr_new), ptr_new);
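+
+	/*
+	 * mremap() moves the underlying page table entries along with the
+	 * mapping, so the guard markers are expected to travel with it.
+	 */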
+
+ /* Make sure the guard markers are retained. */
+ ASSERT_FALSE(try_read_write_buf(ptr_new));
+ ASSERT_FALSE(try_read_write_buf(&ptr_new[4 * page_size]));
+
+ /*
+ * Clean up - we only need reference the new pointer as we overwrote the
+ * PROT_NONE range and moved the existing one.
+ */
+ munmap(ptr_new, 5 * page_size);
+}
+
+/*
+ * Assert that moving, extending and shrinking memory via mremap() retains
+ * guard markers where possible.
+ *
+ * Expanding should retain guard pages, only now in a different position. The
+ * user will have to remove guard pages manually to fix things up (they'd have
+ * to do the same if it were a PROT_NONE mapping).
+ */
+TEST_F(guard_pages, mremap_expand)
+{
+ const unsigned long page_size = self->page_size;
+ char *ptr, *ptr_new;
+
+ /* Map 10 pages... */
+ ptr = mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANON | MAP_PRIVATE, -1, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+ /* ...But unmap the last 5 so we can ensure we can expand into them. */
+ ASSERT_EQ(munmap(&ptr[5 * page_size], 5 * page_size), 0);
+
+ /* Place guard markers at both ends of the 5 page span. */
+ ASSERT_EQ(madvise(ptr, page_size, MADV_GUARD_INSTALL), 0);
+ ASSERT_EQ(madvise(&ptr[4 * page_size], page_size, MADV_GUARD_INSTALL), 0);
+
+ /* Make sure the guarding is in effect. */
+ ASSERT_FALSE(try_read_write_buf(ptr));
+ ASSERT_FALSE(try_read_write_buf(&ptr[4 * page_size]));
+
+ /* Now expand to 10 pages. */
+ ptr = mremap(ptr, 5 * page_size, 10 * page_size, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /*
+ * Make sure the guard markers are retained in their original positions.
+ */
+ ASSERT_FALSE(try_read_write_buf(ptr));
+ ASSERT_FALSE(try_read_write_buf(&ptr[4 * page_size]));
+
+ /* Reserve a region which we can move to and expand into. */
+ ptr_new = mmap(NULL, 20 * page_size, PROT_NONE,
+ MAP_ANON | MAP_PRIVATE, -1, 0);
+ ASSERT_NE(ptr_new, MAP_FAILED);
+
+ /* Now move and expand into it. */
+ ptr = mremap(ptr, 10 * page_size, 20 * page_size,
+ MREMAP_MAYMOVE | MREMAP_FIXED, ptr_new);
+ ASSERT_EQ(ptr, ptr_new);
+
+	/*
+	 * Again, make sure the guard markers are retained in their original
+	 * positions.
+	 */
+ ASSERT_FALSE(try_read_write_buf(ptr));
+ ASSERT_FALSE(try_read_write_buf(&ptr[4 * page_size]));
+
+ /*
+ * A real user would have to remove guard markers, but would reasonably
+ * expect all characteristics of the mapping to be retained, including
+ * guard markers.
+ */
+
+ /* Cleanup. */
+ munmap(ptr, 20 * page_size);
+}
+
+/*
+ * Assert that moving, extending and shrinking memory via mremap() retains
+ * guard markers where possible.
+ *
+ * Shrinking will result in the markers in the shrunk-away region being
+ * removed. Again, if the user were using a PROT_NONE mapping they'd have to
+ * fix this up manually as well, so this is OK.
+ */
+TEST_F(guard_pages, mremap_shrink)
+{
+ const unsigned long page_size = self->page_size;
+ char *ptr;
+ int i;
+
+ /* Map 5 pages. */
+ ptr = mmap(NULL, 5 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANON | MAP_PRIVATE, -1, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /* Place guard markers at both ends of the 5 page span. */
+ ASSERT_EQ(madvise(ptr, page_size, MADV_GUARD_INSTALL), 0);
+ ASSERT_EQ(madvise(&ptr[4 * page_size], page_size, MADV_GUARD_INSTALL), 0);
+
+ /* Make sure the guarding is in effect. */
+ ASSERT_FALSE(try_read_write_buf(ptr));
+ ASSERT_FALSE(try_read_write_buf(&ptr[4 * page_size]));
+
+ /* Now shrink to 3 pages. */
+ ptr = mremap(ptr, 5 * page_size, 3 * page_size, MREMAP_MAYMOVE);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /* We expect the guard marker at the start to be retained... */
+ ASSERT_FALSE(try_read_write_buf(ptr));
+
+ /* ...But remaining pages will not have guard markers. */
+ for (i = 1; i < 3; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_TRUE(try_read_write_buf(curr));
+ }
+
+ /*
+	 * As with expansion, a real user would have to remove guard pages and
+	 * fix things up. But you'd have to do similar manual work with
+	 * PROT_NONE mappings too.
+ */
+
+ /*
+ * If we expand back to the original size, the end marker will, of
+ * course, no longer be present.
+ */
+ ptr = mremap(ptr, 3 * page_size, 5 * page_size, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /* Again, we expect the guard marker at the start to be retained... */
+ ASSERT_FALSE(try_read_write_buf(ptr));
+
+ /* ...But remaining pages will not have guard markers. */
+ for (i = 1; i < 5; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_TRUE(try_read_write_buf(curr));
+ }
+
+ /* Cleanup. */
+ munmap(ptr, 5 * page_size);
+}
+
+/*
+ * Assert that forking a process with VMAs that do not have VM_WIPEONFORK set
+ * retains guard pages.
+ */
+TEST_F(guard_pages, fork)
+{
+ const unsigned long page_size = self->page_size;
+ char *ptr;
+ pid_t pid;
+ int i;
+
+ /* Map 10 pages. */
+ ptr = mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANON | MAP_PRIVATE, -1, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+	/* Establish guard pages in the first 5 pages. */
+ ASSERT_EQ(madvise(ptr, 5 * page_size, MADV_GUARD_INSTALL), 0);
+
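+	/*
+	 * fork() copies the parent's page tables for this private anonymous
+	 * mapping, so the guard markers are expected to be propagated to the
+	 * child as-is.
+	 */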
+ pid = fork();
+ ASSERT_NE(pid, -1);
+ if (!pid) {
+ /* This is the child process now. */
+
+ /* Assert that the guarding is in effect. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+ bool result = try_read_write_buf(curr);
+
+ ASSERT_TRUE(i >= 5 ? result : !result);
+ }
+
+		/* Now unguard the range. */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_REMOVE), 0);
+
+ exit(0);
+ }
+
+ /* Parent process. */
+
+ /* Parent simply waits on child. */
+ waitpid(pid, NULL, 0);
+
+ /* Child unguard does not impact parent page table state. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+ bool result = try_read_write_buf(curr);
+
+ ASSERT_TRUE(i >= 5 ? result : !result);
+ }
+
+ /* Cleanup. */
+ ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+/*
+ * Assert that forking a process with VMAs that do have VM_WIPEONFORK set
+ * behaves as expected.
+ */
+TEST_F(guard_pages, fork_wipeonfork)
+{
+ const unsigned long page_size = self->page_size;
+ char *ptr;
+ pid_t pid;
+ int i;
+
+ /* Map 10 pages. */
+ ptr = mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANON | MAP_PRIVATE, -1, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /* Mark wipe on fork. */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_WIPEONFORK), 0);
+
+ /* Guard the first 5 pages. */
+ ASSERT_EQ(madvise(ptr, 5 * page_size, MADV_GUARD_INSTALL), 0);
+
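+	/*
+	 * VM_WIPEONFORK gives the child a fresh, zero-fill mapping, so the
+	 * guard markers should be wiped along with everything else.
+	 */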
+ pid = fork();
+ ASSERT_NE(pid, -1);
+ if (!pid) {
+ /* This is the child process now. */
+
+		/* Guard markers will have been wiped. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_TRUE(try_read_write_buf(curr));
+ }
+
+ exit(0);
+ }
+
+ /* Parent process. */
+
+ waitpid(pid, NULL, 0);
+
+	/* Guard markers should be in effect. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+ bool result = try_read_write_buf(curr);
+
+ ASSERT_TRUE(i >= 5 ? result : !result);
+ }
+
+ /* Cleanup. */
+ ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+/* Ensure that MADV_FREE retains guard entries as expected. */
+TEST_F(guard_pages, lazyfree)
+{
+ const unsigned long page_size = self->page_size;
+ char *ptr;
+ int i;
+
+ /* Map 10 pages. */
+ ptr = mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANON | MAP_PRIVATE, -1, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /* Guard range. */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_INSTALL), 0);
+
+ /* Ensure guarded. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_FALSE(try_read_write_buf(curr));
+ }
+
+ /* Lazyfree range. */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_FREE), 0);
+
+ /* This should leave the guard markers in place. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_FALSE(try_read_write_buf(curr));
+ }
+
+ /* Cleanup. */
+ ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+/* Ensure that MADV_POPULATE_READ, MADV_POPULATE_WRITE behave as expected. */
+TEST_F(guard_pages, populate)
+{
+ const unsigned long page_size = self->page_size;
+ char *ptr;
+
+ /* Map 10 pages. */
+ ptr = mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANON | MAP_PRIVATE, -1, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /* Guard range. */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_INSTALL), 0);
+
+ /* Populate read should error out... */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_POPULATE_READ), -1);
+ ASSERT_EQ(errno, EFAULT);
+
+ /* ...as should populate write. */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_POPULATE_WRITE), -1);
+ ASSERT_EQ(errno, EFAULT);
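+
+	/*
+	 * MADV_POPULATE_READ/WRITE fault in each page of the range; a fault on
+	 * a guard marker fails, so the operation reports EFAULT.
+	 */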
+
+ /* Cleanup. */
+ ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+/* Ensure that MADV_COLD, MADV_PAGEOUT do not remove guard markers. */
+TEST_F(guard_pages, cold_pageout)
+{
+ const unsigned long page_size = self->page_size;
+ char *ptr;
+ int i;
+
+ /* Map 10 pages. */
+ ptr = mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANON | MAP_PRIVATE, -1, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /* Guard range. */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_INSTALL), 0);
+
+	/* Ensure guarded. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_FALSE(try_read_write_buf(curr));
+ }
+
+ /* Now mark cold. This should have no impact on guard markers. */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_COLD), 0);
+
+ /* Should remain guarded. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_FALSE(try_read_write_buf(curr));
+ }
+
+	/* OK, now page out. This should equally have no effect on markers. */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_PAGEOUT), 0);
+
+ /* Should remain guarded. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_FALSE(try_read_write_buf(curr));
+ }
+
+ /* Cleanup. */
+ ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+/* Ensure that guard pages do not break userfaultfd. */
+TEST_F(guard_pages, uffd)
+{
+ const unsigned long page_size = self->page_size;
+ int uffd;
+ char *ptr;
+ int i;
+ struct uffdio_api api = {
+ .api = UFFD_API,
+ .features = 0,
+ };
+ struct uffdio_register reg;
+ struct uffdio_range range;
+
+ /* Set up uffd. */
+ uffd = userfaultfd(0);
+ if (uffd == -1 && errno == EPERM)
+ ksft_exit_skip("No userfaultfd permissions, try running as root.\n");
+ ASSERT_NE(uffd, -1);
+
+ ASSERT_EQ(ioctl(uffd, UFFDIO_API, &api), 0);
+
+ /* Map 10 pages. */
+ ptr = mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANON | MAP_PRIVATE, -1, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /* Register the range with uffd. */
+ range.start = (unsigned long)ptr;
+ range.len = 10 * page_size;
+ reg.range = range;
+ reg.mode = UFFDIO_REGISTER_MODE_MISSING;
+ ASSERT_EQ(ioctl(uffd, UFFDIO_REGISTER, &reg), 0);
+
+ /* Guard the range. This should not trigger the uffd. */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_INSTALL), 0);
+
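+	/*
+	 * A guard marker populates the page table entry, so the range never
+	 * appears "missing" to the userfaultfd and no fault events should be
+	 * raised.
+	 */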
+ /* The guarding should behave as usual with no uffd intervention. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_FALSE(try_read_write_buf(curr));
+ }
+
+ /* Cleanup. */
+ ASSERT_EQ(ioctl(uffd, UFFDIO_UNREGISTER, &range), 0);
+ close(uffd);
+ ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/mm/hugetlb_fault_after_madv.c b/tools/testing/selftests/mm/hugetlb_fault_after_madv.c
index 73b81c632366..e2640529dbb2 100644
--- a/tools/testing/selftests/mm/hugetlb_fault_after_madv.c
+++ b/tools/testing/selftests/mm/hugetlb_fault_after_madv.c
@@ -5,20 +5,36 @@
#include <sys/mman.h>
#include <sys/types.h>
#include <unistd.h>
+#include <setjmp.h>
+#include <signal.h>
#include "vm_util.h"
#include "../kselftest.h"
-#define MMAP_SIZE (1 << 21)
#define INLOOP_ITER 100
-char *huge_ptr;
+static char *huge_ptr;
+static size_t huge_page_size;
+
+static sigjmp_buf sigbuf;
+static bool sigbus_triggered;
+
+static void signal_handler(int signal)
+{
+ if (signal == SIGBUS) {
+ sigbus_triggered = true;
+ siglongjmp(sigbuf, 1);
+ }
+}
/* Touch the memory while it is being madvised() */
void *touch(void *unused)
{
char *ptr = (char *)huge_ptr;
+ if (sigsetjmp(sigbuf, 1))
+ return NULL;
+
for (int i = 0; i < INLOOP_ITER; i++)
ptr[0] = '.';
@@ -30,7 +46,7 @@ void *madv(void *unused)
usleep(rand() % 10);
for (int i = 0; i < INLOOP_ITER; i++)
- madvise(huge_ptr, MMAP_SIZE, MADV_DONTNEED);
+ madvise(huge_ptr, huge_page_size, MADV_DONTNEED);
return NULL;
}
@@ -44,9 +60,23 @@ int main(void)
* interactions
*/
int max = 10000;
+ int err;
+
+ ksft_print_header();
+ ksft_set_plan(1);
srand(getpid());
+ if (signal(SIGBUS, signal_handler) == SIG_ERR)
+ ksft_exit_skip("Could not register signal handler.");
+
+ huge_page_size = default_huge_page_size();
+ if (!huge_page_size)
+ ksft_exit_skip("Could not detect default hugetlb page size.");
+
+ ksft_print_msg("[INFO] detected default hugetlb page size: %zu KiB\n",
+ huge_page_size / 1024);
+
free_hugepages = get_free_hugepages();
if (free_hugepages != 1) {
ksft_exit_skip("This test needs one and only one page to execute. Got %lu\n",
@@ -54,7 +84,7 @@ int main(void)
}
while (max--) {
- huge_ptr = mmap(NULL, MMAP_SIZE, PROT_READ | PROT_WRITE,
+ huge_ptr = mmap(NULL, huge_page_size, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
-1, 0);
@@ -66,8 +96,14 @@ int main(void)
pthread_join(thread1, NULL);
pthread_join(thread2, NULL);
- munmap(huge_ptr, MMAP_SIZE);
+ munmap(huge_ptr, huge_page_size);
}
- return KSFT_PASS;
+ ksft_test_result(!sigbus_triggered, "SIGBUS behavior\n");
+
+ err = ksft_get_fail_cnt();
+ if (err)
+ ksft_exit_fail_msg("%d out of %d tests failed\n",
+ err, ksft_test_num());
+ ksft_exit_pass();
}
diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh
index 2c5394584af4..2fc290d9430c 100755
--- a/tools/testing/selftests/mm/run_vmtests.sh
+++ b/tools/testing/selftests/mm/run_vmtests.sh
@@ -349,10 +349,12 @@ if [ $VADDR64 -ne 0 ]; then
# allows high virtual address allocation requests independent
# of platform's physical memory.
- prev_policy=$(cat /proc/sys/vm/overcommit_memory)
- echo 1 > /proc/sys/vm/overcommit_memory
- CATEGORY="hugevm" run_test ./virtual_address_range
- echo $prev_policy > /proc/sys/vm/overcommit_memory
+ if [ -x ./virtual_address_range ]; then
+ prev_policy=$(cat /proc/sys/vm/overcommit_memory)
+ echo 1 > /proc/sys/vm/overcommit_memory
+ CATEGORY="hugevm" run_test ./virtual_address_range
+ echo $prev_policy > /proc/sys/vm/overcommit_memory
+ fi
# va high address boundary switch test
ARCH_ARM64="arm64"
diff --git a/tools/testing/selftests/mm/virtual_address_range.c b/tools/testing/selftests/mm/virtual_address_range.c
index 4e4c1e311247..2a2b69e91950 100644
--- a/tools/testing/selftests/mm/virtual_address_range.c
+++ b/tools/testing/selftests/mm/virtual_address_range.c
@@ -64,7 +64,7 @@
#define NR_CHUNKS_HIGH NR_CHUNKS_384TB
#endif
-static char *hind_addr(void)
+static char *hint_addr(void)
{
int bits = HIGH_ADDR_SHIFT + rand() % (63 - HIGH_ADDR_SHIFT);
@@ -185,7 +185,7 @@ int main(int argc, char *argv[])
}
for (i = 0; i < NR_CHUNKS_HIGH; i++) {
- hint = hind_addr();
+ hint = hint_addr();
hptr[i] = mmap(hint, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
diff --git a/tools/testing/shared/shared.mk b/tools/testing/shared/shared.mk
index a6bc51d0b0bf..923ee2492256 100644
--- a/tools/testing/shared/shared.mk
+++ b/tools/testing/shared/shared.mk
@@ -69,6 +69,7 @@ generated/bit-length.h: FORCE
@if ! grep -qws CONFIG_$(LONG_BIT)BIT generated/bit-length.h; then \
echo "Generating $@"; \
echo "#define CONFIG_$(LONG_BIT)BIT 1" > $@; \
+ echo "#define CONFIG_PHYS_ADDR_T_$(LONG_BIT)BIT 1" >> $@; \
fi
FORCE: ;
diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c
index b33b47342d41..8fab5e13c7c3 100644
--- a/tools/testing/vma/vma.c
+++ b/tools/testing/vma/vma.c
@@ -4,6 +4,8 @@
#include <stdio.h>
#include <stdlib.h>
+#include "generated/bit-length.h"
+
#include "maple-shared.h"
#include "vma_internal.h"
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index c5b9da034511..e76ff579e1fd 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -44,7 +44,9 @@
#define VM_LOCKED 0x00002000
#define VM_IO 0x00004000
#define VM_DONTEXPAND 0x00040000
+#define VM_LOCKONFAULT 0x00080000
#define VM_ACCOUNT 0x00100000
+#define VM_NORESERVE 0x00200000
#define VM_MIXEDMAP 0x10000000
#define VM_STACK VM_GROWSDOWN
#define VM_SHADOW_STACK VM_NONE
@@ -53,6 +55,14 @@
#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC)
#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP)
+/* This mask represents all the VMA flag bits used by mlock */
+#define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT)
+
+#ifdef CONFIG_64BIT
+/* VM is sealed, in vm_flags */
+#define VM_SEALED _BITUL(63)
+#endif
+
#define FIRST_USER_ADDRESS 0UL
#define USER_PGTABLES_CEILING 0UL
@@ -698,8 +708,9 @@ static inline void tlb_finish_mmu(struct mmu_gather *)
{
}
-static inline void get_file(struct file *)
+static inline struct file *get_file(struct file *f)
{
+ return f;
}
static inline int vma_dup_policy(struct vm_area_struct *, struct vm_area_struct *)
@@ -920,4 +931,106 @@ static inline bool signal_pending(void *)
return false;
}
+static inline bool is_file_hugepages(struct file *)
+{
+ return false;
+}
+
+static inline int security_vm_enough_memory_mm(struct mm_struct *, long)
+{
+ return true;
+}
+
+static inline bool may_expand_vm(struct mm_struct *, vm_flags_t, unsigned long)
+{
+ return true;
+}
+
+static inline void vm_flags_init(struct vm_area_struct *vma,
+ vm_flags_t flags)
+{
+ vma->__vm_flags = flags;
+}
+
+static inline void vm_flags_set(struct vm_area_struct *vma,
+ vm_flags_t flags)
+{
+ vma_start_write(vma);
+ vma->__vm_flags |= flags;
+}
+
+static inline void vm_flags_clear(struct vm_area_struct *vma,
+ vm_flags_t flags)
+{
+ vma_start_write(vma);
+ vma->__vm_flags &= ~flags;
+}
+
+static inline int call_mmap(struct file *, struct vm_area_struct *)
+{
+ return 0;
+}
+
+static inline int shmem_zero_setup(struct vm_area_struct *)
+{
+ return 0;
+}
+
+static inline void vma_set_anonymous(struct vm_area_struct *vma)
+{
+ vma->vm_ops = NULL;
+}
+
+static inline void ksm_add_vma(struct vm_area_struct *)
+{
+}
+
+static inline void perf_event_mmap(struct vm_area_struct *)
+{
+}
+
+static inline bool vma_is_dax(struct vm_area_struct *)
+{
+ return false;
+}
+
+static inline struct vm_area_struct *get_gate_vma(struct mm_struct *)
+{
+ return NULL;
+}
+
+bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
+
+/* Update vma->vm_page_prot to reflect vma->vm_flags. */
+static inline void vma_set_page_prot(struct vm_area_struct *vma)
+{
+ unsigned long vm_flags = vma->vm_flags;
+ pgprot_t vm_page_prot;
+
+ /* testing: we inline vm_pgprot_modify() to avoid clash with vma.h. */
+ vm_page_prot = pgprot_modify(vma->vm_page_prot, vm_get_page_prot(vm_flags));
+
+ if (vma_wants_writenotify(vma, vm_page_prot)) {
+ vm_flags &= ~VM_SHARED;
+ /* testing: we inline vm_pgprot_modify() to avoid clash with vma.h. */
+ vm_page_prot = pgprot_modify(vm_page_prot, vm_get_page_prot(vm_flags));
+ }
+ /* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */
+ WRITE_ONCE(vma->vm_page_prot, vm_page_prot);
+}
+
+static inline bool arch_validate_flags(unsigned long)
+{
+ return true;
+}
+
+static inline void vma_close(struct vm_area_struct *)
+{
+}
+
+static inline int mmap_file(struct file *, struct vm_area_struct *)
+{
+ return 0;
+}
+
#endif /* __MM_VMA_INTERNAL_H */